Skip to content

Commit

Permalink
CodeGen: Allow small copyable blocks to "break" the CFG.
Browse files Browse the repository at this point in the history
When choosing the best successor for a block, ordinarily we would have preferred
a block that preserves the CFG unless there is a strong probability in favor of
the other direction. For small blocks that can be duplicated we now skip that
requirement as well.

Differential revision: https://reviews.llvm.org/D27742

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@291609 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
Kyle Butt committed Jan 10, 2017
1 parent d8c5040 commit ada6595
Show file tree
Hide file tree
Showing 58 changed files with 472 additions and 212 deletions.
59 changes: 52 additions & 7 deletions lib/CodeGen/MachineBlockPlacement.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,10 @@ class MachineBlockPlacement : public MachineFunctionPass {
void buildCFGChains();
void optimizeBranches();
void alignBlocks();
bool shouldTailDuplicate(MachineBasicBlock *BB);
bool canTailDuplicateUnplacedPreds(
MachineBasicBlock *BB, MachineBasicBlock *Succ,
BlockChain &Chain, const BlockFilterSet *BlockFilter);

public:
static char ID; // Pass identification, replacement for typeid
Expand Down Expand Up @@ -561,6 +565,45 @@ getAdjustedProbability(BranchProbability OrigProb,
return SuccProb;
}

/// Decide whether \p BB is a candidate for tail duplication.
/// \p BB Block to check.
bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
  // A block with a single successor creates no additional fallthrough
  // opportunity, so there is nothing to gain from duplicating it.
  // TODO: When conditional exits are analyzable, allow them to be duplicated.
  if (BB->succ_size() == 1)
    return false;

  // Defer the remaining profitability/legality checks to the tail duplicator.
  bool IsSimple = TailDup.isSimpleBB(BB);
  return TailDup.shouldTailDuplicate(IsSimple, *BB);
}

/// When the option TailDupPlacement is on, this method checks if the
/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
/// into all of its unplaced, unfiltered predecessors, that are not BB. In
/// addition we keep a set of blocks that have been tail-duplicated into and
/// allow those blocks to be unplaced as well. This allows the creation of a
/// second (larger) spine and a short fallthrough spine.
/// We also identify blocks with the CFG that would have been produced by
/// tail-duplication and lay them out in the same manner.
bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
    MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain,
    const BlockFilterSet *BlockFilter) {
  if (!shouldTailDuplicate(Succ))
    return false;

  // Every predecessor that is unplaced and unfiltered (and is not BB itself)
  // must admit tail duplication of Succ; a single failure vetoes the whole
  // transformation.
  for (MachineBasicBlock *Pred : Succ->predecessors()) {
    bool Exempt = (Pred == BB) ||
                  (BlockFilter && !BlockFilter->count(Pred)) ||
                  (BlockToChain[Pred] == &Chain);
    if (Exempt)
      continue;
    if (!TailDup.canTailDuplicate(Succ, Pred))
      return false;
  }
  return true;
}

/// When the option OutlineOptionalBranches is on, this method
/// checks if the fallthrough candidate block \p Succ (of block
/// \p BB) also has other unscheduled predecessor blocks which
Expand Down Expand Up @@ -634,6 +677,12 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
if (SuccChain.UnscheduledPredecessors == 0)
return false;

// As a heuristic, if we can duplicate the block into all its unscheduled
// predecessors, we return false.
if (TailDupPlacement
&& canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter))
return false;

// There are two basic scenarios here:
// -------------------------------------
// Case 1: triangular shape CFG (if-then):
Expand Down Expand Up @@ -1908,13 +1957,8 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(
DuplicatedToLPred = false;
DEBUG(dbgs() << "Redoing tail duplication for Succ#"
<< BB->getNumber() << "\n");
bool IsSimple = TailDup.isSimpleBB(BB);
// Blocks with single successors don't create additional fallthrough
// opportunities. Don't duplicate them. TODO: When conditional exits are
// analyzable, allow them to be duplicated.
if (!IsSimple && BB->succ_size() == 1)
return false;
if (!TailDup.shouldTailDuplicate(IsSimple, *BB))

if (!shouldTailDuplicate(BB))
return false;
// This has to be a callback because none of it can be done after
// BB is deleted.
Expand Down Expand Up @@ -1967,6 +2011,7 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(
llvm::function_ref<void(MachineBasicBlock*)>(RemovalCallback);

SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
bool IsSimple = TailDup.isSimpleBB(BB);
TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,
&DuplicatedPreds, &RemovalCallbackRef);

Expand Down
7 changes: 6 additions & 1 deletion test/CodeGen/AArch64/addsub.ll
Original file line number Diff line number Diff line change
Expand Up @@ -140,12 +140,17 @@ test4:

test5:
; CHECK: cmn {{w[0-9]+}}, #444
; CHECK: b.gt [[RET]]
; CHECK: b.le [[TEST6:.?LBB[0-9]+_[0-9]+]]
%newval5 = add i32 %val, 4
store i32 %newval5, i32* @var_i32
%cmp_neg_uge = icmp sgt i32 %val2, -444
br i1 %cmp_neg_uge, label %ret, label %test6

; CHECK: {{^}}[[RET]]:
; CHECK: ret
; CHECK: {{^}}[[TEST6]]:
; CHECK: ret

test6:
%newval6 = add i32 %val, 5
store i32 %newval6, i32* @var_i32
Expand Down
22 changes: 12 additions & 10 deletions test/CodeGen/AArch64/arm64-atomic.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ define i32 @val_compare_and_swap(i32* %p, i32 %cmp, i32 %new) #0 {
; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]]
; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]]
; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]]
; CHECK-NEXT: ret
; CHECK-NEXT: [[FAILBB]]:
; CHECK-NEXT: clrex
; CHECK-NEXT: [[EXITBB]]:
; CHECK-NEXT: ret
%pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire
%val = extractvalue { i32, i1 } %pair, 0
ret i32 %val
Expand All @@ -27,10 +27,12 @@ define i32 @val_compare_and_swap_from_load(i32* %p, i32 %cmp, i32* %pnew) #0 {
; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]]
; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], [[NEW]], [x0]
; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]]
; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]]
; CHECK-NEXT: mov x0, x[[ADDR]]
; CHECK-NEXT: ret
; CHECK-NEXT: [[FAILBB]]:
; CHECK-NEXT: clrex
; CHECK-NEXT: [[EXITBB]]:
; CHECK-NEXT: mov x0, x[[ADDR]]
; CHECK-NEXT: ret
%new = load i32, i32* %pnew
%pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire
%val = extractvalue { i32, i1 } %pair, 0
Expand All @@ -41,15 +43,15 @@ define i32 @val_compare_and_swap_rel(i32* %p, i32 %cmp, i32 %new) #0 {
; CHECK-LABEL: val_compare_and_swap_rel:
; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0
; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]:
; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]]
; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]]]
; CHECK-NEXT: cmp [[RESULT]], w1
; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]]
; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]
; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]]
; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]]
; CHECK-NEXT: ret
; CHECK-NEXT: [[FAILBB]]:
; CHECK-NEXT: clrex
; CHECK-NEXT: [[EXITBB]]:
; CHECK-NEXT: ret
%pair = cmpxchg i32* %p, i32 %cmp, i32 %new acq_rel monotonic
%val = extractvalue { i32, i1 } %pair, 0
ret i32 %val
Expand All @@ -64,10 +66,10 @@ define i64 @val_compare_and_swap_64(i64* %p, i64 %cmp, i64 %new) #0 {
; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]]
; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], x2, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]]
; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]]
; CHECK-NEXT: ret
; CHECK-NEXT: [[FAILBB]]:
; CHECK-NEXT: clrex
; CHECK-NEXT: [[EXITBB]]:
; CHECK-NEXT: ret
%pair = cmpxchg i64* %p, i64 %cmp, i64 %new monotonic monotonic
%val = extractvalue { i64, i1 } %pair, 0
ret i64 %val
Expand Down
8 changes: 4 additions & 4 deletions test/CodeGen/AArch64/arm64-ccmp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,10 @@ if.end: ; preds = %if.then, %lor.lhs.f
; CHECK: cmp w0, #1
; CHECK: sdiv [[DIVRES:w[0-9]+]], w1, w0
; CHECK: ccmp [[DIVRES]], #16, #0, ge
; CHECK: b.gt [[BLOCK:LBB[0-9_]+]]
; CHECK: bl _foo
; CHECK: [[BLOCK]]:
; CHECK: b.le [[BLOCK:LBB[0-9_]+]]
; CHECK: orr w0, wzr, #0x7
; CHECK: [[BLOCK]]:
; CHECK: bl _foo
define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp {
entry:
%cmp = icmp sgt i32 %a, 0
Expand All @@ -135,7 +135,7 @@ if.end:
; CHECK: cmp
; CHECK-NOT: b.
; CHECK: fccmp {{.*}}, #8, ge
; CHECK: b.lt
; CHECK: b.ge
define i32 @single_fcmp(i32 %a, float %b) nounwind ssp {
entry:
%cmp = icmp sgt i32 %a, 0
Expand Down
14 changes: 5 additions & 9 deletions test/CodeGen/AArch64/arm64-shrink-wrapping.ll
Original file line number Diff line number Diff line change
Expand Up @@ -346,19 +346,15 @@ entry:
; CHECK-NEXT: sub w1, w1, #1
; CHECK-NEXT: add [[SUM]], [[SUM]], [[VA_VAL]]
; CHECK-NEXT: cbnz w1, [[LOOP_LABEL]]
; DISABLE-NEXT: b [[IFEND_LABEL]]
;
; DISABLE: [[ELSE_LABEL]]: ; %if.else
; DISABLE: lsl w0, w1, #1
;
; CHECK: [[IFEND_LABEL]]:
; CHECK-NEXT: [[IFEND_LABEL]]:
; Epilogue code.
; CHECK: add sp, sp, #16
; CHECK-NEXT: ret
;
; ENABLE: [[ELSE_LABEL]]: ; %if.else
; ENABLE-NEXT: lsl w0, w1, #1
; ENABLE_NEXT: ret
; CHECK: [[ELSE_LABEL]]: ; %if.else
; CHECK-NEXT: lsl w0, w1, #1
; DISABLE-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
define i32 @variadicFunc(i32 %cond, i32 %count, ...) #0 {
entry:
%ap = alloca i8*, align 8
Expand Down
2 changes: 1 addition & 1 deletion test/CodeGen/AArch64/compare-branch.ll
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ test4:
%val4 = load volatile i64, i64* @var64
%tst4 = icmp ne i64 %val4, 0
br i1 %tst4, label %end, label %test5, !prof !1
; CHECK: cbnz {{x[0-9]+}}, .LBB
; CHECK: cbz {{x[0-9]+}}, .LBB

test5:
store volatile i64 %val4, i64* @var64
Expand Down
2 changes: 1 addition & 1 deletion test/CodeGen/AArch64/logical_shifted_reg.ll
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ test2:

test3:
; CHECK: tst {{x[0-9]+}}, {{x[0-9]+}}, asr #12
; CHECK: b.gt .L
; CHECK: b.le .L
%asr_op = ashr i64 %val2, 12
%asr_and = and i64 %asr_op, %val1
%tst3 = icmp sgt i64 %asr_and, 0
Expand Down
18 changes: 9 additions & 9 deletions test/CodeGen/AArch64/tbz-tbnz.ll
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ entry:
br i1 %cmp, label %if.then, label %if.end

; CHECK: sub [[CMP:w[0-9]+]], w0, #12
; CHECK: tbz [[CMP]], #31
; CHECK: tbnz [[CMP]], #31

if.then:
call void @t()
Expand All @@ -28,7 +28,7 @@ entry:
br i1 %cmp, label %if.then, label %if.end

; CHECK: sub [[CMP:x[0-9]+]], x0, #12
; CHECK: tbz [[CMP]], #63
; CHECK: tbnz [[CMP]], #63

if.then:
call void @t()
Expand Down Expand Up @@ -118,7 +118,7 @@ entry:
br i1 %cmp, label %if.then, label %if.end

; CHECK: sub [[CMP:w[0-9]+]], w0, #12
; CHECK: tbz [[CMP]], #31
; CHECK: tbnz [[CMP]], #31

if.then:
call void @t()
Expand Down Expand Up @@ -178,7 +178,7 @@ define void @test9(i64 %val1) {
br i1 %tst, label %if.then, label %if.end

; CHECK-NOT: cmp
; CHECK: tbz x0, #63
; CHECK: tbnz x0, #63

if.then:
call void @t()
Expand All @@ -194,7 +194,7 @@ define void @test10(i64 %val1) {
br i1 %tst, label %if.then, label %if.end

; CHECK-NOT: cmp
; CHECK: tbz x0, #63
; CHECK: tbnz x0, #63

if.then:
call void @t()
Expand All @@ -209,7 +209,7 @@ define void @test11(i64 %val1, i64* %ptr) {

; CHECK: ldr [[CMP:x[0-9]+]], [x1]
; CHECK-NOT: cmp
; CHECK: tbz [[CMP]], #63
; CHECK: tbnz [[CMP]], #63

%val = load i64, i64* %ptr
%tst = icmp slt i64 %val, 0
Expand All @@ -229,7 +229,7 @@ define void @test12(i64 %val1) {
br i1 %tst, label %if.then, label %if.end

; CHECK-NOT: cmp
; CHECK: tbz x0, #63
; CHECK: tbnz x0, #63

if.then:
call void @t()
Expand All @@ -247,7 +247,7 @@ define void @test13(i64 %val1, i64 %val2) {

; CHECK: orr [[CMP:x[0-9]+]], x0, x1
; CHECK-NOT: cmp
; CHECK: tbz [[CMP]], #63
; CHECK: tbnz [[CMP]], #63

if.then:
call void @t()
Expand All @@ -262,7 +262,7 @@ define void @test14(i1 %cond) {
br i1 %cond, label %if.end, label %if.then

; CHECK-NOT: and
; CHECK: tbnz w0, #0
; CHECK: tbz w0, #0

if.then:
call void @t()
Expand Down
16 changes: 16 additions & 0 deletions test/CodeGen/AMDGPU/branch-relaxation.ll
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,12 @@ loop:
; GCN-NEXT: ;;#ASMEND

; GCN-NEXT: [[BB3]]: ; %bb3
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_endpgm
define void @expand_requires_expand(i32 %cond0) #0 {
bb0:
Expand All @@ -356,6 +362,12 @@ bb2:
br label %bb3

bb3:
; These NOPs prevent tail-duplication-based outlining
; from firing, which defeats the need to expand the branches and this test.
call void asm sideeffect
"v_nop_e64", ""() #0
call void asm sideeffect
"v_nop_e64", ""() #0
ret void
}

Expand Down Expand Up @@ -385,6 +397,7 @@ bb3:

; GCN-NEXT: [[ENDIF]]: ; %endif
; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
; GCN-NEXT: s_sleep 5
; GCN-NEXT: s_endpgm
define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
entry:
Expand All @@ -402,6 +415,9 @@ if_uniform:
br label %endif

endif:
; layout can remove the split branch if it can copy the return block.
; This call makes the return block long enough that it doesn't get copied.
call void @llvm.amdgcn.s.sleep(i32 5);
ret void
}

Expand Down
5 changes: 4 additions & 1 deletion test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@ bb5: ; preds = %bb3, %bb1
; OPT-NOT: call i1 @llvm.amdgcn.loop

; GCN-LABEL: {{^}}annotate_ret_noloop:
; GCN: s_cbranch_scc1
; GCN: s_cbranch_scc0 [[BODY:BB[0-9]+_[0-9]+]]
; GCN: s_endpgm

; GCN: {{^}}[[BODY]]:
; GCN: s_endpgm
; GCN: .Lfunc_end1
define void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
Expand Down
Loading

0 comments on commit ada6595

Please sign in to comment.