[BOLT] CDSplit main logic part 2/2 (llvm#74032)

This diff implements the main splitting logic of CDSplit. CDSplit processes functions in a binary in parallel. For each function BF, it assumes that all other functions are hot-cold split. For each possible hot-warm split point of BF, it computes the corresponding SplitScore and chooses the split point with the best SplitScore. The SplitScore of each split point is computed as follows: each call edge or jump edge has an edge score that is proportional to its execution count and inversely proportional to a power of its distance. The SplitScore of a split point is the sum of edge scores over a fixed set of edges whose distance can change due to hot-warm splitting BF. This set contains all cover calls of the form X->Y or Y->X given function order [... X ... BF ... Y ...]; we refer to the sum of edge scores over the set of cover calls as CoverCallScore. This set also contains all jump edges (branches) within BF as well as all call edges originating from BF; we refer to the sum of edge scores over this set of edges as LocalScore. CDSplit finds the split index maximizing CoverCallScore + LocalScore.
abidh · Dec 1, 2023 · 4483cf2 · 4483cf2
1 parent a37c69e
commit 4483cf2
Show file tree

Hide file tree

Showing 3 changed files with 484 additions and 3 deletions.
diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp
@@ -114,6 +114,16 @@ static cl::opt<double> CallScale(
     "call-scale",
     cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"),
     cl::init(0.95), cl::ReallyHidden, cl::cat(BoltOptCategory));
+
+static cl::opt<double> // Exponent on (CallLength + 1) in computeCallScore.
+    CallPower("call-power",
+              cl::desc("Call score power (when --split-strategy=cdsplit)"),
+              cl::init(0.05), cl::ReallyHidden, cl::cat(BoltOptCategory));
+
+static cl::opt<double> // Exponent on (JumpLength + 1) in computeJumpScore.
+    JumpPower("jump-power",
+              cl::desc("Jump score power (when --split-strategy=cdsplit)"),
+              cl::init(0.15), cl::ReallyHidden, cl::cat(BoltOptCategory));
 } // namespace opts
 
 namespace {
@@ -195,6 +205,13 @@ struct SplitCacheDirected final : public SplitStrategy {
     size_t Count;
   };
 
+  struct SplitScore {
+    size_t SplitIndex = 0;       // Index of the last hot block; 0 until set.
+    size_t HotSizeReduction = 0; // Hot fragment size reduction from the split.
+    double LocalScore = 0;       // Local call scores plus jump scores.
+    double CoverCallScore = 0;   // Sum of call scores over cover calls.
+  };
+
   // Auxiliary variables used by the algorithm.
   size_t TotalNumBlocks{0};
   size_t OrigHotSectionSize{0};
@@ -340,8 +357,9 @@ struct SplitCacheDirected final : public SplitStrategy {
       // We only care about new addresses of blocks in hot/warm.
       if (BB->getFragmentNum() == FragmentNum::cold())
         break;
+      const size_t NewSize = BB->getOutputSize();
       BB->setOutputStartAddress(CurrentAddr);
-      CurrentAddr += BB->getOutputSize();
+      CurrentAddr += NewSize;
       BB->setOutputEndAddress(CurrentAddr);
       if (BB->getLayoutIndex() == SplitIndex) {
         NewHotEndAddr = CurrentAddr;
@@ -402,13 +420,192 @@ struct SplitCacheDirected final : public SplitStrategy {
     return CoverCalls;
   }
 
+  /// Edge score of a call: CallScale * CallCount / (CallLength + 1)^CallPower.
+  double computeCallScore(uint64_t CallCount, size_t CallLength) {
+    // Add 1 to the length so that a zero-length call does not divide by zero.
+    return opts::CallScale * static_cast<double>(CallCount) /
+           std::pow(static_cast<double>(CallLength + 1), opts::CallPower);
+  }
+
+  /// Edge score of a jump (branch): JumpCount / (JumpLength + 1)^JumpPower.
+  double computeJumpScore(uint64_t JumpCount, size_t JumpLength) {
+    // Add 1 to the length so that a zero-length jump does not divide by zero.
+    return static_cast<double>(JumpCount) /
+           std::pow(static_cast<double>(JumpLength + 1), opts::JumpPower);
+  }
+
+  /// Compute sum of scores over jumps within \p BlockOrder given \p SplitIndex.
+  /// Adds the sum to Score.LocalScore in place; \p SplitIndex is not read.
+  void computeJumpScore(const BasicBlockOrder &BlockOrder,
+                        const size_t SplitIndex, SplitScore &Score) {
+
+    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
+      if (SrcBB->getKnownExecutionCount() == 0)
+        continue; // Skip blocks whose known execution count is zero.
+
+      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;
+
+      for (const auto Pair : zip(SrcBB->successors(), SrcBB->branch_info())) {
+        const BinaryBasicBlock *DstBB = std::get<0>(Pair);
+        const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
+        const size_t JumpCount = Branch.Count;
+
+        if (JumpCount == 0)
+          continue; // Skip edges with a zero branch count.
+
+        const size_t DstBBStartAddr = DstBB->getOutputAddressRange().first;
+        const size_t NewJumpLength =
+            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
+        Score.LocalScore += computeJumpScore(JumpCount, NewJumpLength);
+      }
+    }
+  }
+
+  /// Compute sum of scores over calls originating in the current function
+  /// given \p SplitIndex. Increment Score.LocalScore in place by the sum.
+  void computeLocalCallScore(const BasicBlockOrder &BlockOrder,
+                             const size_t SplitIndex, SplitScore &Score) {
+    if (opts::CallScale == 0)
+      return; // With a zero scale every call score is zero.
+
+    // Global index of the last block in the current function.
+    // This is later used to determine whether a call originating in the current
+    // function is to a function that comes after the current function.
+    const size_t LastGlobalIndex = GlobalIndices[BlockOrder.back()];
+
+    // The length of calls originating in the input function can increase /
+    // decrease depending on the splitting decision.
+    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
+      const size_t CallCount = SrcBB->getKnownExecutionCount();
+      // Skip zero-count blocks; the block count serves as the call count.
+      if (CallCount == 0)
+        continue;
+
+      // Obtain an estimate on the end address of the src basic block
+      // after splitting at SplitIndex.
+      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;
+
+      for (const BinaryBasicBlock *DstBB : Callees[GlobalIndices[SrcBB]]) {
+        // Obtain an estimate on the start address of the dst basic block
+        // after splitting at SplitIndex. If DstBB is in a function before
+        // the current function, then its start address remains unchanged.
+        size_t DstBBStartAddr = BBOffsets[DstBB];
+        // If DstBB is in a function after the current function, then its
+        // start address should be adjusted based on the reduction in hot size.
+        if (GlobalIndices[DstBB] > LastGlobalIndex) {
+          assert(DstBBStartAddr >= Score.HotSizeReduction);
+          DstBBStartAddr -= Score.HotSizeReduction;
+        }
+        const size_t NewCallLength =
+            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
+        Score.LocalScore += computeCallScore(CallCount, NewCallLength);
+      }
+    }
+  }
+
+  /// Compute sum of splitting scores for cover calls of the input function.
+  /// Increment Score.CoverCallScore in place by the sum.
+  void computeCoverCallScore(const BasicBlockOrder &BlockOrder,
+                             const size_t SplitIndex,
+                             const std::vector<CallInfo> &CoverCalls,
+                             SplitScore &Score) {
+    if (opts::CallScale == 0)
+      return; // With a zero scale every call score is zero.
+
+    for (const CallInfo CI : CoverCalls) {
+      assert(CI.Length >= Score.HotSizeReduction &&
+             "Length of cover calls must exceed reduced size of hot fragment.");
+      // The call shrinks by Score.HotSizeReduction, i.e. by the size of the
+      // split-off fragment minus the total size increase.
+      const size_t NewCallLength = CI.Length - Score.HotSizeReduction;
+      Score.CoverCallScore += computeCallScore(CI.Count, NewCallLength);
+    }
+  }
+
+  /// Compute the split score of splitting a function at a given index.
+  /// The split score consists of a local score and a cover call score. The
+  /// cover call score is expensive to compute, so we pass in a
+  /// \p ReferenceScore and compute the cover call score only when the local
+  /// score exceeds that of \p ReferenceScore or the size reduction of the hot
+  /// fragment is larger than that of \p ReferenceScore. The returned SplitScore
+  /// holds the local score and the cover call score (left at 0 if skipped) of
+  /// the current split index. For easier bookkeeping and comparison, it also
+  /// stores the split index and the resulting reduction in hot fragment size.
+  /// If the split would grow the hot fragment, only the split index is set.
+  SplitScore computeSplitScore(const BinaryFunction &BF,
+                               const BasicBlockOrder &BlockOrder,
+                               const size_t SplitIndex,
+                               const std::vector<CallInfo> &CoverCalls,
+                               const SplitScore &ReferenceScore) {
+    // Populate BinaryBasicBlock::OutputAddressRange with estimated
+    // new start and end addresses after hot-warm splitting at SplitIndex.
+    size_t OldHotEnd;
+    size_t NewHotEnd;
+    std::tie(OldHotEnd, NewHotEnd) =
+        estimatePostSplitBBAddress(BlockOrder, SplitIndex);
+
+    SplitScore Score;
+    Score.SplitIndex = SplitIndex;
+
+    // It's not worth splitting if OldHotEnd < NewHotEnd.
+    if (OldHotEnd < NewHotEnd)
+      return Score;
+
+    // Hot fragment size reduction due to splitting.
+    Score.HotSizeReduction = OldHotEnd - NewHotEnd;
+
+    // First part of LocalScore is the sum over call edges originating in the
+    // input function. These edges can get shorter or longer depending on
+    // SplitIndex. Score.LocalScore is incremented in place.
+    computeLocalCallScore(BlockOrder, SplitIndex, Score);
+
+    // Second part of LocalScore is the sum over jump edges with src basic block
+    // and dst basic block in the current function. Score.LocalScore is
+    // incremented in place.
+    computeJumpScore(BlockOrder, SplitIndex, Score);
+
+    // There is no need to compute CoverCallScore if we have already found
+    // another split index with a bigger LocalScore and bigger HotSizeReduction.
+    if (Score.LocalScore <= ReferenceScore.LocalScore &&
+        Score.HotSizeReduction <= ReferenceScore.HotSizeReduction)
+      return Score;
+
+    // Compute CoverCallScore and store in Score in place.
+    computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
+    return Score;
+  }
+
   /// Find the best index for splitting. The returned value is the index of the
   /// last hot basic block. Hence, "no splitting" is equivalent to returning the
   /// value which is one less than the size of the function.
   size_t findSplitIndex(const BinaryFunction &BF,
                         const BasicBlockOrder &BlockOrder) {
-    // Placeholder: hot-warm split after entry block.
-    return 0;
+    // Find all function calls that can be shortened if we move blocks of the
+    // current function to warm/cold.
+    const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);
+
+    // Try all possible split indices (blocks with Index <= SplitIndex are in
+    // hot) and find the one maximizing the splitting score.
+    SplitScore BestScore;
+    double BestScoreSum = -1.0; // Sums are expected to be >= 0.
+    SplitScore ReferenceScore;  // Candidate with the highest LocalScore so far.
+    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
+      const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
+      // No need to keep cold blocks in the hot section.
+      if (LastHotBB->getFragmentNum() == FragmentNum::cold())
+        break;
+      const SplitScore Score =
+          computeSplitScore(BF, BlockOrder, Index, CoverCalls, ReferenceScore);
+      double ScoreSum = Score.LocalScore + Score.CoverCallScore;
+      if (ScoreSum > BestScoreSum) {
+        BestScoreSum = ScoreSum;
+        BestScore = Score;
+      }
+      if (Score.LocalScore > ReferenceScore.LocalScore)
+        ReferenceScore = Score;
+    }
+
+    return BestScore.SplitIndex;
   }
 };
 

diff --git a/bolt/test/X86/cdsplit-call-scale.s b/bolt/test/X86/cdsplit-call-scale.s
@@ -0,0 +1,137 @@
+# Test the control of aggressiveness of 3-way splitting by -call-scale.
+# When -call-scale=0.0, the tested function is 2-way split.
+# When -call-scale=1.0, the tested function is 3-way split with 5 blocks
+# in warm because of the increased benefit of shortening the call edges.
+# When -call-scale=1000.0, the tested function is 3-way split with 7 blocks
+# in warm because of the strong benefit of shortening the call edges.
+
+# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=cdsplit \
+# RUN:         --call-scale=0.0 --print-split --print-only=chain \
+# RUN:         --data=%t.fdata --reorder-blocks=ext-tsp \
+# RUN:     2>&1 | FileCheck --check-prefix=LOWINCENTIVE %s
+# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=cdsplit \
+# RUN:         --call-scale=1.0 --print-split --print-only=chain \
+# RUN:         --data=%t.fdata --reorder-blocks=ext-tsp \
+# RUN:     2>&1 | FileCheck --check-prefix=MEDINCENTIVE %s
+# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=cdsplit \
+# RUN:         --call-scale=1000.0 --print-split --print-only=chain \
+# RUN:         --data=%t.fdata --reorder-blocks=ext-tsp \
+# RUN:     2>&1 | FileCheck --check-prefix=HIGHINCENTIVE %s
+
+# LOWINCENTIVE: Binary Function "chain" after split-functions
+# LOWINCENTIVE: {{^\.Ltmp5}}
+# LOWINCENTIVE: -------   HOT-COLD SPLIT POINT   -------
+# LOWINCENTIVE: {{^\.LFT1}}
+
+# MEDINCENTIVE: Binary Function "chain" after split-functions
+# MEDINCENTIVE: {{^\.Ltmp1}}
+# MEDINCENTIVE: -------   HOT-COLD SPLIT POINT   -------
+# MEDINCENTIVE: {{^\.LFT1}}
+# MEDINCENTIVE: -------   HOT-COLD SPLIT POINT   -------
+# MEDINCENTIVE: {{^\.Ltmp0}}
+# MEDINCENTIVE: {{^\.Ltmp2}}
+# MEDINCENTIVE: {{^\.Ltmp3}}
+# MEDINCENTIVE: {{^\.Ltmp4}}
+# MEDINCENTIVE: {{^\.Ltmp5}}
+
+# HIGHINCENTIVE: Binary Function "chain" after split-functions
+# HIGHINCENTIVE: {{^\.LBB00}}
+# HIGHINCENTIVE: -------   HOT-COLD SPLIT POINT   -------
+# HIGHINCENTIVE: {{^\.LFT1}}
+# HIGHINCENTIVE: -------   HOT-COLD SPLIT POINT   -------
+# HIGHINCENTIVE: {{^\.LFT0}}
+# HIGHINCENTIVE: {{^\.Ltmp1}}
+# HIGHINCENTIVE: {{^\.Ltmp0}}
+# HIGHINCENTIVE: {{^\.Ltmp2}}
+# HIGHINCENTIVE: {{^\.Ltmp3}}
+# HIGHINCENTIVE: {{^\.Ltmp4}}
+# HIGHINCENTIVE: {{^\.Ltmp5}}
+
+
+
+        .text
+        .globl  chain
+        .type   chain, @function
+chain: # Function whose block layout the FileCheck patterns above inspect.
+        pushq   %rbp
+        movq    %rsp, %rbp
+        cmpl    $2, %edi # Flags from %edi - 2 drive the jge below.
+LLentry_LLchain_start:
+        jge     LLchain_start
+# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLchain_start# 0 10
+# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLfast# 0 500
+LLfast: # Fall-through of the entry branch (count 500 in the profile records).
+        movl    $5, %eax
+LLfast_LLexit:
+        jmp     LLexit
+# FDATA: 1 chain #LLfast_LLexit# 1 chain #LLexit# 0 500
+LLchain_start: # Target of the entry branch (count 10 in the profile records).
+        movl    $10, %eax
+LLchain_start_LLchain1:
+        jge     LLchain1
+# FDATA: 1 chain #LLchain_start_LLchain1# 1 chain #LLchain1# 0 10
+# FDATA: 1 chain #LLchain_start_LLchain1# 1 chain #LLcold# 0 0
+LLcold: # Fall-through of the jge above; its profile record has count 0.
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+LLchain1:
+        addl    $1, %eax
+LLchain1_LLchain2:
+        jmp     LLchain2
+# FDATA: 1 chain #LLchain1_LLchain2# 1 chain #LLchain2# 0 10
+LLchain2:
+        addl    $1, %eax
+LLchain2_LLchain3:
+        jmp     LLchain3
+# FDATA: 1 chain #LLchain2_LLchain3# 1 chain #LLchain3# 0 10
+LLchain3:
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+LLchain3_LLchain4:
+        jmp     LLchain4
+# FDATA: 1 chain #LLchain3_LLchain4# 1 chain #LLchain4# 0 10
+LLchain4:
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+        addl    $1, %eax
+LLchain4_LLexit:
+        jmp     LLexit
+# FDATA: 1 chain #LLchain4_LLexit# 1 chain #LLexit# 0 10
+LLexit: # Common exit reached from LLfast and LLchain4.
+        popq    %rbp
+        ret
+LLchain_end:
+        .size   chain, LLchain_end-chain
+
+
+        .globl  main
+        .type   main, @function
+main:
+        pushq   %rbp
+        movq    %rsp, %rbp
+        movl    $1, %edi # 1 < 2, so chain's entry jge falls through to LLfast.
+LLmain_chain1:
+        call    chain
+# FDATA: 1 main #LLmain_chain1# 1 chain 0 0 500
+        movl    $4, %edi # 4 >= 2, so chain's entry jge goes to LLchain_start.
+LLmain_chain2:
+        call    chain
+# FDATA: 1 main #LLmain_chain2# 1 chain 0 0 10
+        xorl    %eax, %eax
+        popq    %rbp
+        retq
+.Lmain_end:
+        .size   main, .Lmain_end-main