Skip to content

Commit

Permalink
[BOLT] CDSplit main logic part 2/2 (llvm#74032)
Browse files Browse the repository at this point in the history
This diff implements the main splitting logic of CDSplit. CDSplit
processes functions in a binary in parallel. For each function BF, it
assumes that all other functions are hot-cold split. For each possible
hot-warm split point of BF, it computes its corresponding SplitScore,
and chooses the split point with the best SplitScore. The SplitScore of
each split point is computed in the following way: each call edge or
jump edge has an edge score that is proportional to its execution count,
and inversely proportional to its distance. The SplitScore of a split
point is a sum of edge scores over a fixed set of edges whose distance
can change due to hot-warm splitting BF. This set contains all cover
calls in the form of X->Y or Y->X given function order [... X ... BF ...
Y ...]; we refer to the sum of edge scores over the set of cover calls
as CoverCallScore. This set also contains all jump edges (branches)
within BF as well as all call edges originating from BF; we refer to the
sum of edge scores over this set of edges as LocalScore. CDSplit finds
the split index maximizing CoverCallScore + LocalScore.
  • Loading branch information
ShatianWang authored Dec 1, 2023
1 parent a37c69e commit 4483cf2
Show file tree
Hide file tree
Showing 3 changed files with 484 additions and 3 deletions.
203 changes: 200 additions & 3 deletions bolt/lib/Passes/SplitFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,16 @@ static cl::opt<double> CallScale(
"call-scale",
cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"),
cl::init(0.95), cl::ReallyHidden, cl::cat(BoltOptCategory));

// Exponent applied to (call length + 1) in the denominator of the call edge
// score computed by computeCallScore. Initialized to 0.05; hidden option,
// meaningful only with --split-strategy=cdsplit.
static cl::opt<double>
CallPower("call-power",
cl::desc("Call score power (when --split-strategy=cdsplit)"),
cl::init(0.05), cl::ReallyHidden, cl::cat(BoltOptCategory));

// Exponent applied to (jump length + 1) in the denominator of the jump edge
// score computed by computeJumpScore. Initialized to 0.15; hidden option,
// meaningful only with --split-strategy=cdsplit.
static cl::opt<double>
JumpPower("jump-power",
cl::desc("Jump score power (when --split-strategy=cdsplit)"),
cl::init(0.15), cl::ReallyHidden, cl::cat(BoltOptCategory));
} // namespace opts

namespace {
Expand Down Expand Up @@ -195,6 +205,13 @@ struct SplitCacheDirected final : public SplitStrategy {
size_t Count;
};

/// Bookkeeping for one candidate hot-warm split point of a function.
struct SplitScore {
  /// Index of the last block kept in the hot fragment (blocks with a layout
  /// index <= SplitIndex stay hot). Initialized to 0 so that a
  /// default-constructed score never carries an indeterminate index.
  size_t SplitIndex = 0;
  /// Reduction of the hot fragment size obtained by splitting at SplitIndex.
  size_t HotSizeReduction = 0;
  /// Sum of the scores of calls originating in the function and of the
  /// jumps within it.
  double LocalScore = 0;
  /// Sum of the scores of the cover calls; stays 0 when it was not computed.
  double CoverCallScore = 0;
};

// Auxiliary variables used by the algorithm.
size_t TotalNumBlocks{0};
size_t OrigHotSectionSize{0};
Expand Down Expand Up @@ -340,8 +357,9 @@ struct SplitCacheDirected final : public SplitStrategy {
// We only care about new addresses of blocks in hot/warm.
if (BB->getFragmentNum() == FragmentNum::cold())
break;
const size_t NewSize = BB->getOutputSize();
BB->setOutputStartAddress(CurrentAddr);
CurrentAddr += BB->getOutputSize();
CurrentAddr += NewSize;
BB->setOutputEndAddress(CurrentAddr);
if (BB->getLayoutIndex() == SplitIndex) {
NewHotEndAddr = CurrentAddr;
Expand Down Expand Up @@ -402,13 +420,192 @@ struct SplitCacheDirected final : public SplitStrategy {
return CoverCalls;
}

/// Score a single call edge: the execution count scaled by opts::CallScale,
/// divided by (\p CallLength + 1) raised to opts::CallPower.
double computeCallScore(uint64_t CallCount, size_t CallLength) {
  // The length is bumped by one so that a zero-length call never yields a
  // zero base for the power below.
  const double Base = static_cast<double>(CallLength + 1);
  const double Weighted = opts::CallScale * static_cast<double>(CallCount);
  return Weighted / std::pow(Base, opts::CallPower);
}

/// Score a single jump (branch) edge: the execution count divided by
/// (\p JumpLength + 1) raised to opts::JumpPower.
double computeJumpScore(uint64_t JumpCount, size_t JumpLength) {
  // The length is bumped by one so that a zero-length jump never yields a
  // zero base for the power below.
  const double Base = static_cast<double>(JumpLength + 1);
  const double Weight = static_cast<double>(JumpCount);
  return Weight / std::pow(Base, opts::JumpPower);
}

/// Compute sum of scores over jumps within \p BlockOrder given \p SplitIndex.
/// Increament Score.LocalScore in place by the sum.
void computeJumpScore(const BasicBlockOrder &BlockOrder,
const size_t SplitIndex, SplitScore &Score) {

for (const BinaryBasicBlock *SrcBB : BlockOrder) {
if (SrcBB->getKnownExecutionCount() == 0)
continue;

const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;

for (const auto Pair : zip(SrcBB->successors(), SrcBB->branch_info())) {
const BinaryBasicBlock *DstBB = std::get<0>(Pair);
const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
const size_t JumpCount = Branch.Count;

if (JumpCount == 0)
continue;

const size_t DstBBStartAddr = DstBB->getOutputAddressRange().first;
const size_t NewJumpLength =
AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
Score.LocalScore += computeJumpScore(JumpCount, NewJumpLength);
}
}
}

/// Add to Score.LocalScore the score of every call edge that originates in a
/// block of \p BlockOrder. Source addresses come from the blocks' output
/// address ranges; callee addresses come from BBOffsets, shifted down by
/// Score.HotSizeReduction when the callee lies after the current function.
/// Does nothing when opts::CallScale is zero. \p SplitIndex is not read here.
void computeLocalCallScore(const BasicBlockOrder &BlockOrder,
                           const size_t SplitIndex, SplitScore &Score) {
  if (opts::CallScale == 0)
    return;

  // Global index of the function's last block; any callee with a larger
  // global index is placed after the current function.
  const size_t LastGlobalIndex = GlobalIndices[BlockOrder.back()];

  for (const BinaryBasicBlock *CallerBB : BlockOrder) {
    // The caller's execution count is used as the count of each of its calls;
    // callers with a zero count contribute nothing.
    const size_t CallCount = CallerBB->getKnownExecutionCount();
    if (CallCount == 0)
      continue;

    // Estimated end address of the caller block for the current split.
    const size_t CallerEnd = CallerBB->getOutputAddressRange().second;

    for (const BinaryBasicBlock *CalleeBB : Callees[GlobalIndices[CallerBB]]) {
      // Callees before the current function keep their start address; callees
      // after it move up by the amount the hot fragment shrinks.
      const size_t OrigCalleeStart = BBOffsets[CalleeBB];
      const bool MovesUp = GlobalIndices[CalleeBB] > LastGlobalIndex;
      assert(!MovesUp || OrigCalleeStart >= Score.HotSizeReduction);
      const size_t CalleeStart =
          MovesUp ? OrigCalleeStart - Score.HotSizeReduction : OrigCalleeStart;

      const size_t Distance = AbsoluteDifference(CallerEnd, CalleeStart);
      Score.LocalScore += computeCallScore(CallCount, Distance);
    }
  }
}

/// Compute sum of splitting scores for cover calls of the input function.
/// Increament Score.CoverCallScore in place by the sum.
void computeCoverCallScore(const BasicBlockOrder &BlockOrder,
const size_t SplitIndex,
const std::vector<CallInfo> &CoverCalls,
SplitScore &Score) {
if (opts::CallScale == 0)
return;

for (const CallInfo CI : CoverCalls) {
assert(CI.Length >= Score.HotSizeReduction &&
"Length of cover calls must exceed reduced size of hot fragment.");
// Compute the new length of the call, which is shorter than the original
// one by the size of the splitted fragment minus the total size increase.
const size_t NewCallLength = CI.Length - Score.HotSizeReduction;
Score.CoverCallScore += computeCallScore(CI.Count, NewCallLength);
}
}

/// Score the hot-warm split of a function at \p SplitIndex.
///
/// The returned SplitScore holds the split index, the hot fragment size
/// reduction it yields, the local score (calls originating in the function
/// plus jumps within it) and the cover call score. The cover call score is
/// expensive, so it is computed only when this split has a larger local score
/// or a larger hot size reduction than \p ReferenceScore; otherwise it keeps
/// its default value. When the estimated hot end would grow, only the split
/// index is filled in. \p BF is not read here.
SplitScore computeSplitScore(const BinaryFunction &BF,
                             const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex,
                             const std::vector<CallInfo> &CoverCalls,
                             const SplitScore &ReferenceScore) {
  SplitScore Result;
  Result.SplitIndex = SplitIndex;

  // Refresh BinaryBasicBlock::OutputAddressRange with the estimated start and
  // end addresses that result from hot-warm splitting at SplitIndex.
  size_t HotEndBefore;
  size_t HotEndAfter;
  std::tie(HotEndBefore, HotEndAfter) =
      estimatePostSplitBBAddress(BlockOrder, SplitIndex);

  // A split that makes the hot fragment end later is not worth scoring.
  if (HotEndBefore < HotEndAfter)
    return Result;

  Result.HotSizeReduction = HotEndBefore - HotEndAfter;

  // LocalScore, part one: call edges originating in this function, which may
  // become shorter or longer depending on SplitIndex.
  computeLocalCallScore(BlockOrder, SplitIndex, Result);

  // LocalScore, part two: jump edges whose source and destination blocks both
  // belong to this function.
  computeJumpScore(BlockOrder, SplitIndex, Result);

  // Skip the cover call pass when the reference split is at least as good on
  // both the local score and the hot size reduction.
  const bool NoBetterThanReference =
      Result.LocalScore <= ReferenceScore.LocalScore &&
      Result.HotSizeReduction <= ReferenceScore.HotSizeReduction;
  if (!NoBetterThanReference)
    computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Result);

  return Result;
}

/// Find the best index for splitting. The returned value is the index of the
/// last hot basic block. Hence, "no splitting" is equivalent to returning the
/// value which is one less than the size of the function.
size_t findSplitIndex(const BinaryFunction &BF,
const BasicBlockOrder &BlockOrder) {
// Placeholder: hot-warm split after entry block.
return 0;
// Find all function calls that can be shortened if we move blocks of the
// current function to warm/cold
const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);

// Try all possible split indices (blocks with Index <= SplitIndex are in
// hot) and find the one maximizing the splitting score.
SplitScore BestScore;
double BestScoreSum = -1.0;
SplitScore ReferenceScore;
for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
// No need to keep cold blocks in the hot section.
if (LastHotBB->getFragmentNum() == FragmentNum::cold())
break;
const SplitScore Score =
computeSplitScore(BF, BlockOrder, Index, CoverCalls, ReferenceScore);
double ScoreSum = Score.LocalScore + Score.CoverCallScore;
if (ScoreSum > BestScoreSum) {
BestScoreSum = ScoreSum;
BestScore = Score;
}
if (Score.LocalScore > ReferenceScore.LocalScore)
ReferenceScore = Score;
}

return BestScore.SplitIndex;
}
};

Expand Down
137 changes: 137 additions & 0 deletions bolt/test/X86/cdsplit-call-scale.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# Test the control of aggressiveness of 3-way splitting by -call-scale.
# When -call-scale=0.0, the tested function is split 2 ways.
# When -call-scale=1.0, the tested function is split 3 ways with 5 blocks
# in warm because of the increased benefit of shortening the call edges.
# When -call-scale=1000.0, the tested function is split 3 ways with 7 blocks
# in warm because of the strong benefit of shortening the call edges.

# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o
# RUN: link_fdata %s %t.o %t.fdata
# RUN: llvm-strip --strip-unneeded %t.o
# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=cdsplit \
# RUN: --call-scale=0.0 --print-split --print-only=chain \
# RUN: --data=%t.fdata --reorder-blocks=ext-tsp \
# RUN: 2>&1 | FileCheck --check-prefix=LOWINCENTIVE %s
# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=cdsplit \
# RUN: --call-scale=1.0 --print-split --print-only=chain \
# RUN: --data=%t.fdata --reorder-blocks=ext-tsp \
# RUN: 2>&1 | FileCheck --check-prefix=MEDINCENTIVE %s
# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=cdsplit \
# RUN: --call-scale=1000.0 --print-split --print-only=chain \
# RUN: --data=%t.fdata --reorder-blocks=ext-tsp \
# RUN: 2>&1 | FileCheck --check-prefix=HIGHINCENTIVE %s

# LOWINCENTIVE: Binary Function "chain" after split-functions
# LOWINCENTIVE: {{^\.Ltmp5}}
# LOWINCENTIVE: ------- HOT-COLD SPLIT POINT -------
# LOWINCENTIVE: {{^\.LFT1}}

# MEDINCENTIVE: Binary Function "chain" after split-functions
# MEDINCENTIVE: {{^\.Ltmp1}}
# MEDINCENTIVE: ------- HOT-COLD SPLIT POINT -------
# MEDINCENTIVE: {{^\.LFT1}}
# MEDINCENTIVE: ------- HOT-COLD SPLIT POINT -------
# MEDINCENTIVE: {{^\.Ltmp0}}
# MEDINCENTIVE: {{^\.Ltmp2}}
# MEDINCENTIVE: {{^\.Ltmp3}}
# MEDINCENTIVE: {{^\.Ltmp4}}
# MEDINCENTIVE: {{^\.Ltmp5}}

# HIGHINCENTIVE: Binary Function "chain" after split-functions
# HIGHINCENTIVE: {{^\.LBB00}}
# HIGHINCENTIVE: ------- HOT-COLD SPLIT POINT -------
# HIGHINCENTIVE: {{^\.LFT1}}
# HIGHINCENTIVE: ------- HOT-COLD SPLIT POINT -------
# HIGHINCENTIVE: {{^\.LFT0}}
# HIGHINCENTIVE: {{^\.Ltmp1}}
# HIGHINCENTIVE: {{^\.Ltmp0}}
# HIGHINCENTIVE: {{^\.Ltmp2}}
# HIGHINCENTIVE: {{^\.Ltmp3}}
# HIGHINCENTIVE: {{^\.Ltmp4}}
# HIGHINCENTIVE: {{^\.Ltmp5}}



.text
# Function under test. The entry block compares %edi with 2 and either falls
# through to LLfast or branches to LLchain_start, which leads through
# LLchain1..LLchain4 to LLexit. LLcold sits between LLchain_start and
# LLchain1.
# NOTE(review): the trailing number of each FDATA record below looks like the
# profile count of that edge (500 on the LLfast path, 10 on the chain path,
# 0 into LLcold) - confirm against the link_fdata format.
.globl chain
.type chain, @function
chain:
pushq %rbp
movq %rsp, %rbp
cmpl $2, %edi
LLentry_LLchain_start:
jge LLchain_start
# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLchain_start# 0 10
# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLfast# 0 500
# Fall-through path: set the result to 5 and jump to the exit block.
LLfast:
movl $5, %eax
LLfast_LLexit:
jmp LLexit
# FDATA: 1 chain #LLfast_LLexit# 1 chain #LLexit# 0 500
# Taken path: set the result to 10, then branch to LLchain1 or fall into LLcold.
LLchain_start:
movl $10, %eax
LLchain_start_LLchain1:
jge LLchain1
# FDATA: 1 chain #LLchain_start_LLchain1# 1 chain #LLchain1# 0 10
# FDATA: 1 chain #LLchain_start_LLchain1# 1 chain #LLcold# 0 0
# Block whose incoming edge is recorded with a zero count above.
LLcold:
addl $1, %eax
addl $1, %eax
addl $1, %eax
addl $1, %eax
addl $1, %eax
addl $1, %eax
# Chain of blocks linked by unconditional jumps.
LLchain1:
addl $1, %eax
LLchain1_LLchain2:
jmp LLchain2
# FDATA: 1 chain #LLchain1_LLchain2# 1 chain #LLchain2# 0 10
LLchain2:
addl $1, %eax
LLchain2_LLchain3:
jmp LLchain3
# FDATA: 1 chain #LLchain2_LLchain3# 1 chain #LLchain3# 0 10
LLchain3:
addl $1, %eax
addl $1, %eax
addl $1, %eax
addl $1, %eax
addl $1, %eax
LLchain3_LLchain4:
jmp LLchain4
# FDATA: 1 chain #LLchain3_LLchain4# 1 chain #LLchain4# 0 10
LLchain4:
addl $1, %eax
addl $1, %eax
addl $1, %eax
addl $1, %eax
addl $1, %eax
LLchain4_LLexit:
jmp LLexit
# FDATA: 1 chain #LLchain4_LLexit# 1 chain #LLexit# 0 10
# Common exit block: restore %rbp and return.
LLexit:
popq %rbp
ret
LLchain_end:
.size chain, LLchain_end-chain


# Driver: calls chain twice, first with %edi = 1 and then with %edi = 4, and
# returns 0.
# NOTE(review): the trailing number of each FDATA call record below looks like
# the call count (500 for the first call site, 10 for the second) - confirm
# against the link_fdata format.
.globl main
.type main, @function
main:
pushq %rbp
movq %rsp, %rbp
movl $1, %edi
LLmain_chain1:
call chain
# FDATA: 1 main #LLmain_chain1# 1 chain 0 0 500
movl $4, %edi
LLmain_chain2:
call chain
# FDATA: 1 main #LLmain_chain2# 1 chain 0 0 10
xorl %eax, %eax
popq %rbp
retq
.Lmain_end:
.size main, .Lmain_end-main
Loading

0 comments on commit 4483cf2

Please sign in to comment.