Skip to content

Commit

Permalink
[PowerPC] Add profitability check for conversion to mtctr loops
Browse files Browse the repository at this point in the history
Add profitability checks for modifying counted loops to use the mtctr instruction.

The latency of mtctr is only justified if at least 4 comparisons will be
removed as a result.  Counted loops are usually formed relatively early, before
unrolling, so most low trip count loops don't survive to this point.  However,
if they do, we want to ensure that we do not mistakenly convert them to mtctr loops.

Use CodeMetrics to ensure the conversion is skipped only for small loops with small trip counts.

Differential Revision: https://reviews.llvm.org/D38212

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@315592 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
lei137 committed Oct 12, 2017
1 parent fadd83b commit 818cdb5
Show file tree
Hide file tree
Showing 3 changed files with 190 additions and 6 deletions.
33 changes: 32 additions & 1 deletion lib/Target/PowerPC/PPCCTRLoops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,17 @@
#include "PPC.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
#include "PPCTargetTransformInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
Expand Down Expand Up @@ -64,6 +69,13 @@ using namespace llvm;
static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
#endif

// Writing the count register (mtctr) carries a latency that only pays off
// when enough loop-exit comparisons are eliminated in exchange. This hidden
// option sets the constant-trip-count cutoff (default 4) below which a loop
// is not considered worth converting to use the count register.
static cl::opt<unsigned> SmallCTRLoopThreshold(
    "min-ctr-loop-threshold", cl::Hidden, cl::init(4),
    cl::desc("Loops with a constant trip count smaller than this value will "
             "not use the count register."));

STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");

namespace llvm {
Expand Down Expand Up @@ -95,6 +107,8 @@ namespace {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
}

private:
Expand All @@ -107,10 +121,12 @@ namespace {
const PPCTargetLowering *TLI;
const DataLayout *DL;
const TargetLibraryInfo *LibInfo;
const TargetTransformInfo *TTI;
LoopInfo *LI;
ScalarEvolution *SE;
DominatorTree *DT;
bool PreserveLCSSA;
TargetSchedModel SchedModel;
};

char PPCCTRLoops::ID = 0;
Expand Down Expand Up @@ -179,6 +195,7 @@ bool PPCCTRLoops::runOnFunction(Function &F) {
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
DL = &F.getParent()->getDataLayout();
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
Expand Down Expand Up @@ -462,10 +479,24 @@ bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) {

return false;
}

bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
bool MadeChange = false;

// Do not convert small short loops to CTR loop.
unsigned ConstTripCount = SE->getSmallConstantTripCount(L);
if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
SmallPtrSet<const Value *, 32> EphValues;
auto AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
*L->getHeader()->getParent());
CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
CodeMetrics Metrics;
for (BasicBlock *BB : L->blocks())
Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
// 6 is an approximate latency for the mtctr instruction.
if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
return false;
}

// Process nested loops first.
for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
MadeChange |= convertToCTRLoop(*I);
Expand Down
47 changes: 42 additions & 5 deletions test/CodeGen/PowerPC/ctr-minmaxnum.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
; RUN: llc -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s
; RUN: llc -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s --check-prefix=QPX
target triple = "powerpc64-unknown-linux-gnu"
; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s
; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s --check-prefix=QPX

declare float @fabsf(float)

Expand Down Expand Up @@ -38,6 +37,9 @@ loop_exit:
; CHECK-LABEL: test1:
; CHECK-NOT: mtctr
; CHECK: bl fminf
; CHECK-NOT: bl fminf
; CHECK-NOT: mtctr
; CHECK: blr

define void @test1v(<4 x float> %f, <4 x float>* %fp) {
entry:
Expand All @@ -48,16 +50,23 @@ loop_body:
%0 = call <4 x float> @llvm.minnum.v4f32(<4 x float> %f, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %0, <4 x float>* %fp, align 16
%1 = add i64 %invar_address.dim.0.01, 1
%2 = icmp eq i64 %1, 2
%2 = icmp eq i64 %1, 4
br i1 %2, label %loop_exit, label %loop_body

loop_exit:
ret void
}

; CHECK-LABEL: test1v:
; CHECK: bl fminf
; CHECK-NOT: mtctr
; CHECK: bl fminf
; CHECK-NOT: mtctr
; CHECK: bl fminf
; CHECK-NOT: mtctr
; CHECK: bl fminf
; CHECK-NOT: bl fminf
; CHECK: blr

; QPX-LABEL: test1v:
; QPX: mtctr
Expand All @@ -83,6 +92,9 @@ loop_exit:
; CHECK-LABEL: test1a:
; CHECK-NOT: mtctr
; CHECK: bl fminf
; CHECK-NOT: bl fminf
; CHECK-NOT: mtctr
; CHECK: blr

define void @test2(float %f, float* %fp) {
entry:
Expand All @@ -103,6 +115,9 @@ loop_exit:
; CHECK-LABEL: test2:
; CHECK-NOT: mtctr
; CHECK: bl fmaxf
; CHECK-NOT: bl fmaxf
; CHECK-NOT: mtctr
; CHECK: blr

define void @test2v(<4 x double> %f, <4 x double>* %fp) {
entry:
Expand All @@ -113,16 +128,23 @@ loop_body:
%0 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %f, <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>)
store <4 x double> %0, <4 x double>* %fp, align 16
%1 = add i64 %invar_address.dim.0.01, 1
%2 = icmp eq i64 %1, 2
%2 = icmp eq i64 %1, 4
br i1 %2, label %loop_exit, label %loop_body

loop_exit:
ret void
}

; CHECK-LABEL: test2v:
; CHECK: bl fmax
; CHECK-NOT: mtctr
; CHECK: bl fmax
; CHECK-NOT: mtctr
; CHECK: bl fmax
; CHECK-NOT: mtctr
; CHECK: bl fmax
; CHECK-NOT: bl fmax
; CHECK: blr

; QPX-LABEL: test2v:
; QPX: mtctr
Expand All @@ -148,6 +170,9 @@ loop_exit:
; CHECK-LABEL: test2a:
; CHECK-NOT: mtctr
; CHECK: bl fmaxf
; CHECK-NOT: bl fmaxf
; CHECK-NOT: mtctr
; CHECK: blr

define void @test3(double %f, double* %fp) {
entry:
Expand All @@ -168,6 +193,9 @@ loop_exit:
; CHECK-LABEL: test3:
; CHECK-NOT: mtctr
; CHECK: bl fmin
; CHECK-NOT: bl fmin
; CHECK-NOT: mtctr
; CHECK: blr

define void @test3a(double %f, double* %fp) {
entry:
Expand All @@ -188,6 +216,9 @@ loop_exit:
; CHECK-LABEL: test3a:
; CHECK-NOT: mtctr
; CHECK: bl fmin
; CHECK-NOT: bl fmin
; CHECK-NOT: mtctr
; CHECK: blr

define void @test4(double %f, double* %fp) {
entry:
Expand All @@ -208,6 +239,9 @@ loop_exit:
; CHECK-LABEL: test4:
; CHECK-NOT: mtctr
; CHECK: bl fmax
; CHECK-NOT: bl fmax
; CHECK-NOT: mtctr
; CHECK: blr

define void @test4a(double %f, double* %fp) {
entry:
Expand All @@ -228,4 +262,7 @@ loop_exit:
; CHECK-LABEL: test4a:
; CHECK-NOT: mtctr
; CHECK: bl fmax
; CHECK-NOT: bl fmax
; CHECK-NOT: mtctr
; CHECK: blr

116 changes: 116 additions & 0 deletions test/CodeGen/PowerPC/ctrloop-shortLoops.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=pwr8 | FileCheck %s
; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=a2q | FileCheck %s

; Verify that we do NOT generate the mtctr instruction for small loops with trip counts < 4.
; The latency of mtctr is only justified if at least 4 comparisons are removed as a result.

@a = common local_unnamed_addr global i32 0, align 4
@arr = common local_unnamed_addr global [5 x i32] zeroinitializer, align 4

; Function Attrs: norecurse nounwind readonly
; Trip count 2: the induction variable starts at 1 and the exit test compares
; its current (pre-decrement) value with 0, so the body runs for 1 and 0.
; Each iteration adds the indexed element of @arr to a running sum, which is
; returned. The argument %a is not used.
; The directives below expect no mtctr ahead of the blr.
define signext i32 @testTripCount2(i32 signext %a) {

; CHECK-LABEL: testTripCount2:
; CHECK-NOT: mtctr
; CHECK: blr

entry:
br label %for.body

for.cond.cleanup: ; preds = %for.body
ret i32 %add

for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.body ]
%Sum.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @arr, i64 0, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%add = add nsw i32 %0, %Sum.05
%indvars.iv.next = add nsw i64 %indvars.iv, -1
; The exit test uses the value before the decrement, so index 0 is processed too.
%tobool = icmp eq i64 %indvars.iv, 0
br i1 %tobool, label %for.cond.cleanup, label %for.body
}

; Function Attrs: norecurse nounwind readonly
; Trip count 3: the induction variable starts at 2 and the exit test compares
; its current (pre-decrement) value with 0, so the body runs for 2, 1 and 0.
; Each iteration adds the indexed element of @arr to a running sum, which is
; returned. The argument %a is not used.
; The directives below expect no mtctr ahead of the blr.
define signext i32 @testTripCount3(i32 signext %a) {

; CHECK-LABEL: testTripCount3:
; CHECK-NOT: mtctr
; CHECK: blr

entry:
br label %for.body

for.cond.cleanup: ; preds = %for.body
ret i32 %add

for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 2, %entry ], [ %indvars.iv.next, %for.body ]
%Sum.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @arr, i64 0, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%add = add nsw i32 %0, %Sum.05
%indvars.iv.next = add nsw i64 %indvars.iv, -1
; The exit test uses the value before the decrement, so index 0 is processed too.
%tobool = icmp eq i64 %indvars.iv, 0
br i1 %tobool, label %for.cond.cleanup, label %for.body
}

; Function Attrs: norecurse nounwind readonly

; Trip count 4: the induction variable starts at 3 and the exit test compares
; its current (pre-decrement) value with 0, so the body runs for 3, 2, 1 and 0.
; Each iteration adds the indexed element of @arr to a running sum, which is
; returned. The argument %a is not used.
; Unlike the shorter loops in this file, the directives below expect the
; count-register form (mtctr followed by bdnz).
define signext i32 @testTripCount4(i32 signext %a) {

; CHECK-LABEL: testTripCount4:
; CHECK: mtctr
; CHECK: bdnz

entry:
br label %for.body

for.cond.cleanup: ; preds = %for.body
ret i32 %add

for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 3, %entry ], [ %indvars.iv.next, %for.body ]
%Sum.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @arr, i64 0, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%add = add nsw i32 %0, %Sum.05
%indvars.iv.next = add nsw i64 %indvars.iv, -1
; The exit test uses the value before the decrement, so index 0 is processed too.
%tobool = icmp eq i64 %indvars.iv, 0
br i1 %tobool, label %for.cond.cleanup, label %for.body
}

; Function Attrs: norecurse nounwind
; Trip count 2 (the counter %dec4 starts at 1 and is compared with 0 before
; being decremented), but the loop body spans three blocks (for.body, if.then,
; if.end) and conditionally stores to @a.
; The directives below expect mtctr despite the low trip count.
; NOTE(review): presumably this loop is large enough to exceed the pass's
; small-loop instruction budget - confirm against the CodeMetrics check.
define signext i32 @testTripCount2NonSmallLoop() {

; CHECK-LABEL: testTripCount2NonSmallLoop:
; CHECK: mtctr
; CHECK: blr

entry:
%.pre = load i32, i32* @a, align 4
br label %for.body

for.body: ; preds = %entry, %if.end
; %0 carries the value of @a as seen by this iteration (loaded once in entry,
; then fed from the phi in if.end).
%0 = phi i32 [ %.pre, %entry ], [ %1, %if.end ]
%dec4 = phi i32 [ 1, %entry ], [ %dec, %if.end ]
%b.03 = phi i8 [ 0, %entry ], [ %b.1, %if.end ]
%tobool1 = icmp eq i32 %0, 0
br i1 %tobool1, label %if.end, label %if.then

if.then: ; preds = %for.body
store i32 2, i32* @a, align 4
br label %if.end

if.end: ; preds = %for.body, %if.then
%1 = phi i32 [ 2, %if.then ], [ 0, %for.body ]
%b.1 = phi i8 [ 2, %if.then ], [ %b.03, %for.body ]
%dec = add nsw i32 %dec4, -1
; The exit test uses the counter value before the decrement.
%tobool = icmp eq i32 %dec4, 0
br i1 %tobool, label %for.end, label %for.body

for.end: ; preds = %if.end
; The result is the final %b.1 widened to i32.
%conv = zext i8 %b.1 to i32
ret i32 %conv
}

0 comments on commit 818cdb5

Please sign in to comment.