Skip to content

Commit

Permalink
SLPVectorizer: Sort inputs to commutative binary operations
Browse files Browse the repository at this point in the history
Sort the operands of the other entries in the current vectorization root
according to the first entry's operands opcodes.

%conv0 = uitofp ...
%load0 = load float ...

= fmul %conv0, %load0
= fmul %load0, %conv1
= fmul %load0, %conv2

Make sure that we recursively vectorize <%conv0, %conv1, %conv2> and <%load0,
%load0, %load0>.

This makes it more likely to obtain vectorizable trees. We have to be careful
when we sort that we don't destroy 'good' existing ordering implied by source
order.

radar://15080067

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@191977 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
aschwaighofer committed Oct 4, 2013
1 parent e3fd646 commit af57bdf
Show file tree
Hide file tree
Showing 2 changed files with 357 additions and 4 deletions.
127 changes: 123 additions & 4 deletions lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,112 @@ static bool CanReuseExtract(ArrayRef<Value *> VL) {
return true;
}

static bool all_equal(SmallVectorImpl<Value *> &V) {
Value *First = V[0];
for (int i = 1, e = V.size(); i != e; ++i)
if (V[i] != First)
return false;
return true;
}

static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {

SmallVector<Value *, 16> OrigLeft, OrigRight;

bool AllSameOpcodeLeft = true;
bool AllSameOpcodeRight = true;
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
Instruction *I = cast<Instruction>(VL[i]);
Value *V0 = I->getOperand(0);
Value *V1 = I->getOperand(1);

OrigLeft.push_back(V0);
OrigRight.push_back(V1);

Instruction *I0 = dyn_cast<Instruction>(V0);
Instruction *I1 = dyn_cast<Instruction>(V1);

// Check whether all operands on one side have the same opcode. In this case
// we want to preserve the original order and not make things worse by
// reordering.
AllSameOpcodeLeft = I0;
AllSameOpcodeRight = I1;

if (i && AllSameOpcodeLeft) {
if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) {
if(P0->getOpcode() != I0->getOpcode())
AllSameOpcodeLeft = false;
} else
AllSameOpcodeLeft = false;
}
if (i && AllSameOpcodeRight) {
if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) {
if(P1->getOpcode() != I1->getOpcode())
AllSameOpcodeRight = false;
} else
AllSameOpcodeRight = false;
}

// Sort two opcodes. In the code below we try to preserve the ability to use
// broadcast of values instead of individual inserts.
// vl1 = load
// vl2 = phi
// vr1 = load
// vr2 = vr2
// = vl1 x vr1
// = vl2 x vr2
// If we just sorted according to opcode we would leave the first line in
// tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
// = vl1 x vr1
// = vr2 x vl2
// Because vr2 and vr1 are from the same load we loose the opportunity of a
// broadcast for the packed right side in the backend: we have [vr1, vl2]
// instead of [vr1, vr2=vr1].
if (I0 && I1) {
if(!i && I0->getOpcode() > I1->getOpcode()) {
Left.push_back(I1);
Right.push_back(I0);
} else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) {
// Try not to destroy a broad cast for no apparent benefit.
Left.push_back(I1);
Right.push_back(I0);
} else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] == I0) {
// Try preserve broadcasts.
Left.push_back(I1);
Right.push_back(I0);
} else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) {
// Try preserve broadcasts.
Left.push_back(I1);
Right.push_back(I0);
} else {
Left.push_back(I0);
Right.push_back(I1);
}
continue;
}
// One opcode, put the instruction on the right.
if (I0) {
Left.push_back(V1);
Right.push_back(I0);
continue;
}
Left.push_back(V0);
Right.push_back(V1);
}

bool LeftBroadcast = all_equal(Left);
bool RightBroadcast = all_equal(Right);

// Don't reorder if the operands where good to begin with.
if (!(LeftBroadcast || RightBroadcast) &&
(AllSameOpcodeRight || AllSameOpcodeLeft)) {
Left = OrigLeft;
Right = OrigRight;
}
}

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
Expand Down Expand Up @@ -775,6 +881,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of bin op.\n");

// Sort operands of the instructions so that each side is more likely to
// have the same opcode.
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right);
buildTree_rec(Left, Depth + 1);
buildTree_rec(Right, Depth + 1);
return;
}

for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
Expand Down Expand Up @@ -1331,10 +1447,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Or:
case Instruction::Xor: {
ValueList LHSVL, RHSVL;
for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
}
if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
else
for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
}

setInsertPointAfterBundle(E->Scalars);

Expand Down
234 changes: 234 additions & 0 deletions test/Transforms/SLPVectorizer/X86/operandorder.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-100 -instcombine -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s

target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"



; Make sure we order the operands of commutative operations so that we get
; bigger vectorizable trees.

; CHECK-LABEL: shuffle_operands1
; CHECK: load <2 x double>
; CHECK: fadd <2 x double>

define void @shuffle_operands1(double * noalias %from, double * noalias %to,
double %v1, double %v2) {
%from_1 = getelementptr double *%from, i64 1
%v0_1 = load double * %from
%v0_2 = load double * %from_1
%v1_1 = fadd double %v0_1, %v1
%v1_2 = fadd double %v2, %v0_2
%to_2 = getelementptr double * %to, i64 1
store double %v1_1, double *%to
store double %v1_2, double *%to_2
ret void
}

; CHECK-LABEL: shuffle_preserve_broadcast
; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
; CHECK: = insertelement <2 x double> %[[BCAST]], double %v0_1
define void @shuffle_preserve_broadcast(double * noalias %from,
double * noalias %to,
double %v1, double %v2) {
entry:
br label %lp

lp:
%p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
%from_1 = getelementptr double *%from, i64 1
%v0_1 = load double * %from
%v0_2 = load double * %from_1
%v1_1 = fadd double %v0_1, %p
%v1_2 = fadd double %v0_1, %v0_2
%to_2 = getelementptr double * %to, i64 1
store double %v1_1, double *%to
store double %v1_2, double *%to_2
br i1 undef, label %lp, label %ext

ext:
ret void
}

; CHECK-LABEL: shuffle_preserve_broadcast2
; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
; CHECK: = insertelement <2 x double> %[[BCAST]], double %v0_1
define void @shuffle_preserve_broadcast2(double * noalias %from,
double * noalias %to,
double %v1, double %v2) {
entry:
br label %lp

lp:
%p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
%from_1 = getelementptr double *%from, i64 1
%v0_1 = load double * %from
%v0_2 = load double * %from_1
%v1_1 = fadd double %p, %v0_1
%v1_2 = fadd double %v0_2, %v0_1
%to_2 = getelementptr double * %to, i64 1
store double %v1_1, double *%to
store double %v1_2, double *%to_2
br i1 undef, label %lp, label %ext

ext:
ret void
}

; CHECK-LABEL: shuffle_preserve_broadcast3
; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
; CHECK: = insertelement <2 x double> %[[BCAST]], double %v0_1
define void @shuffle_preserve_broadcast3(double * noalias %from,
double * noalias %to,
double %v1, double %v2) {
entry:
br label %lp

lp:
%p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
%from_1 = getelementptr double *%from, i64 1
%v0_1 = load double * %from
%v0_2 = load double * %from_1
%v1_1 = fadd double %p, %v0_1
%v1_2 = fadd double %v0_1, %v0_2
%to_2 = getelementptr double * %to, i64 1
store double %v1_1, double *%to
store double %v1_2, double *%to_2
br i1 undef, label %lp, label %ext

ext:
ret void
}


; CHECK-LABEL: shuffle_preserve_broadcast4
; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
; CHECK: = insertelement <2 x double> %[[BCAST]], double %v0_1
define void @shuffle_preserve_broadcast4(double * noalias %from,
double * noalias %to,
double %v1, double %v2) {
entry:
br label %lp

lp:
%p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
%from_1 = getelementptr double *%from, i64 1
%v0_1 = load double * %from
%v0_2 = load double * %from_1
%v1_1 = fadd double %v0_2, %v0_1
%v1_2 = fadd double %p, %v0_1
%to_2 = getelementptr double * %to, i64 1
store double %v1_1, double *%to
store double %v1_2, double *%to_2
br i1 undef, label %lp, label %ext

ext:
ret void
}

; CHECK-LABEL: shuffle_preserve_broadcast5
; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
; CHECK: = insertelement <2 x double> %[[BCAST]], double %v0_1
define void @shuffle_preserve_broadcast5(double * noalias %from,
double * noalias %to,
double %v1, double %v2) {
entry:
br label %lp

lp:
%p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
%from_1 = getelementptr double *%from, i64 1
%v0_1 = load double * %from
%v0_2 = load double * %from_1
%v1_1 = fadd double %v0_1, %v0_2
%v1_2 = fadd double %p, %v0_1
%to_2 = getelementptr double * %to, i64 1
store double %v1_1, double *%to
store double %v1_2, double *%to_2
br i1 undef, label %lp, label %ext

ext:
ret void
}


; CHECK-LABEL: shuffle_preserve_broadcast6
; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
; CHECK: = insertelement <2 x double> %[[BCAST]], double %v0_1
define void @shuffle_preserve_broadcast6(double * noalias %from,
double * noalias %to,
double %v1, double %v2) {
entry:
br label %lp

lp:
%p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
%from_1 = getelementptr double *%from, i64 1
%v0_1 = load double * %from
%v0_2 = load double * %from_1
%v1_1 = fadd double %v0_1, %v0_2
%v1_2 = fadd double %v0_1, %p
%to_2 = getelementptr double * %to, i64 1
store double %v1_1, double *%to
store double %v1_2, double *%to_2
br i1 undef, label %lp, label %ext

ext:
ret void
}

; Make sure we don't scramble operands when we reorder them and destroy
; 'good' source order.

; CHECK-LABEL: good_load_order

; CHECK: %[[V1:[0-9]+]] = load <4 x float>*
; CHECK: %[[V2:[0-9]+]] = insertelement <4 x float> undef, float %1, i32 0
; CHECK: %[[V3:[0-9]+]] = shufflevector <4 x float> %[[V2]], <4 x float> %[[V1]], <4 x i32> <i32 0, i32 4, i32 5, i32 6>
; CHECK: = fmul <4 x float> %[[V1]], %[[V3]]

@a = common global [32000 x float] zeroinitializer, align 16

define void @good_load_order() {
entry:
br label %for.cond1.preheader

for.cond1.preheader:
%0 = load float* getelementptr inbounds ([32000 x float]* @a, i64 0, i64 0), align 16
br label %for.body3

for.body3:
%1 = phi float [ %0, %for.cond1.preheader ], [ %10, %for.body3 ]
%indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
%2 = add nsw i64 %indvars.iv, 1
%arrayidx = getelementptr inbounds [32000 x float]* @a, i64 0, i64 %2
%3 = load float* %arrayidx, align 4
%arrayidx5 = getelementptr inbounds [32000 x float]* @a, i64 0, i64 %indvars.iv
%mul6 = fmul float %3, %1
store float %mul6, float* %arrayidx5, align 4
%4 = add nsw i64 %indvars.iv, 2
%arrayidx11 = getelementptr inbounds [32000 x float]* @a, i64 0, i64 %4
%5 = load float* %arrayidx11, align 4
%mul15 = fmul float %5, %3
store float %mul15, float* %arrayidx, align 4
%6 = add nsw i64 %indvars.iv, 3
%arrayidx21 = getelementptr inbounds [32000 x float]* @a, i64 0, i64 %6
%7 = load float* %arrayidx21, align 4
%mul25 = fmul float %7, %5
store float %mul25, float* %arrayidx11, align 4
%8 = add nsw i64 %indvars.iv, 4
%arrayidx31 = getelementptr inbounds [32000 x float]* @a, i64 0, i64 %8
%9 = load float* %arrayidx31, align 4
%mul35 = fmul float %9, %7
store float %mul35, float* %arrayidx21, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
%arrayidx41 = getelementptr inbounds [32000 x float]* @a, i64 0, i64 %indvars.iv.next
%10 = load float* %arrayidx41, align 4
%mul45 = fmul float %10, %9
store float %mul45, float* %arrayidx31, align 4
%11 = trunc i64 %indvars.iv.next to i32
%cmp2 = icmp slt i32 %11, 31995
br i1 %cmp2, label %for.body3, label %for.end

for.end:
ret void
}

0 comments on commit af57bdf

Please sign in to comment.