[X86] Lower VSELECT into SHRUNKBLEND when we shrink the bits used into the
condition to match a blend.

This prevents optimizations that work on VSELECT from performing invalid
transformations. Indeed, the optimized condition does not match the vector
boolean content that is expected, and bad things may happen.
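
As a rough, standalone illustration (plain C++, not code from this patch): under
the x86 vector-boolean convention a true lane is all-ones, but a variable blend
reads only the sign bit of each condition lane, so the demanded-bits machinery
may legally keep just that bit. Any other user that still relies on the
all-ones value, e.g. a select lowered to an and, then observes the wrong mask.
The values and names below are illustrative only.

#include <cstdint>
#include <cstdio>

int main() {
  // x86 vector-boolean convention: a "true" lane has all bits set.
  const uint32_t OrigMask   = 0xFFFFFFFFu; // what generic VSELECT users expect
  const uint32_t ShrunkMask = 0x80000000u; // only the bit a blend actually reads

  const uint32_t A = 0x11111111u, B = 0x22222222u;

  // A blend (vblendvps/vblendvpd-style) tests only the sign bit, so both
  // masks pick the same operand.
  uint32_t BlendOrig   = (static_cast<int32_t>(OrigMask)   < 0) ? A : B; // A
  uint32_t BlendShrunk = (static_cast<int32_t>(ShrunkMask) < 0) ? A : B; // A

  // select(mask, A, 0) can be lowered to (mask & A), but only if a true
  // lane is all-ones; with the shrunk mask the result is wrong.
  uint32_t AndOrig   = OrigMask   & A; // 0x11111111 (correct)
  uint32_t AndShrunk = ShrunkMask & A; // 0x00000000 (wrong)

  std::printf("blend %08x/%08x  and %08x/%08x\n",
              BlendOrig, BlendShrunk, AndOrig, AndShrunk);
  return 0;
}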

This patch yields the exact same code on the whole test-suite + specs (-O3 and
-O3 -march=core-avx2); it improves one test case (vector-blend.ll) and fixes a
bug reduced in vselect-avx.ll.

<rdar://problem/18819506>


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@221429 91177308-0d34-0410-b5e6-96231b3b80d8
Quentin Colombet committed Nov 6, 2014
1 parent 4ebcbd0 commit d465ece
Showing 5 changed files with 130 additions and 25 deletions.
10 changes: 10 additions & 0 deletions lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2131,6 +2131,16 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
case X86ISD::GlobalBaseReg:
return getGlobalBaseReg();

case X86ISD::SHRUNKBLEND: {
// SHRUNKBLEND selects like a regular VSELECT.
SDValue VSelect = CurDAG->getNode(
ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
Node->getOperand(1), Node->getOperand(2));
ReplaceUses(SDValue(Node, 0), VSelect);
SelectCode(VSelect.getNode());
// We already called ReplaceUses.
return nullptr;
}

case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_AND:
70 changes: 51 additions & 19 deletions lib/Target/X86/X86ISelLowering.cpp
@@ -19025,6 +19025,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::ANDNP: return "X86ISD::ANDNP";
case X86ISD::PSIGN: return "X86ISD::PSIGN";
case X86ISD::BLENDI: return "X86ISD::BLENDI";
case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
case X86ISD::SUBUS: return "X86ISD::SUBUS";
case X86ISD::HADD: return "X86ISD::HADD";
case X86ISD::HSUB: return "X86ISD::HSUB";
@@ -22618,36 +22619,63 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// build_vector of constants. This will be taken care of in a later
// condition.
(TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
VT != MVT::v8i16)) {
VT != MVT::v8i16) &&
// Don't optimize vector of constants. Those are handled by
// the generic code and all the bits must be properly set for
// the generic optimizer.
!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();

// Don't optimize vector selects that map to mask-registers.
if (BitWidth == 1)
return SDValue();

// Check all uses of that condition operand to check whether it will be
// consumed by non-BLEND instructions, which may depend on all bits are set
// properly.
for (SDNode::use_iterator I = Cond->use_begin(),
E = Cond->use_end(); I != E; ++I)
if (I->getOpcode() != ISD::VSELECT)
// TODO: Add other opcodes eventually lowered into BLEND.
return SDValue();

assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);

APInt KnownZero, KnownOne;
TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
DCI.isBeforeLegalizeOps());
if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
(TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
TLO) &&
// Don't optimize vector of constants. Those are handled by
// the generic code and all the bits must be properly set for
// the generic optimizer.
!ISD::isBuildVectorOfConstantSDNodes(TLO.New.getNode())))
DCI.CommitTargetLoweringOpt(TLO);
TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
TLO)) {
// If we changed the computation somewhere in the DAG, this change
// will affect all users of Cond.
// Make sure to update all those users so that we do not use the
// generic VSELECT anymore. Otherwise, we may perform wrong
// optimizations because the condition no longer matches the
// expected vector boolean contents.
if (Cond != TLO.Old) {
// Check all uses of that condition operand to check whether it will be
// consumed by non-BLEND instructions, which may depend on all bits being
// set properly.
for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
I != E; ++I)
if (I->getOpcode() != ISD::VSELECT)
// TODO: Add other opcodes eventually lowered into BLEND.
return SDValue();

// Update all the users of the condition, before committing the change,
// so that the VSELECT optimizations that expect the correct vector
// boolean value will not be triggered.
for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
I != E; ++I)
DAG.ReplaceAllUsesOfValueWith(
SDValue(*I, 0),
DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
Cond, I->getOperand(1), I->getOperand(2)));
DCI.CommitTargetLoweringOpt(TLO);
return SDValue();
}
// At this point, only Cond is changed. Change the condition
// just for N so that the other users keep the opportunity to be
// optimized their own way.
DAG.ReplaceAllUsesOfValueWith(
SDValue(N, 0),
DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
TLO.New, N->getOperand(1), N->getOperand(2)));
return SDValue();
}
}

// We should generate an X86ISD::BLENDI from a vselect if its argument
@@ -22661,7 +22689,9 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// Iff we find this pattern and the build_vectors are built from
// constants, we translate the vselect into a shuffle_vector that we
// know will be matched by LowerVECTOR_SHUFFLEtoBlend.
if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
if ((N->getOpcode() == ISD::VSELECT ||
N->getOpcode() == X86ISD::SHRUNKBLEND) &&
!DCI.isBeforeLegalize()) {
SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
if (Shuffle.getNode())
return Shuffle;
@@ -24854,7 +24884,9 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::EXTRACT_VECTOR_ELT:
return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
case ISD::VSELECT:
case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget);
case ISD::SELECT:
case X86ISD::SHRUNKBLEND:
return PerformSELECTCombine(N, DAG, DCI, Subtarget);
case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
5 changes: 5 additions & 0 deletions lib/Target/X86/X86ISelLowering.h
@@ -190,6 +190,11 @@ namespace llvm {
/// BLENDI - Blend where the selector is an immediate.
BLENDI,

/// SHRUNKBLEND - Blend where the condition has been shrunk.
/// This is used to emphasize that the condition mask is
/// no longer valid for generic VSELECT optimizations.
SHRUNKBLEND,

/// ADDSUB - Combined add and sub on an FP vector.
ADDSUB,

12 changes: 6 additions & 6 deletions test/CodeGen/X86/vector-blend.ll
@@ -315,9 +315,9 @@ define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
; SSE41-LABEL: vsel_double8:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm4[1]
; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm5[0,1]
; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],xmm6[1]
; SSE41-NEXT: blendpd {{.*#+}} xmm3 = xmm7[0,1]
; SSE41-NEXT: movaps %xmm5, %xmm1
; SSE41-NEXT: movaps %xmm7, %xmm3
; SSE41-NEXT: retq
;
; AVX-LABEL: vsel_double8:
@@ -353,10 +353,10 @@ define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
;
; SSE41-LABEL: vsel_i648:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm4[1]
; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm5[0,1]
; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],xmm6[1]
; SSE41-NEXT: blendpd {{.*#+}} xmm3 = xmm7[0,1]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
; SSE41-NEXT: movaps %xmm5, %xmm1
; SSE41-NEXT: movaps %xmm7, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: vsel_i648:
58 changes: 58 additions & 0 deletions test/CodeGen/X86/vselect-avx.ll
@@ -25,3 +25,61 @@ body:
store <4 x i16> %predphi42, <4 x i16>* %b, align 8
ret void
}

; Improve code coverage.
;
; When shrinking the condition used in the select to match a blend, this
; test case exercises the path where the modified node is not the root
; of the condition.
;
; CHECK-LABEL: test2:
; CHECK: vpslld $31, %xmm0, %xmm0
; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1
; CHECK-NEXT: vpshufd $78, %xmm0, %xmm0 ## xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, [[MASK:%ymm[0-9]+]]
; CHECK: vblendvpd [[MASK]]
; CHECK: retq
define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
bb:
%arrayidx1928 = getelementptr inbounds double** %call1559, i64 %indvars.iv4198
%tmp1888 = load double** %arrayidx1928, align 8
%predphi.v.v = select <4 x i1> %tmp1895, <4 x double> <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
%tmp1900 = bitcast double* %tmp1888 to <4 x double>*
store <4 x double> %predphi.v.v, <4 x double>* %tmp1900, align 8
ret void
}

; For this test, we used to optimize the conditional mask for the blend, i.e.,
; we shrunk some of its bits.
; However, this same mask was used in another select (%predphi31) that turned out
; to be optimized into an and. In that case, the conditional mask was wrong.
;
; Make sure that the and is fed by the original mask.
; (A standalone sketch of this pitfall follows the test below.)
;
; <rdar://problem/18819506>

; Note: For now, hard code ORIG_MASK and SHRUNK_MASK registers, because we
; cannot express that SHRUNK_MASK must not be equal to ORIG_MASK. Otherwise,
; even a faulty pattern would pass!
;
; CHECK-LABEL: test3:
; Compute the original mask.
; CHECK: vpcmpeqd {{%xmm[0-9]+}}, {{%xmm[0-9]+}}, [[ORIG_MASK:%xmm0]]
; Shrink the bits of the mask.
; CHECK-NEXT: vpslld $31, [[ORIG_MASK]], [[SHRUNK_MASK:%xmm3]]
; Use the shrunk mask in the blend.
; CHECK-NEXT: vblendvps [[SHRUNK_MASK]], %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
; Use the original mask in the and.
; CHECK-NEXT: vpand LCPI2_2(%rip), [[ORIG_MASK]], {{%xmm[0-9]+}}
; CHECK: retq
define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) {
%tmp6 = srem <4 x i32> %induction30, <i32 3, i32 3, i32 3, i32 3>
%tmp7 = icmp eq <4 x i32> %tmp6, zeroinitializer
%predphi = select <4 x i1> %tmp7, <4 x i16> %tmp3, <4 x i16> %tmp12
%predphi31 = select <4 x i1> %tmp7, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer

store <4 x i16> %predphi31, <4 x i16>* %tmp16, align 8
store <4 x i16> %predphi, <4 x i16>* %tmp17, align 8
ret void
}
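
To tie this back to test3, here is a hedged, standalone C++ model of its two
selects, using made-up lane values (nothing below is generated code): %predphi
becomes a blend and tolerates a sign-bit-only mask, while %predphi31 is lowered
to an and with the mask and therefore needs the original all-ones/zero lanes.

#include <cstdint>
#include <cstdio>

int main() {
  // True lanes under the x86 vector-boolean convention are all-ones.
  uint16_t Orig[4]   = {0xFFFF, 0x0000, 0xFFFF, 0x0000}; // full %tmp7 mask
  uint16_t Shrunk[4] = {0x8000, 0x0000, 0x8000, 0x0000}; // sign bit only
  uint16_t Tmp3[4]   = {10, 20, 30, 40};
  uint16_t Tmp12[4]  = {50, 60, 70, 80};

  for (int I = 0; I != 4; ++I) {
    // %predphi: a blend reads only the sign bit, so the shrunk mask still
    // selects the right operand.
    uint16_t Blend = (static_cast<int16_t>(Shrunk[I]) < 0) ? Tmp3[I] : Tmp12[I];

    // %predphi31 = select(cond, -1, 0): lowered to (mask & -1), i.e. the
    // mask itself, so the original mask is required; the shrunk mask
    // yields 0x8000 instead of 0xFFFF on true lanes.
    uint16_t FromOrig   = Orig[I];
    uint16_t FromShrunk = Shrunk[I];

    std::printf("lane %d: blend=%u predphi31(orig)=0x%04x (shrunk)=0x%04x\n",
                I, (unsigned)Blend, (unsigned)FromOrig, (unsigned)FromShrunk);
  }
  return 0;
}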
