Skip to content

Commit

Permalink
[VectorLegalizer] Expansion of CTLZ using CTPOP when possible
Browse files Browse the repository at this point in the history
This patch avoids scalarization of CTLZ by instead expanding to use CTPOP (ref: "Hacker's Delight") when the necessary operations are available.

This also adds the necessary cost models for X86 SSE2 targets (the main beneficiary) to ensure vectorization only happens when it's useful.

Differential Revision: https://reviews.llvm.org/D25910

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@286233 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
RKSimon committed Nov 8, 2016
1 parent 174270c commit 169b408
Show file tree
Hide file tree
Showing 6 changed files with 726 additions and 1,282 deletions.
56 changes: 50 additions & 6 deletions lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,8 @@ class VectorLegalizer {
SDValue ExpandStore(SDValue Op);
SDValue ExpandFNEG(SDValue Op);
SDValue ExpandBITREVERSE(SDValue Op);
SDValue ExpandCTLZ_CTTZ_ZERO_UNDEF(SDValue Op);
SDValue ExpandCTLZ(SDValue Op);
SDValue ExpandCTTZ_ZERO_UNDEF(SDValue Op);

/// \brief Implements vector promotion.
///
Expand Down Expand Up @@ -693,9 +694,11 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
return UnrollVSETCC(Op);
case ISD::BITREVERSE:
return ExpandBITREVERSE(Op);
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
return ExpandCTLZ(Op);
case ISD::CTTZ_ZERO_UNDEF:
return ExpandCTLZ_CTTZ_ZERO_UNDEF(Op);
return ExpandCTTZ_ZERO_UNDEF(Op);
default:
return DAG.UnrollVectorOp(Op.getNode());
}
Expand Down Expand Up @@ -1022,12 +1025,53 @@ SDValue VectorLegalizer::ExpandFNEG(SDValue Op) {
return DAG.UnrollVectorOp(Op.getNode());
}

SDValue VectorLegalizer::ExpandCTLZ_CTTZ_ZERO_UNDEF(SDValue Op) {
/// Expand a vector CTLZ / CTLZ_ZERO_UNDEF node.
///
/// Tries, in order: replacing CTLZ_ZERO_UNDEF with a supported CTLZ, a
/// CTPOP-based bit-smearing sequence, and finally unrolling the vector op.
SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Src = Op.getOperand(0);

  // If the non-ZERO_UNDEF version is supported we can use that instead.
  bool IsZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
  if (IsZeroUndef && TLI.isOperationLegalOrCustom(ISD::CTLZ, VT))
    return DAG.getNode(ISD::CTLZ, DL, VT, Src);

  // The CTPOP based method below requires a power-of-2 element width and
  // CTPOP, SRL, OR and XOR to be available for this type.
  unsigned EltBits = VT.getScalarSizeInBits();
  bool CanUseCTPOP = isPowerOf2_32(EltBits) &&
                     TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) &&
                     TLI.isOperationLegalOrCustom(ISD::SRL, VT) &&
                     TLI.isOperationLegalOrCustomOrPromote(ISD::OR, VT) &&
                     TLI.isOperationLegalOrCustomOrPromote(ISD::XOR, VT);

  // Otherwise go ahead and unroll.
  if (!CanUseCTPOP)
    return DAG.UnrollVectorOp(Op.getNode());

  // Lower with the CTPOP based method, shown here for a 16-bit element:
  //   u16 ctlz(u16 x) {
  //     x |= (x >> 1);
  //     x |= (x >> 2);
  //     x |= (x >> 4);
  //     x |= (x >> 8);
  //     return ctpop(~x);
  //   }
  // Ref: "Hacker's Delight" by Henry Warren
  EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
  SDValue Smeared = Src;
  for (unsigned Shift = 1; Shift != EltBits; Shift *= 2) {
    SDValue ShiftAmt = DAG.getConstant(Shift, DL, ShiftVT);
    SDValue Shifted = DAG.getNode(ISD::SRL, DL, VT, Smeared, ShiftAmt);
    Smeared = DAG.getNode(ISD::OR, DL, VT, Smeared, Shifted);
  }

  SDValue Inverted = DAG.getNOT(DL, Smeared, VT);
  return DAG.getNode(ISD::CTPOP, DL, VT, Inverted);
}

SDValue VectorLegalizer::ExpandCTTZ_ZERO_UNDEF(SDValue Op) {
// If the non-ZERO_UNDEF version is supported we can use that instead.
unsigned Opc = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ? ISD::CTLZ : ISD::CTTZ;
if (TLI.isOperationLegalOrCustom(Opc, Op.getValueType())) {
if (TLI.isOperationLegalOrCustom(ISD::CTTZ, Op.getValueType())) {
SDLoc DL(Op);
return DAG.getNode(Opc, DL, Op.getValueType(), Op.getOperand(0));
return DAG.getNode(ISD::CTTZ, DL, Op.getValueType(), Op.getOperand(0));
}

// Otherwise go ahead and unroll.
Expand Down
5 changes: 4 additions & 1 deletion lib/Target/X86/X86TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1214,7 +1214,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::BSWAP, MVT::v2i64, 7 },
{ ISD::BSWAP, MVT::v4i32, 7 },
{ ISD::BSWAP, MVT::v8i16, 7 },
/* ISD::CTLZ - currently scalarized pre-SSSE3 */
{ ISD::CTLZ, MVT::v2i64, 25 },
{ ISD::CTLZ, MVT::v4i32, 26 },
{ ISD::CTLZ, MVT::v8i16, 20 },
{ ISD::CTLZ, MVT::v16i8, 17 },
{ ISD::CTPOP, MVT::v2i64, 12 },
{ ISD::CTPOP, MVT::v4i32, 15 },
{ ISD::CTPOP, MVT::v8i16, 13 },
Expand Down
32 changes: 16 additions & 16 deletions test/Analysis/CostModel/X86/ctbits-cost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ declare <32 x i8> @llvm.ctlz.v32i8(<32 x i8>, i1)

define <2 x i64> @var_ctlz_v2i64(<2 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v2i64':
; SSE2: Found an estimated cost of 6 for instruction: %ctlz
; SSE2: Found an estimated cost of 25 for instruction: %ctlz
; SSE42: Found an estimated cost of 23 for instruction: %ctlz
; AVX: Found an estimated cost of 23 for instruction: %ctlz
%ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 0)
Expand All @@ -218,7 +218,7 @@ define <2 x i64> @var_ctlz_v2i64(<2 x i64> %a) {

define <2 x i64> @var_ctlz_v2i64u(<2 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v2i64u':
; SSE2: Found an estimated cost of 6 for instruction: %ctlz
; SSE2: Found an estimated cost of 25 for instruction: %ctlz
; SSE42: Found an estimated cost of 23 for instruction: %ctlz
; AVX: Found an estimated cost of 23 for instruction: %ctlz
%ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 1)
Expand All @@ -227,7 +227,7 @@ define <2 x i64> @var_ctlz_v2i64u(<2 x i64> %a) {

define <4 x i64> @var_ctlz_v4i64(<4 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i64':
; SSE2: Found an estimated cost of 12 for instruction: %ctlz
; SSE2: Found an estimated cost of 50 for instruction: %ctlz
; SSE42: Found an estimated cost of 46 for instruction: %ctlz
; AVX1: Found an estimated cost of 46 for instruction: %ctlz
; AVX2: Found an estimated cost of 23 for instruction: %ctlz
Expand All @@ -237,7 +237,7 @@ define <4 x i64> @var_ctlz_v4i64(<4 x i64> %a) {

define <4 x i64> @var_ctlz_v4i64u(<4 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i64u':
; SSE2: Found an estimated cost of 12 for instruction: %ctlz
; SSE2: Found an estimated cost of 50 for instruction: %ctlz
; SSE42: Found an estimated cost of 46 for instruction: %ctlz
; AVX1: Found an estimated cost of 46 for instruction: %ctlz
; AVX2: Found an estimated cost of 23 for instruction: %ctlz
Expand All @@ -247,7 +247,7 @@ define <4 x i64> @var_ctlz_v4i64u(<4 x i64> %a) {

define <4 x i32> @var_ctlz_v4i32(<4 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i32':
; SSE2: Found an estimated cost of 12 for instruction: %ctlz
; SSE2: Found an estimated cost of 26 for instruction: %ctlz
; SSE42: Found an estimated cost of 18 for instruction: %ctlz
; AVX: Found an estimated cost of 18 for instruction: %ctlz
%ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 0)
Expand All @@ -256,7 +256,7 @@ define <4 x i32> @var_ctlz_v4i32(<4 x i32> %a) {

define <4 x i32> @var_ctlz_v4i32u(<4 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i32u':
; SSE2: Found an estimated cost of 12 for instruction: %ctlz
; SSE2: Found an estimated cost of 26 for instruction: %ctlz
; SSE42: Found an estimated cost of 18 for instruction: %ctlz
; AVX: Found an estimated cost of 18 for instruction: %ctlz
%ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 1)
Expand All @@ -265,7 +265,7 @@ define <4 x i32> @var_ctlz_v4i32u(<4 x i32> %a) {

define <8 x i32> @var_ctlz_v8i32(<8 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i32':
; SSE2: Found an estimated cost of 24 for instruction: %ctlz
; SSE2: Found an estimated cost of 52 for instruction: %ctlz
; SSE42: Found an estimated cost of 36 for instruction: %ctlz
; AVX1: Found an estimated cost of 36 for instruction: %ctlz
; AVX2: Found an estimated cost of 18 for instruction: %ctlz
Expand All @@ -275,7 +275,7 @@ define <8 x i32> @var_ctlz_v8i32(<8 x i32> %a) {

define <8 x i32> @var_ctlz_v8i32u(<8 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i32u':
; SSE2: Found an estimated cost of 24 for instruction: %ctlz
; SSE2: Found an estimated cost of 52 for instruction: %ctlz
; SSE42: Found an estimated cost of 36 for instruction: %ctlz
; AVX1: Found an estimated cost of 36 for instruction: %ctlz
; AVX2: Found an estimated cost of 18 for instruction: %ctlz
Expand All @@ -285,7 +285,7 @@ define <8 x i32> @var_ctlz_v8i32u(<8 x i32> %a) {

define <8 x i16> @var_ctlz_v8i16(<8 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i16':
; SSE2: Found an estimated cost of 24 for instruction: %ctlz
; SSE2: Found an estimated cost of 20 for instruction: %ctlz
; SSE42: Found an estimated cost of 14 for instruction: %ctlz
; AVX: Found an estimated cost of 14 for instruction: %ctlz
%ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 0)
Expand All @@ -294,7 +294,7 @@ define <8 x i16> @var_ctlz_v8i16(<8 x i16> %a) {

define <8 x i16> @var_ctlz_v8i16u(<8 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i16u':
; SSE2: Found an estimated cost of 24 for instruction: %ctlz
; SSE2: Found an estimated cost of 20 for instruction: %ctlz
; SSE42: Found an estimated cost of 14 for instruction: %ctlz
; AVX: Found an estimated cost of 14 for instruction: %ctlz
%ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 1)
Expand All @@ -303,7 +303,7 @@ define <8 x i16> @var_ctlz_v8i16u(<8 x i16> %a) {

define <16 x i16> @var_ctlz_v16i16(<16 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i16':
; SSE2: Found an estimated cost of 48 for instruction: %ctlz
; SSE2: Found an estimated cost of 40 for instruction: %ctlz
; SSE42: Found an estimated cost of 28 for instruction: %ctlz
; AVX1: Found an estimated cost of 28 for instruction: %ctlz
; AVX2: Found an estimated cost of 14 for instruction: %ctlz
Expand All @@ -313,7 +313,7 @@ define <16 x i16> @var_ctlz_v16i16(<16 x i16> %a) {

define <16 x i16> @var_ctlz_v16i16u(<16 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i16u':
; SSE2: Found an estimated cost of 48 for instruction: %ctlz
; SSE2: Found an estimated cost of 40 for instruction: %ctlz
; SSE42: Found an estimated cost of 28 for instruction: %ctlz
; AVX1: Found an estimated cost of 28 for instruction: %ctlz
; AVX2: Found an estimated cost of 14 for instruction: %ctlz
Expand All @@ -323,7 +323,7 @@ define <16 x i16> @var_ctlz_v16i16u(<16 x i16> %a) {

define <16 x i8> @var_ctlz_v16i8(<16 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i8':
; SSE2: Found an estimated cost of 48 for instruction: %ctlz
; SSE2: Found an estimated cost of 17 for instruction: %ctlz
; SSE42: Found an estimated cost of 9 for instruction: %ctlz
; AVX: Found an estimated cost of 9 for instruction: %ctlz
%ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 0)
Expand All @@ -332,7 +332,7 @@ define <16 x i8> @var_ctlz_v16i8(<16 x i8> %a) {

define <16 x i8> @var_ctlz_v16i8u(<16 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i8u':
; SSE2: Found an estimated cost of 48 for instruction: %ctlz
; SSE2: Found an estimated cost of 17 for instruction: %ctlz
; SSE42: Found an estimated cost of 9 for instruction: %ctlz
; AVX: Found an estimated cost of 9 for instruction: %ctlz
%ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 1)
Expand All @@ -341,7 +341,7 @@ define <16 x i8> @var_ctlz_v16i8u(<16 x i8> %a) {

define <32 x i8> @var_ctlz_v32i8(<32 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v32i8':
; SSE2: Found an estimated cost of 96 for instruction: %ctlz
; SSE2: Found an estimated cost of 34 for instruction: %ctlz
; SSE42: Found an estimated cost of 18 for instruction: %ctlz
; AVX1: Found an estimated cost of 18 for instruction: %ctlz
; AVX2: Found an estimated cost of 9 for instruction: %ctlz
Expand All @@ -351,7 +351,7 @@ define <32 x i8> @var_ctlz_v32i8(<32 x i8> %a) {

define <32 x i8> @var_ctlz_v32i8u(<32 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v32i8u':
; SSE2: Found an estimated cost of 96 for instruction: %ctlz
; SSE2: Found an estimated cost of 34 for instruction: %ctlz
; SSE42: Found an estimated cost of 18 for instruction: %ctlz
; AVX1: Found an estimated cost of 18 for instruction: %ctlz
; AVX2: Found an estimated cost of 9 for instruction: %ctlz
Expand Down
97 changes: 72 additions & 25 deletions test/CodeGen/X86/vec_ctbits.ll
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,42 @@ define <2 x i64> @footz(<2 x i64> %a) nounwind {
define <2 x i64> @foolz(<2 x i64> %a) nounwind {
; CHECK-LABEL: foolz:
; CHECK: # BB#0:
; CHECK-NEXT: movd %xmm0, %rax
; CHECK-NEXT: bsrq %rax, %rax
; CHECK-NEXT: xorq $63, %rax
; CHECK-NEXT: movd %rax, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: movd %xmm0, %rax
; CHECK-NEXT: bsrq %rax, %rax
; CHECK-NEXT: xorq $63, %rax
; CHECK-NEXT: movd %rax, %xmm0
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlq $1, %xmm1
; CHECK-NEXT: por %xmm0, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: psrlq $2, %xmm0
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlq $4, %xmm1
; CHECK-NEXT: por %xmm0, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: psrlq $8, %xmm0
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlq $16, %xmm1
; CHECK-NEXT: por %xmm0, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: psrlq $32, %xmm0
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-NEXT: pxor %xmm0, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: psrlq $1, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: psubq %xmm0, %xmm1
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
; CHECK-NEXT: movdqa %xmm1, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm2
; CHECK-NEXT: psrlq $2, %xmm1
; CHECK-NEXT: pand %xmm0, %xmm1
; CHECK-NEXT: paddq %xmm2, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm2
; CHECK-NEXT: psrlq $4, %xmm2
; CHECK-NEXT: paddq %xmm1, %xmm2
; CHECK-NEXT: pand {{.*}}(%rip), %xmm2
; CHECK-NEXT: pxor %xmm0, %xmm0
; CHECK-NEXT: psadbw %xmm2, %xmm0
; CHECK-NEXT: retq
%c = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true)
ret <2 x i64> %c
Expand Down Expand Up @@ -115,21 +140,43 @@ define <2 x i32> @promlz(<2 x i32> %a) nounwind {
; CHECK-LABEL: promlz:
; CHECK: # BB#0:
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: movd %xmm0, %rax
; CHECK-NEXT: bsrq %rax, %rax
; CHECK-NEXT: movl $127, %ecx
; CHECK-NEXT: cmoveq %rcx, %rax
; CHECK-NEXT: xorq $63, %rax
; CHECK-NEXT: movd %rax, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: movd %xmm0, %rax
; CHECK-NEXT: bsrq %rax, %rax
; CHECK-NEXT: cmoveq %rcx, %rax
; CHECK-NEXT: xorq $63, %rax
; CHECK-NEXT: movd %rax, %xmm0
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; CHECK-NEXT: psubq {{.*}}(%rip), %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: psrlq $1, %xmm2
; CHECK-NEXT: por %xmm0, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: psrlq $2, %xmm0
; CHECK-NEXT: por %xmm2, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: psrlq $4, %xmm2
; CHECK-NEXT: por %xmm0, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: psrlq $8, %xmm0
; CHECK-NEXT: por %xmm2, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: psrlq $16, %xmm2
; CHECK-NEXT: por %xmm0, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: psrlq $32, %xmm0
; CHECK-NEXT: por %xmm2, %xmm0
; CHECK-NEXT: pcmpeqd %xmm2, %xmm2
; CHECK-NEXT: pxor %xmm0, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: psrlq $1, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: psubq %xmm0, %xmm2
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
; CHECK-NEXT: movdqa %xmm2, %xmm3
; CHECK-NEXT: pand %xmm0, %xmm3
; CHECK-NEXT: psrlq $2, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm2
; CHECK-NEXT: paddq %xmm3, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: psrlq $4, %xmm0
; CHECK-NEXT: paddq %xmm2, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: psadbw %xmm1, %xmm0
; CHECK-NEXT: psubq {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
ret <2 x i32> %c
Expand Down
Loading

0 comments on commit 169b408

Please sign in to comment.