Skip to content

Commit

Permalink
[X86][SSE] Support for MOVMSK signbit extraction instructions
Browse files Browse the repository at this point in the history
Add support for lowering with the MOVMSK instruction to extract vector element signbits to a GPR.

This is an early step towards more optimal handling of vector comparison results.

Differential Revision: http://reviews.llvm.org/D18741

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@265266 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
RKSimon committed Apr 3, 2016
1 parent ad9ddc5 commit baf1e7a
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 45 deletions.
31 changes: 4 additions & 27 deletions lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21888,6 +21888,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::AND: return "X86ISD::AND";
case X86ISD::BEXTR: return "X86ISD::BEXTR";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
case X86ISD::PTEST: return "X86ISD::PTEST";
case X86ISD::TESTP: return "X86ISD::TESTP";
case X86ISD::TESTM: return "X86ISD::TESTM";
Expand Down Expand Up @@ -24018,33 +24019,9 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
case X86ISD::SETCC:
KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
break;
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
unsigned NumLoBits = 0;
switch (IntId) {
default: break;
case Intrinsic::x86_sse_movmsk_ps:
case Intrinsic::x86_avx_movmsk_ps_256:
case Intrinsic::x86_sse2_movmsk_pd:
case Intrinsic::x86_avx_movmsk_pd_256:
case Intrinsic::x86_mmx_pmovmskb:
case Intrinsic::x86_sse2_pmovmskb_128:
case Intrinsic::x86_avx2_pmovmskb: {
// High bits of movmskp{s|d}, pmovmskb are known zero.
switch (IntId) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break;
case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break;
case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break;
case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break;
case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break;
case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break;
case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break;
}
KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
break;
}
}
case X86ISD::MOVMSK: {
// Only the low NumLoBits bits of the result can be set, where NumLoBits is
// the element count of the vector operand; every bit above that is known zero.
unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
break;
}
}
Expand Down
3 changes: 3 additions & 0 deletions lib/Target/X86/X86ISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,9 @@ namespace llvm {
// X86-specific multiply by immediate.
MUL_IMM,

// Vector sign bit extraction.
MOVMSK,

// Vector bitwise comparisons.
PTEST,

Expand Down
3 changes: 3 additions & 0 deletions lib/Target/X86/X86InstrFragmentsSIMD.td
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,9 @@ def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>;
def X86testm : SDNode<"X86ISD::TESTM", SDTX86Testm, [SDNPCommutative]>;
def X86testnm : SDNode<"X86ISD::TESTNM", SDTX86Testm, [SDNPCommutative]>;

// SelectionDAG node for X86ISD::MOVMSK. The type profile declares one result
// and one operand: the result is constrained to i32 and the operand must be a
// vector type.
def X86movmsk : SDNode<"X86ISD::MOVMSK",
SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVec<1>]>>;

def X86select : SDNode<"X86ISD::SELECT" , SDTSelect>;

def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
Expand Down
34 changes: 16 additions & 18 deletions lib/Target/X86/X86InstrSSE.td
Original file line number Diff line number Diff line change
Expand Up @@ -2765,25 +2765,23 @@ let Predicates = [HasAVX1Only] in {
//===----------------------------------------------------------------------===//

/// sse12_extr_sign_mask - sse 1 & 2 extract packed sign mask (movmskps/movmskpd)
multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
Domain d> {
multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
string asm, Domain d> {
def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(set GR32orGR64:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>,
[(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], IIC_SSE_MOVMSK, d>,
Sched<[WriteVecLogic]>;
}

let Predicates = [HasAVX] in {
defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
"movmskps", SSEPackedSingle>, PS, VEX;
defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
"movmskpd", SSEPackedDouble>, PD, VEX;
defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
"movmskps", SSEPackedSingle>, PS,
VEX, VEX_L;
defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
"movmskpd", SSEPackedDouble>, PD,
VEX, VEX_L;
defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
SSEPackedSingle>, PS, VEX;
defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
SSEPackedDouble>, PD, VEX;
defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
SSEPackedSingle>, PS, VEX, VEX_L;
defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
SSEPackedDouble>, PD, VEX, VEX_L;

def : Pat<(i32 (X86fgetsign FR32:$src)),
(VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
Expand All @@ -2797,9 +2795,9 @@ let Predicates = [HasAVX] in {
(VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>;
}

defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
SSEPackedSingle>, PS;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
SSEPackedDouble>, PD;

def : Pat<(i32 (X86fgetsign FR32:$src)),
Expand Down Expand Up @@ -4665,20 +4663,20 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(ins VR128:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
[(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
[(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
IIC_SSE_MOVMSK>, VEX;

let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(ins VR256:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
[(set GR32orGR64:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>,
[(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
VEX, VEX_L;
}

def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
[(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
[(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
IIC_SSE_MOVMSK>;

} // ExeDomain = SSEPackedInt
Expand Down
6 changes: 6 additions & 0 deletions lib/Target/X86/X86IntrinsicsInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx_max_ps_256, INTR_TYPE_2OP, X86ISD::FMAX, 0),
X86_INTRINSIC_DATA(avx_min_pd_256, INTR_TYPE_2OP, X86ISD::FMIN, 0),
X86_INTRINSIC_DATA(avx_min_ps_256, INTR_TYPE_2OP, X86ISD::FMIN, 0),
X86_INTRINSIC_DATA(avx_movmsk_pd_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(avx_movmsk_ps_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(avx_rcp_ps_256, INTR_TYPE_1OP, X86ISD::FRCP, 0),
X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
Expand Down Expand Up @@ -354,6 +356,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0),
X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, ISD::UMIN, 0),
X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, ISD::UMIN, 0),
X86_INTRINSIC_DATA(avx2_pmovmskb, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(avx2_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
X86_INTRINSIC_DATA(avx2_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
X86_INTRINSIC_DATA(avx2_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
Expand Down Expand Up @@ -2184,6 +2187,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE),
X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0),
X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0),
X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0),
X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0),
Expand All @@ -2201,6 +2205,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE),
X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0),
X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0),
X86_INTRINSIC_DATA(sse2_movmsk_pd, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
Expand All @@ -2210,6 +2215,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0),
X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0),
X86_INTRINSIC_DATA(sse2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0),
X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
Expand Down

0 comments on commit baf1e7a

Please sign in to comment.