Skip to content

Commit

Permalink
[X86][AVX2] vpslldq/vpsrldq byte shifts for AVX2
Browse files Browse the repository at this point in the history
This patch refactors the existing lowerVectorShuffleAsByteShift function to add support for 256-bit vectors on AVX2 targets.

It also fixes a tablegen issue that prevented the lowering of vpslldq/vpsrldq vec256 instructions.

Differential Revision: http://reviews.llvm.org/D7596

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@229311 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
RKSimon committed Feb 15, 2015
1 parent 3e93916 commit 28f299b
Show file tree
Hide file tree
Showing 6 changed files with 208 additions and 62 deletions.
135 changes: 73 additions & 62 deletions lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7834,76 +7834,67 @@ static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,

/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ
/// byte-shift instructions. The mask must consist of a shifted sequential
/// shuffle from one of the input vectors and zeroable elements for the
/// remaining 'shifted in' elements.
///
/// Note that this only handles 128-bit vector widths currently.
static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);

int Size = Mask.size();
int Scale = 16 / Size;

for (int Shift = 1; Shift < Size; Shift++) {
int ByteShift = Shift * Scale;

// PSRLDQ : (little-endian) right byte shift
// [ 5, 6, 7, zz, zz, zz, zz, zz]
// [ -1, 5, 6, 7, zz, zz, zz, zz]
// [ 1, 2, -1, -1, -1, -1, zz, zz]
bool ZeroableRight = true;
for (int i = Size - Shift; i < Size; i++) {
ZeroableRight &= Zeroable[i];
}

if (ZeroableRight) {
bool ValidShiftRight1 =
isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
bool ValidShiftRight2 =
isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);

if (ValidShiftRight1 || ValidShiftRight2) {
// Cast the inputs to v2i64 to match PSRLDQ.
SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
DAG.getConstant(ByteShift * 8, MVT::i8));
return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
}
int NumElts = VT.getVectorNumElements();
int NumLanes = VT.getSizeInBits() / 128;
int NumLaneElts = NumElts / NumLanes;
int Scale = 16 / NumLaneElts;
MVT ShiftVT = MVT::getVectorVT(MVT::i64, 2 * NumLanes);

// PSLLDQ : (little-endian) left byte shift
// [ zz, 0, 1, 2, 3, 4, 5, 6]
// [ zz, zz, -1, -1, 2, 3, 4, -1]
// [ zz, zz, zz, zz, zz, zz, -1, 1]
// PSRLDQ : (little-endian) right byte shift
// [ 5, 6, 7, zz, zz, zz, zz, zz]
// [ -1, 5, 6, 7, zz, zz, zz, zz]
// [ 1, 2, -1, -1, -1, -1, zz, zz]
auto MatchByteShift = [&](int Shift) -> SDValue {
bool MatchLeft = true, MatchRight = true;
for (int l = 0; l < NumElts; l += NumLaneElts) {
for (int i = 0; i < Shift; ++i)
MatchLeft &= Zeroable[l + i];
for (int i = NumLaneElts - Shift; i < NumLaneElts; ++i)
MatchRight &= Zeroable[l + i];
}
if (!(MatchLeft || MatchRight))
return SDValue();

// PSLLDQ : (little-endian) left byte shift
// [ zz, 0, 1, 2, 3, 4, 5, 6]
// [ zz, zz, -1, -1, 2, 3, 4, -1]
// [ zz, zz, zz, zz, zz, zz, -1, 1]
bool ZeroableLeft = true;
for (int i = 0; i < Shift; i++) {
ZeroableLeft &= Zeroable[i];
}

if (ZeroableLeft) {
bool ValidShiftLeft1 =
isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
bool ValidShiftLeft2 =
isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);

if (ValidShiftLeft1 || ValidShiftLeft2) {
// Cast the inputs to v2i64 to match PSLLDQ.
SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
DAG.getConstant(ByteShift * 8, MVT::i8));
return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
}
bool MatchV1 = true, MatchV2 = true;
for (int l = 0; l < NumElts; l += NumLaneElts) {
unsigned Pos = MatchLeft ? Shift + l : l;
unsigned Low = MatchLeft ? l : Shift + l;
unsigned Len = NumLaneElts - Shift;
MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + NumElts);
}
}
if (!(MatchV1 || MatchV2))
return SDValue();

int ByteShift = Shift * Scale;
unsigned Op = MatchRight ? X86ISD::VSRLDQ : X86ISD::VSHLDQ;
SDValue V = MatchV1 ? V1 : V2;
V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
V = DAG.getNode(Op, DL, ShiftVT, V,
DAG.getConstant(ByteShift * 8, MVT::i8));
return DAG.getNode(ISD::BITCAST, DL, VT, V);
};

for (int Shift = 1; Shift < NumLaneElts; ++Shift)
if (SDValue S = MatchByteShift(Shift))
return S;

// no match
return SDValue();
}

Expand Down Expand Up @@ -10674,12 +10665,6 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
}

// Use dedicated unpack instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
}

// AVX2 provides a direct instruction for permuting a single input across
Expand All @@ -10688,6 +10673,17 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
getV4X86ShuffleImm8ForMask(Mask, DAG));

// Try to use byte shift instructions.
if (SDValue Shift = lowerVectorShuffleAsByteShift(
DL, MVT::v4i64, V1, V2, Mask, DAG))
return Shift;

// Use dedicated unpack instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);

// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either inputs are already in place,
// we will be able to shuffle even across lanes the other input in a single
Expand Down Expand Up @@ -10863,6 +10859,11 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v8i32, V1, V2, Mask, DAG))
return Shift;

// Try to use byte shift instructions.
if (SDValue Shift = lowerVectorShuffleAsByteShift(
DL, MVT::v8i32, V1, V2, Mask, DAG))
return Shift;

// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
Expand Down Expand Up @@ -10951,6 +10952,11 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v16i16, V1, V2, Mask, DAG))
return Shift;

// Try to use byte shift instructions.
if (SDValue Shift = lowerVectorShuffleAsByteShift(
DL, MVT::v16i16, V1, V2, Mask, DAG))
return Shift;

// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
Expand Down Expand Up @@ -11034,6 +11040,11 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v32i8, V1, V2, Mask, DAG))
return Shift;

// Try to use byte shift instructions.
if (SDValue Shift = lowerVectorShuffleAsByteShift(
DL, MVT::v32i8, V1, V2, Mask, DAG))
return Shift;

// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
Expand Down
6 changes: 6 additions & 0 deletions lib/Target/X86/X86InstrSSE.td
Original file line number Diff line number Diff line change
Expand Up @@ -4296,6 +4296,12 @@ let Predicates = [HasAVX2] in {
(VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
(VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;

// Shift up / down and insert zero's.
def : Pat<(v4i64 (X86vshldq VR256:$src, (i8 imm:$amt))),
(VPSLLDQYri VR256:$src, (BYTE_imm imm:$amt))>;
def : Pat<(v4i64 (X86vshrdq VR256:$src, (i8 imm:$amt))),
(VPSRLDQYri VR256:$src, (BYTE_imm imm:$amt))>;
}

let Predicates = [UseSSE2] in {
Expand Down
34 changes: 34 additions & 0 deletions test/CodeGen/X86/vector-shuffle-256-v16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1349,6 +1349,40 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_2
ret <16 x i16> %shuffle
}

define <16 x i16> @shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24:
; AVX1: # BB#0:
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24:
; AVX2: # BB#0:
; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24>
ret <16 x i16> %shuffle
}

define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0>
ret <16 x i16> %shuffle
}

;
; Shuffle to logical bit shifts
;
Expand Down
34 changes: 34 additions & 0 deletions test/CodeGen/X86/vector-shuffle-256-v32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1627,6 +1627,40 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
ret <32 x i8> %shuffle
}

define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48:
; AVX1: # BB#0:
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48:
; AVX2: # BB#0:
; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 32, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 48>
ret <32 x i8> %shuffle
}

define <32 x i8> @shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 63, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}

;
; Shuffle to logical bit shifts
;
Expand Down
30 changes: 30 additions & 0 deletions test/CodeGen/X86/vector-shuffle-256-v4.ll
Original file line number Diff line number Diff line change
Expand Up @@ -745,6 +745,36 @@ define <4 x i64> @shuffle_v4i64_0415(<4 x i64> %a, <4 x i64> %b) {
ret <4 x i64> %shuffle
}

define <4 x i64> @shuffle_v4i64_z4z6(<4 x i64> %a) {
; AVX1-LABEL: shuffle_v4i64_z4z6:
; AVX1: # BB#0:
; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_z4z6:
; AVX2: # BB#0:
; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23]
; AVX2-NEXT: retq
%shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 0, i32 4, i32 0, i32 6>
ret <4 x i64> %shuffle
}

define <4 x i64> @shuffle_v4i64_5zuz(<4 x i64> %a) {
; AVX1-LABEL: shuffle_v4i64_5zuz:
; AVX1: # BB#0:
; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_5zuz:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: retq
%shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 5, i32 0, i32 undef, i32 0>
ret <4 x i64> %shuffle
}

define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
; ALL-LABEL: stress_test1:
; ALL: retq
Expand Down
31 changes: 31 additions & 0 deletions test/CodeGen/X86/vector-shuffle-256-v8.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1815,6 +1815,37 @@ define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
ret <8 x i32> %shuffle
}

define <8 x i32> @shuffle_v8i32_zuu8zuuc(<8 x i32> %a) {
; AVX1-LABEL: shuffle_v8i32_zuu8zuuc:
; AVX1: # BB#0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_zuu8zuuc:
; AVX2: # BB#0:
; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 8, i32 0, i32 undef, i32 undef, i32 12>
ret <8 x i32> %shuffle
}

define <8 x i32> @shuffle_v8i32_9ubzdefz(<8 x i32> %a) {
; AVX1-LABEL: shuffle_v8i32_9ubzdefz:
; AVX1: # BB#0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm1[2,0],ymm0[5,6],ymm1[6,4]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_9ubzdefz:
; AVX2: # BB#0:
; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,ymm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <8 x i32> <i32 9, i32 undef, i32 11, i32 0, i32 13, i32 14, i32 15, i32 0>
ret <8 x i32> %shuffle
}

define <8 x float> @splat_mem_v8f32_2(float* %p) {
; ALL-LABEL: splat_mem_v8f32_2:
; ALL: # BB#0:
Expand Down

0 comments on commit 28f299b

Please sign in to comment.