[X86][SSE] Prefer to combine shuffles to VZEXT over VZEXT_MOVL.
This matches what is already done during shuffle lowering and helps prevent the need for a zero-vector in cases where shuffles match both patterns.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295723 91177308-0d34-0410-b5e6-96231b3b80d8
RKSimon committed Feb 21, 2017
1 parent 70dcac1 commit d14347a
Showing 2 changed files with 14 additions and 29 deletions.
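To illustrate the overlap described in the commit message, here is a hand-written IR sketch (not part of this commit; the function name @zext_or_movl_sketch and the v4i32 types are chosen purely for illustration). Its mask keeps element 0, zeroes elements 1 and 3, and leaves element 2 undef, so it is simultaneously a valid VZEXT_MOVL pattern (element 0 kept, the rest zero or undef) and a valid zero-extension of the low two i32 elements to i64. With this change the combine should prefer the PMOVZXDQ form and so never needs to materialize a zero vector, mirroring what the updated test below checks for the equivalent PSHUFB mask.

; Sketch only: a shuffle whose mask matches both the VZEXT and VZEXT_MOVL patterns.
; Lane 0 is taken from %a0, lanes 1 and 3 are zero, lane 2 is undef.
define <4 x i32> @zext_or_movl_sketch(<4 x i32> %a0) {
  %res = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 undef, i32 4>
  ret <4 x i32> %res
}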
lib/Target/X86/X86ISelLowering.cpp (9 additions, 9 deletions)
@@ -26387,15 +26387,6 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
   unsigned NumMaskElts = Mask.size();
   unsigned MaskEltSize = MaskVT.getScalarSizeInBits();

-  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
-  if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
-      isUndefOrEqual(Mask[0], 0) &&
-      isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
-    Shuffle = X86ISD::VZEXT_MOVL;
-    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
-    return true;
-  }
-
   // Match against a VZEXT instruction.
   // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
@@ -26421,6 +26412,15 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
     }
   }

+  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
+  if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
+      isUndefOrEqual(Mask[0], 0) &&
+      isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
+    Shuffle = X86ISD::VZEXT_MOVL;
+    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+    return true;
+  }
+
   // Check if we have SSE3 which will let us use MOVDDUP etc. The
   // instructions are no slower than UNPCKLPD but has the option to
   // fold the input operand into even an unaligned memory load.
test/CodeGen/X86/vector-shuffle-combining-sse41.ll (5 additions, 20 deletions)
@@ -8,31 +8,16 @@

 declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)

-; FIXME: We can avoid the zero vector generation if we use PMOVZX instead
 define <16 x i8> @combine_vpshufb_as_movzx(<16 x i8> %a0) {
 ; SSE-LABEL: combine_vpshufb_as_movzx:
 ; SSE: # BB#0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
 ; SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: combine_vpshufb_as_movzx:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_vpshufb_as_movzx:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: combine_vpshufb_as_movzx:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512F-NEXT: retq
+; AVX-LABEL: combine_vpshufb_as_movzx:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT: retq
 %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 undef, i8 undef, i8 -1, i8 -1, i8 -1, i8 -1>)
 ret <16 x i8> %res0
 }
