diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll index e20ec8221d3d..5e7def9150e9 100644 --- a/test/CodeGen/X86/sse2.ll +++ b/test/CodeGen/X86/sse2.ll @@ -1,16 +1,25 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 + ; Tests for SSE2 and below, without SSE3+. -; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+sse2 -O3 | FileCheck %s define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind { -; CHECK-LABEL: test1: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movapd (%ecx), %xmm0 -; CHECK-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1] -; CHECK-NEXT: movapd %xmm0, (%eax) -; CHECK-NEXT: retl +; X86-LABEL: test1: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movapd (%ecx), %xmm0 +; X86-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; X86-NEXT: movapd %xmm0, (%eax) +; X86-NEXT: retl +; +; X64-LABEL: test1: +; X64: # BB#0: +; X64-NEXT: movapd (%rsi), %xmm1 +; X64-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; X64-NEXT: movapd %xmm1, (%rdi) +; X64-NEXT: retq %tmp3 = load <2 x double>, <2 x double>* %A, align 16 %tmp7 = insertelement <2 x double> undef, double %B, i32 0 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 > @@ -19,14 +28,21 @@ define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind { } define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind { -; CHECK-LABEL: test2: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movapd (%ecx), %xmm0 -; CHECK-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movapd %xmm0, (%eax) -; CHECK-NEXT: retl +; X86-LABEL: test2: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movapd (%ecx), %xmm0 +; X86-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; X86-NEXT: movapd %xmm0, (%eax) +; X86-NEXT: retl +; +; X64-LABEL: test2: +; X64: # BB#0: +; X64-NEXT: movapd (%rsi), %xmm1 +; X64-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; X64-NEXT: movapd %xmm1, (%rdi) +; X64-NEXT: retq %tmp3 = load <2 x double>, <2 x double>* %A, align 16 %tmp7 = insertelement <2 x double> undef, double %B, i32 0 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 > @@ -36,15 +52,22 @@ define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind { define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind { -; CHECK-LABEL: test3: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: movaps (%edx), %xmm0 -; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: movaps %xmm0, (%eax) -; CHECK-NEXT: retl +; X86-LABEL: test3: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movaps (%edx), %xmm0 +; X86-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X86-NEXT: movaps %xmm0, (%eax) +; X86-NEXT: retl +; +; X64-LABEL: test3: +; X64: # BB#0: +; X64-NEXT: movaps (%rsi), %xmm0 +; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X64-NEXT: movaps %xmm0, (%rdi) +; X64-NEXT: retq %tmp = load <4 x float>, <4 x float>* %B ; <<4 x float>> [#uses=2] %tmp3 = load <4 x float>, <4 x float>* %A ; <<4 x float>> [#uses=2] %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0 ; [#uses=1] @@ -60,27 +83,42 @@ define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind } define void @test4(<4 x float> %X, <4 x float>* %res) nounwind { -; CHECK-LABEL: test4: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,3] -; CHECK-NEXT: movaps %xmm0, (%eax) -; CHECK-NEXT: retl +; X86-LABEL: test4: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,3] +; X86-NEXT: movaps %xmm0, (%eax) +; X86-NEXT: retl +; +; X64-LABEL: test4: +; X64: # BB#0: +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,3] +; X64-NEXT: movaps %xmm0, (%rdi) +; X64-NEXT: retq %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] store <4 x float> %tmp5, <4 x float>* %res ret void } define <4 x i32> @test5(i8** %ptr) nounwind { -; CHECK-LABEL: test5: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl (%eax), %eax -; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; CHECK-NEXT: retl +; X86-LABEL: test5: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: pxor %xmm0, %xmm0 +; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-NEXT: retl +; +; X64-LABEL: test5: +; X64: # BB#0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: pxor %xmm0, %xmm0 +; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: retq %tmp = load i8*, i8** %ptr ; [#uses=1] %tmp.upgrd.1 = bitcast i8* %tmp to float* ; [#uses=1] %tmp.upgrd.2 = load float, float* %tmp.upgrd.1 ; [#uses=1] @@ -97,13 +135,19 @@ define <4 x i32> @test5(i8** %ptr) nounwind { } define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind { -; CHECK-LABEL: test6: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movaps (%ecx), %xmm0 -; CHECK-NEXT: movaps %xmm0, (%eax) -; CHECK-NEXT: retl +; X86-LABEL: test6: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movaps (%ecx), %xmm0 +; X86-NEXT: movaps %xmm0, (%eax) +; X86-NEXT: retl +; +; X64-LABEL: test6: +; X64: # BB#0: +; X64-NEXT: movaps (%rsi), %xmm0 +; X64-NEXT: movaps %xmm0, (%rdi) +; X64-NEXT: retq %tmp1 = load <4 x float>, <4 x float>* %A ; <<4 x float>> [#uses=1] %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 > ; <<4 x float>> [#uses=1] store <4 x float> %tmp2, <4 x float>* %res @@ -111,11 +155,17 @@ define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind { } define void @test7() nounwind { -; CHECK-LABEL: test7: -; CHECK: ## BB#0: -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: movaps %xmm0, 0 -; CHECK-NEXT: retl +; X86-LABEL: test7: +; X86: # BB#0: +; X86-NEXT: xorps %xmm0, %xmm0 +; X86-NEXT: movaps %xmm0, 0 +; X86-NEXT: retl +; +; X64-LABEL: test7: +; X64: # BB#0: +; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: movaps %xmm0, 0 +; X64-NEXT: retq bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>>:1 [#uses=1] shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer ; <<4 x float>>:2 [#uses=1] store <4 x float> %2, <4 x float>* null @@ -125,11 +175,15 @@ define void @test7() nounwind { @x = external global [4 x i32] define <2 x i64> @test8() nounwind { -; CHECK-LABEL: test8: -; CHECK: ## BB#0: -; CHECK-NEXT: movl L_x$non_lazy_ptr, %eax -; CHECK-NEXT: movups (%eax), %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: test8: +; X86: # BB#0: +; X86-NEXT: movups x, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test8: +; X64: # BB#0: +; X64-NEXT: movups {{.*}}(%rip), %xmm0 +; X64-NEXT: retq %tmp = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 0) ; [#uses=1] %tmp3 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 1) ; [#uses=1] %tmp5 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 2) ; [#uses=1] @@ -143,10 +197,17 @@ define <2 x i64> @test8() nounwind { } define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind { -; CHECK-LABEL: test9: -; CHECK: ## BB#0: -; CHECK-NEXT: movups {{[0-9]+}}(%esp), %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: test9: +; X86: # BB#0: +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test9: +; X64: # BB#0: +; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X64-NEXT: retq %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1] %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1] %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1] @@ -155,10 +216,17 @@ define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) no } define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind { -; CHECK-LABEL: test10: -; CHECK: ## BB#0: -; CHECK-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: test10: +; X86: # BB#0: +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test10: +; X64: # BB#0: +; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X64-NEXT: retq %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1] %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1] %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1] @@ -167,26 +235,42 @@ define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind { } define <2 x double> @test11(double %a, double %b) nounwind { -; CHECK-LABEL: test11: -; CHECK: ## BB#0: -; CHECK-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: test11: +; X86: # BB#0: +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test11: +; X64: # BB#0: +; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: retq %tmp = insertelement <2 x double> undef, double %a, i32 0 ; <<2 x double>> [#uses=1] %tmp7 = insertelement <2 x double> %tmp, double %b, i32 1 ; <<2 x double>> [#uses=1] ret <2 x double> %tmp7 } define void @test12() nounwind { -; CHECK-LABEL: test12: -; CHECK: ## BB#0: -; CHECK-NEXT: movapd 0, %xmm0 -; CHECK-NEXT: movapd {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; CHECK-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; CHECK-NEXT: xorps %xmm2, %xmm2 -; CHECK-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] -; CHECK-NEXT: addps %xmm1, %xmm2 -; CHECK-NEXT: movaps %xmm2, 0 -; CHECK-NEXT: retl +; X86-LABEL: test12: +; X86: # BB#0: +; X86-NEXT: movapd 0, %xmm0 +; X86-NEXT: movapd {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; X86-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; X86-NEXT: xorps %xmm2, %xmm2 +; X86-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] +; X86-NEXT: addps %xmm1, %xmm2 +; X86-NEXT: movaps %xmm2, 0 +; X86-NEXT: retl +; +; X64-LABEL: test12: +; X64: # BB#0: +; X64-NEXT: movapd 0, %xmm0 +; X64-NEXT: movapd {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; X64-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; X64-NEXT: xorps %xmm2, %xmm2 +; X64-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] +; X64-NEXT: addps %xmm1, %xmm2 +; X64-NEXT: movaps %xmm2, 0 +; X64-NEXT: retq %tmp1 = load <4 x float>, <4 x float>* null ; <<4 x float>> [#uses=2] %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1] %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1] @@ -196,16 +280,24 @@ define void @test12() nounwind { } define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind { -; CHECK-LABEL: test13: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: movaps (%edx), %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1] -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; CHECK-NEXT: movaps %xmm0, (%eax) -; CHECK-NEXT: retl +; X86-LABEL: test13: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movaps (%edx), %xmm0 +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1] +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; X86-NEXT: movaps %xmm0, (%eax) +; X86-NEXT: retl +; +; X64-LABEL: test13: +; X64: # BB#0: +; X64-NEXT: movaps (%rdx), %xmm0 +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1] +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; X64-NEXT: movaps %xmm0, (%rdi) +; X64-NEXT: retq %tmp3 = load <4 x float>, <4 x float>* %B ; <<4 x float>> [#uses=1] %tmp5 = load <4 x float>, <4 x float>* %C ; <<4 x float>> [#uses=1] %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1] @@ -214,17 +306,27 @@ define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x fl } define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind { -; CHECK-LABEL: test14: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movaps (%ecx), %xmm1 -; CHECK-NEXT: movaps (%eax), %xmm2 -; CHECK-NEXT: movaps %xmm2, %xmm0 -; CHECK-NEXT: addps %xmm1, %xmm0 -; CHECK-NEXT: subps %xmm1, %xmm2 -; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; CHECK-NEXT: retl +; X86-LABEL: test14: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movaps (%ecx), %xmm1 +; X86-NEXT: movaps (%eax), %xmm2 +; X86-NEXT: movaps %xmm2, %xmm0 +; X86-NEXT: addps %xmm1, %xmm0 +; X86-NEXT: subps %xmm1, %xmm2 +; X86-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X86-NEXT: retl +; +; X64-LABEL: test14: +; X64: # BB#0: +; X64-NEXT: movaps (%rsi), %xmm1 +; X64-NEXT: movaps (%rdi), %xmm2 +; X64-NEXT: movaps %xmm2, %xmm0 +; X64-NEXT: addps %xmm1, %xmm0 +; X64-NEXT: subps %xmm1, %xmm2 +; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X64-NEXT: retq %tmp = load <4 x float>, <4 x float>* %y ; <<4 x float>> [#uses=2] %tmp5 = load <4 x float>, <4 x float>* %x ; <<4 x float>> [#uses=2] %tmp9 = fadd <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1] @@ -234,13 +336,19 @@ define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind { } define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind { -; CHECK-LABEL: test15: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movapd (%ecx), %xmm0 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; CHECK-NEXT: retl +; X86-LABEL: test15: +; X86: # BB#0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movapd (%ecx), %xmm0 +; X86-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; X86-NEXT: retl +; +; X64-LABEL: test15: +; X64: # BB#0: # %entry +; X64-NEXT: movapd (%rdi), %xmm0 +; X64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; X64-NEXT: retq entry: %tmp = load <4 x float>, <4 x float>* %y ; <<4 x float>> [#uses=1] %tmp3 = load <4 x float>, <4 x float>* %x ; <<4 x float>> [#uses=1] @@ -251,12 +359,18 @@ entry: ; PR8900 define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) { -; CHECK-LABEL: test16: -; CHECK: ## BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movapd 96(%eax), %xmm0 -; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: retl +; X86-LABEL: test16: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movapd 96(%eax), %xmm0 +; X86-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; X86-NEXT: retl +; +; X64-LABEL: test16: +; X64: # BB#0: +; X64-NEXT: movapd 96(%rdi), %xmm0 +; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; X64-NEXT: retq %i5 = getelementptr inbounds <4 x double>, <4 x double>* %srcA, i32 3 %i6 = load <4 x double>, <4 x double>* %i5, align 32 %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> @@ -265,11 +379,17 @@ define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocap ; PR9009 define fastcc void @test17() nounwind { -; CHECK-LABEL: test17: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: movaps {{.*#+}} xmm0 = -; CHECK-NEXT: movaps %xmm0, (%eax) -; CHECK-NEXT: retl +; X86-LABEL: test17: +; X86: # BB#0: # %entry +; X86-NEXT: movaps {{.*#+}} xmm0 = +; X86-NEXT: movaps %xmm0, (%eax) +; X86-NEXT: retl +; +; X64-LABEL: test17: +; X64: # BB#0: # %entry +; X64-NEXT: movaps {{.*#+}} xmm0 = +; X64-NEXT: movaps %xmm0, (%rax) +; X64-NEXT: retq entry: %0 = insertelement <4 x i32> undef, i32 undef, i32 1 %1 = shufflevector <4 x i32> , <4 x i32> %0, <4 x i32> @@ -280,31 +400,52 @@ entry: ; PR9210 define <4 x float> @f(<4 x double>) nounwind { -; CHECK-LABEL: f: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1 -; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0 -; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: retl +; X86-LABEL: f: +; X86: # BB#0: # %entry +; X86-NEXT: cvtpd2ps %xmm1, %xmm1 +; X86-NEXT: cvtpd2ps %xmm0, %xmm0 +; X86-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-NEXT: retl +; +; X64-LABEL: f: +; X64: # BB#0: # %entry +; X64-NEXT: cvtpd2ps %xmm1, %xmm1 +; X64-NEXT: cvtpd2ps %xmm0, %xmm0 +; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: retq entry: %double2float.i = fptrunc <4 x double> %0 to <4 x float> ret <4 x float> %double2float.i } define <2 x i64> @test_insert_64_zext(<2 x i64> %i) { -; CHECK-LABEL: test_insert_64_zext: -; CHECK: ## BB#0: -; CHECK-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero -; CHECK-NEXT: retl +; X86-LABEL: test_insert_64_zext: +; X86: # BB#0: +; X86-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-NEXT: retl +; +; X64-LABEL: test_insert_64_zext: +; X64: # BB#0: +; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X64-NEXT: retq %1 = shufflevector <2 x i64> %i, <2 x i64> , <2 x i32> ret <2 x i64> %1 } define <4 x i32> @PR19721(<4 x i32> %i) { -; CHECK-LABEL: PR19721: -; CHECK: ## BB#0: -; CHECK-NEXT: andps LCPI19_0, %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: PR19721: +; X86: # BB#0: +; X86-NEXT: andps {{\.LCPI.*}}, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: PR19721: +; X64: # BB#0: +; X64-NEXT: movq %xmm0, %rax +; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 +; X64-NEXT: andq %rax, %rcx +; X64-NEXT: movq %rcx, %xmm1 +; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X64-NEXT: retq %bc = bitcast <4 x i32> %i to i128 %insert = and i128 %bc, -4294967296 %bc2 = bitcast i128 %insert to <4 x i32> @@ -312,16 +453,27 @@ define <4 x i32> @PR19721(<4 x i32> %i) { } define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: test_mul: -; CHECK: ## BB#0: -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: retl +; X86-LABEL: test_mul: +; X86: # BB#0: +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-NEXT: pmuludq %xmm1, %xmm0 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-NEXT: pmuludq %xmm2, %xmm1 +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-NEXT: retl +; +; X64-LABEL: test_mul: +; X64: # BB#0: +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X64-NEXT: pmuludq %xmm1, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X64-NEXT: pmuludq %xmm2, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: retq %m = mul <4 x i32> %x, %y ret <4 x i32> %m } diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll index 79b949a6ccb1..1e7b9da6a321 100644 --- a/test/CodeGen/X86/sse3.ll +++ b/test/CodeGen/X86/sse3.ll @@ -1,20 +1,30 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; These are tests for SSE3 codegen. +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=X64 -; RUN: llc < %s -mtriple=x86_64-apple-darwin9 --mattr=+sse3 | FileCheck %s --check-prefix=X64 +; These are tests for SSE3 codegen. ; Test for v8xi16 lowering where we extract the first element of the vector and ; placed it in the second element of the result. define void @t0(<8 x i16>* %dest, <8 x i16>* %old) nounwind { +; X86-LABEL: t0: +; X86: # BB#0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %edx +; X86-NEXT: movd %edx, %xmm0 +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X86-NEXT: movdqa %xmm0, (%eax) +; X86-NEXT: retl +; ; X64-LABEL: t0: -; X64: ## BB#0: ## %entry +; X64: # BB#0: # %entry ; X64-NEXT: movl $1, %eax ; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq -; X64-NEXT: ## -- End function entry: %tmp3 = load <8 x i16>, <8 x i16>* %old %tmp6 = shufflevector <8 x i16> %tmp3, @@ -25,15 +35,25 @@ entry: } define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind { +; X86-LABEL: t1: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,65535] +; X86-NEXT: movaps %xmm0, %xmm1 +; X86-NEXT: andnps (%ecx), %xmm1 +; X86-NEXT: andps (%eax), %xmm0 +; X86-NEXT: orps %xmm1, %xmm0 +; X86-NEXT: retl +; ; X64-LABEL: t1: -; X64: ## BB#0: +; X64: # BB#0: ; X64-NEXT: movaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,65535] ; X64-NEXT: movaps %xmm0, %xmm1 ; X64-NEXT: andnps (%rsi), %xmm1 ; X64-NEXT: andps (%rdi), %xmm0 ; X64-NEXT: orps %xmm1, %xmm0 ; X64-NEXT: retq -; X64-NEXT: ## -- End function %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 > @@ -42,86 +62,131 @@ define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind { } define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind { +; X86-LABEL: t2: +; X86: # BB#0: +; X86-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535] +; X86-NEXT: pand %xmm2, %xmm0 +; X86-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,1,4,5,6,7] +; X86-NEXT: pandn %xmm1, %xmm2 +; X86-NEXT: por %xmm2, %xmm0 +; X86-NEXT: retl +; ; X64-LABEL: t2: -; X64: ## BB#0: +; X64: # BB#0: ; X64-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535] ; X64-NEXT: pand %xmm2, %xmm0 ; X64-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,1,4,5,6,7] ; X64-NEXT: pandn %xmm1, %xmm2 ; X64-NEXT: por %xmm2, %xmm0 ; X64-NEXT: retq -; X64-NEXT: ## -- End function %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 9, i32 1, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7 > ret <8 x i16> %tmp } define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind { +; X86-LABEL: t3: +; X86: # BB#0: +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; X86-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; X86-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; X86-NEXT: retl +; ; X64-LABEL: t3: -; X64: ## BB#0: +; X64: # BB#0: ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; X64-NEXT: retq -; X64-NEXT: ## -- End function %tmp = shufflevector <8 x i16> %A, <8 x i16> %A, <8 x i32> < i32 8, i32 3, i32 2, i32 13, i32 7, i32 6, i32 5, i32 4 > ret <8 x i16> %tmp } define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind { +; X86-LABEL: t4: +; X86: # BB#0: +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; X86-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; X86-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,4,7] +; X86-NEXT: retl +; ; X64-LABEL: t4: -; X64: ## BB#0: +; X64: # BB#0: ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,4,7] ; X64-NEXT: retq -; X64-NEXT: ## -- End function %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 7, i32 2, i32 3, i32 1, i32 5, i32 6, i32 5 > ret <8 x i16> %tmp } define <8 x i16> @t5(<8 x i16> %A, <8 x i16> %B) nounwind { +; X86-LABEL: t5: +; X86: # BB#0: +; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-NEXT: movdqa %xmm1, %xmm0 +; X86-NEXT: retl +; ; X64-LABEL: t5: -; X64: ## BB#0: +; X64: # BB#0: ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq -; X64-NEXT: ## -- End function %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 0, i32 1, i32 10, i32 11, i32 2, i32 3 > ret <8 x i16> %tmp } define <8 x i16> @t6(<8 x i16> %A, <8 x i16> %B) nounwind { +; X86-LABEL: t6: +; X86: # BB#0: +; X86-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X86-NEXT: retl +; ; X64-LABEL: t6: -; X64: ## BB#0: +; X64: # BB#0: ; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; X64-NEXT: retq -; X64-NEXT: ## -- End function %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 > ret <8 x i16> %tmp } define <8 x i16> @t7(<8 x i16> %A, <8 x i16> %B) nounwind { +; X86-LABEL: t7: +; X86: # BB#0: +; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] +; X86-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] +; X86-NEXT: retl +; ; X64-LABEL: t7: -; X64: ## BB#0: +; X64: # BB#0: ; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] ; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] ; X64-NEXT: retq -; X64-NEXT: ## -- End function %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 0, i32 3, i32 2, i32 4, i32 6, i32 4, i32 7 > ret <8 x i16> %tmp } define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind { +; X86-LABEL: t8: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: pshuflw {{.*#+}} xmm0 = mem[2,1,0,3,4,5,6,7] +; X86-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; X86-NEXT: movdqa %xmm0, (%eax) +; X86-NEXT: retl +; ; X64-LABEL: t8: -; X64: ## BB#0: +; X64: # BB#0: ; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[2,1,0,3,4,5,6,7] ; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq -; X64-NEXT: ## -- End function %tmp = load <2 x i64>, <2 x i64>* %A %tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16> %tmp0 = extractelement <8 x i16> %tmp.upgrd.1, i32 0 @@ -146,13 +211,21 @@ define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind { } define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind { +; X86-LABEL: t9: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movapd (%ecx), %xmm0 +; X86-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; X86-NEXT: movapd %xmm0, (%ecx) +; X86-NEXT: retl +; ; X64-LABEL: t9: -; X64: ## BB#0: +; X64: # BB#0: ; X64-NEXT: movapd (%rdi), %xmm0 ; X64-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; X64-NEXT: movapd %xmm0, (%rdi) ; X64-NEXT: retq -; X64-NEXT: ## -- End function %tmp = load <4 x float>, <4 x float>* %r %tmp.upgrd.3 = bitcast <2 x i32>* %A to double* %tmp.upgrd.4 = load double, double* %tmp.upgrd.3 @@ -180,16 +253,21 @@ define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind { @g2 = external constant <4 x i16> define void @t10() nounwind { +; X86-LABEL: t10: +; X86: # BB#0: +; X86-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7] +; X86-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-NEXT: movq %xmm0, g2 +; X86-NEXT: retl +; ; X64-LABEL: t10: -; X64: ## BB#0: -; X64-NEXT: movq _g1@{{.*}}(%rip), %rax +; X64: # BB#0: ; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7] ; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: movq _g2@{{.*}}(%rip), %rax -; X64-NEXT: movq %xmm0, (%rax) +; X64-NEXT: movq %xmm0, {{.*}}(%rip) ; X64-NEXT: retq -; X64-NEXT: ## -- End function load <4 x i32>, <4 x i32>* @g1, align 16 bitcast <4 x i32> %1 to <8 x i16> shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef > @@ -202,12 +280,17 @@ define void @t10() nounwind { ; Pack various elements via shuffles. define <8 x i16> @t11(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +; X86-LABEL: t11: +; X86: # BB#0: # %entry +; X86-NEXT: psrld $16, %xmm0 +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-NEXT: retl +; ; X64-LABEL: t11: -; X64: ## BB#0: ## %entry +; X64: # BB#0: # %entry ; X64-NEXT: psrld $16, %xmm0 ; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-NEXT: retq -; X64-NEXT: ## -- End function entry: %tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > ret <8 x i16> %tmp7 @@ -215,13 +298,19 @@ entry: } define <8 x i16> @t12(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +; X86-LABEL: t12: +; X86: # BB#0: # %entry +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] +; X86-NEXT: retl +; ; X64-LABEL: t12: -; X64: ## BB#0: ## %entry +; X64: # BB#0: # %entry ; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] ; X64-NEXT: retq -; X64-NEXT: ## -- End function entry: %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef > ret <8 x i16> %tmp9 @@ -229,26 +318,38 @@ entry: } define <8 x i16> @t13(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +; X86-LABEL: t13: +; X86: # BB#0: # %entry +; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] +; X86-NEXT: retl +; ; X64-LABEL: t13: -; X64: ## BB#0: ## %entry +; X64: # BB#0: # %entry ; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] ; X64-NEXT: retq -; X64-NEXT: ## -- End function entry: %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef > ret <8 x i16> %tmp9 } define <8 x i16> @t14(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +; X86-LABEL: t14: +; X86: # BB#0: # %entry +; X86-NEXT: psrlq $16, %xmm0 +; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; X86-NEXT: movdqa %xmm1, %xmm0 +; X86-NEXT: retl +; ; X64-LABEL: t14: -; X64: ## BB#0: ## %entry +; X64: # BB#0: # %entry ; X64-NEXT: psrlq $16, %xmm0 ; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq -; X64-NEXT: ## -- End function entry: %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef , i32 undef > ret <8 x i16> %tmp9 @@ -256,13 +357,19 @@ entry: ; FIXME: t15 is worse off from disabling of scheduler 2-address hack. define <8 x i16> @t15(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +; X86-LABEL: t15: +; X86: # BB#0: # %entry +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] +; X86-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-NEXT: retl +; ; X64-LABEL: t15: -; X64: ## BB#0: ## %entry +; X64: # BB#0: # %entry ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] ; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: retq -; X64-NEXT: ## -- End function entry: %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef > ret <8 x i16> %tmp8 @@ -270,13 +377,19 @@ entry: ; Test yonah where we convert a shuffle to pextrw and pinrsw define <16 x i8> @t16(<16 x i8> %T0) nounwind readnone { +; X86-LABEL: t16: +; X86: # BB#0: # %entry +; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-NEXT: movdqa %xmm1, %xmm0 +; X86-NEXT: retl +; ; X64-LABEL: t16: -; X64: ## BB#0: ## %entry +; X64: # BB#0: # %entry ; X64-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0] ; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq -; X64-NEXT: ## -- End function entry: %tmp8 = shufflevector <16 x i8> , <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > %tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 2, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > @@ -285,14 +398,21 @@ entry: ; rdar://8520311 define <4 x i32> @t17() nounwind { +; X86-LABEL: t17: +; X86: # BB#0: # %entry +; X86-NEXT: movaps (%eax), %xmm0 +; X86-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X86-NEXT: pxor %xmm1, %xmm1 +; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-NEXT: retl +; ; X64-LABEL: t17: -; X64: ## BB#0: ## %entry +; X64: # BB#0: # %entry ; X64-NEXT: movaps (%rax), %xmm0 ; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; X64-NEXT: pxor %xmm1, %xmm1 ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-NEXT: retq -; X64-NEXT: ## -- End function entry: %tmp1 = load <4 x float>, <4 x float>* undef, align 16 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32>