X86: Enable SSE memory intrinsics even when stack alignment is less than 16 bytes.

The stack realignment code was fixed to work when both stack realignment and
a dynamic alloca are present, so this shouldn't cause correctness issues anymore.

Note that this also enables generation of AVX instructions for memset
under the following assumptions:
- Unaligned loads/stores are always fast on CPUs supporting AVX
- AVX is not slower than SSE
We may need some tweaked heuristics if one of those assumptions turns out not to
be true.

Effectively reverts r58317. Part of PR2962.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167967 91177308-0d34-0410-b5e6-96231b3b80d8
d0k committed Nov 14, 2012
1 parent 97d19eb commit 2dbe929
Showing 5 changed files with 81 additions and 29 deletions.
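
For context, the kind of source that exercises this change looks roughly like the following. This is a hypothetical illustration written for this page, not code from the commit; the function and names are invented:

// Hypothetical illustration (not from the commit). On i386-pc-mingw32 the
// incoming stack is only 4-byte aligned; before this change the backend
// refused to lower a memset like this one with 16/32-byte vector stores.
// After it, the prologue realigns the stack (andl $-16 or $-32, %esp) and
// the zeroing can become movaps/vmovaps stores, as the new test checks.
void zero_buffer(void (*use)(char *)) {
  char buf[32] = {};  // lowered to a 32-byte memset
  use(buf);           // escape the buffer so the stores aren't removed
}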
9 changes: 2 additions & 7 deletions lib/Target/X86/X86ISelLowering.cpp
@@ -1362,18 +1362,14 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                        bool IsZeroVal,
                                        bool MemcpyStrSrc,
                                        MachineFunction &MF) const {
-  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
-  // linux. This is because the stack realignment code can't handle certain
-  // cases like PR2962. This should be removed when PR2962 is fixed.
   const Function *F = MF.getFunction();
   if (IsZeroVal &&
       !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
     if (Size >= 16 &&
         (Subtarget->isUnalignedMemAccessFast() ||
          ((DstAlign == 0 || DstAlign >= 16) &&
-          (SrcAlign == 0 || SrcAlign >= 16))) &&
-        Subtarget->getStackAlignment() >= 16) {
-      if (Subtarget->getStackAlignment() >= 32) {
+          (SrcAlign == 0 || SrcAlign >= 16)))) {
+      if (Size >= 32) {
         if (Subtarget->hasAVX2())
           return MVT::v8i32;
         if (Subtarget->hasAVX())
@@ -1385,7 +1381,6 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
         return MVT::v4f32;
     } else if (!MemcpyStrSrc && Size >= 8 &&
                !Subtarget->is64Bit() &&
-               Subtarget->getStackAlignment() >= 8 &&
                Subtarget->hasSSE2()) {
       // Do not use f64 to lower memcpy if source is string constant. It's
       // better to use i32 to avoid the loads.
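
Taken together, the hunks above leave a decision that depends only on the operation size, the alignment situation, and the subtarget features. A minimal sketch of the post-patch logic, with invented names and plain parameters standing in for the Subtarget queries (the 256-bit AVX1 return type is elided in the diff, so both AVX paths are merged here):

#include <cstdint>

// Sketch of the post-patch type choice, not the actual LLVM source: all
// Subtarget->getStackAlignment() checks are gone, and the 256-bit path is
// gated on Size >= 32 rather than on a 32-byte-aligned stack.
enum class StoreWidth { Scalar, Vec128, Vec256 };

StoreWidth pickMemsetWidth(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
                           bool FastUnaligned, bool HasSSE1, bool HasAVX) {
  bool AlignOK = (DstAlign == 0 || DstAlign >= 16) &&
                 (SrcAlign == 0 || SrcAlign >= 16);
  if (Size >= 16 && (FastUnaligned || AlignOK)) {
    if (Size >= 32 && HasAVX)
      return StoreWidth::Vec256;  // v8i32 on AVX2; a 256-bit type on AVX1
    if (HasSSE1)
      return StoreWidth::Vec128;  // v4i32 / v4f32, as in the elided branch
  }
  return StoreWidth::Scalar;      // fall back to integer stores
}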
22 changes: 0 additions & 22 deletions test/CodeGen/X86/2008-10-27-StackRealignment.ll

This file was deleted.

1 change: 1 addition & 0 deletions test/CodeGen/X86/memcpy-2.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mattr=+sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE2
+; RUN: llc < %s -mattr=+sse2 -mtriple=i686-pc-mingw32 -mcpu=core2 | FileCheck %s -check-prefix=SSE2
 ; RUN: llc < %s -mattr=+sse,-sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE1
 ; RUN: llc < %s -mattr=-sse -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=NOSSE
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=X86-64
77 changes: 77 additions & 0 deletions test/CodeGen/X86/memset-sse-stack-realignment.ll
@@ -0,0 +1,77 @@
; Make sure that we realign the stack. Mingw32 uses 4 byte stack alignment, we
; need 16 bytes for SSE and 32 bytes for AVX.

; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=pentium2 | FileCheck %s -check-prefix=NOSSE
; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=pentium3 | FileCheck %s -check-prefix=SSE1
; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=yonah | FileCheck %s -check-prefix=SSE2
; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX1
; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=core-avx2 | FileCheck %s -check-prefix=AVX2

define void @test1(i32 %t) nounwind {
%tmp1210 = alloca i8, i32 32, align 4
call void @llvm.memset.p0i8.i64(i8* %tmp1210, i8 0, i64 32, i32 4, i1 false)
%x = alloca i8, i32 %t
call void @dummy(i8* %x)
ret void

; NOSSE: test1:
; NOSSE-NOT: and
; NOSSE: movl $0

; SSE1: test1:
; SSE1: andl $-16
; SSE1: movl %esp, %esi
; SSE1: movaps

; SSE2: test1:
; SSE2: andl $-16
; SSE2: movl %esp, %esi
; SSE2: movaps

; AVX1: test1:
; AVX1: andl $-32
; AVX1: movl %esp, %esi
; AVX1: vmovaps %ymm

; AVX2: test1:
; AVX2: andl $-32
; AVX2: movl %esp, %esi
; AVX2: vmovaps %ymm

}

define void @test2(i32 %t) nounwind {
%tmp1210 = alloca i8, i32 16, align 4
call void @llvm.memset.p0i8.i64(i8* %tmp1210, i8 0, i64 16, i32 4, i1 false)
%x = alloca i8, i32 %t
call void @dummy(i8* %x)
ret void

; NOSSE: test2:
; NOSSE-NOT: and
; NOSSE: movl $0

; SSE1: test2:
; SSE1: andl $-16
; SSE1: movl %esp, %esi
; SSE1: movaps

; SSE2: test2:
; SSE2: andl $-16
; SSE2: movl %esp, %esi
; SSE2: movaps

; AVX1: test2:
; AVX1: andl $-16
; AVX1: movl %esp, %esi
; AVX1: vmovaps %xmm

; AVX2: test2:
; AVX2: andl $-16
; AVX2: movl %esp, %esi
; AVX2: vmovaps %xmm
}

declare void @dummy(i8*)

declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
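
A note on the andl $-16 / andl $-32 instructions the CHECK lines above match: they are the realignment itself. Masking the stack pointer with the negated alignment clears its low bits and rounds it down to the required boundary. A small illustrative sketch of that arithmetic (not code from the commit):

#include <cassert>
#include <cstdint>

// andl $-16, %esp is this operation with align == 16: in two's complement
// -16 is ~15, so the AND clears the low four bits and rounds the stack
// pointer down to the next 16-byte boundary. The AVX cases use -32.
uintptr_t alignDown(uintptr_t sp, uintptr_t align) {
  assert((align & (align - 1)) == 0 && "alignment must be a power of two");
  return sp & ~(align - 1);
}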
1 change: 1 addition & 0 deletions test/CodeGen/X86/memset64-on-x86-32.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 5
 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movl | count 20
+; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=core2 | grep movl | count 20
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | grep movq | count 10
 
 define void @bork() nounwind {
