From 2dbe929685da37e904d6bb0c5a3504e1bafe348f Mon Sep 17 00:00:00 2001
From: Benjamin Kramer
Date: Wed, 14 Nov 2012 20:08:40 +0000
Subject: [PATCH] X86: Enable SSE memory intrinsics even when stack alignment
 is less than 16 bytes.

The stack realignment code was fixed to work when stack realignment and a
dynamic alloca are both present, so this shouldn't cause correctness issues
anymore.

Note that this also enables generation of AVX instructions for memset under
the assumptions:
- Unaligned loads/stores are always fast on CPUs supporting AVX
- AVX is not slower than SSE
We may need some tweaked heuristics if one of those assumptions turns out not
to be true.

Effectively reverts r58317. Part of PR2962.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167967 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp            |  9 +--
 .../X86/2008-10-27-StackRealignment.ll        | 22 ------
 test/CodeGen/X86/memcpy-2.ll                  |  1 +
 .../X86/memset-sse-stack-realignment.ll       | 77 +++++++++++++++++++
 test/CodeGen/X86/memset64-on-x86-32.ll        |  1 +
 5 files changed, 81 insertions(+), 29 deletions(-)
 delete mode 100644 test/CodeGen/X86/2008-10-27-StackRealignment.ll
 create mode 100644 test/CodeGen/X86/memset-sse-stack-realignment.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 863379ec5af4..53a095f7180f 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1362,18 +1362,14 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                        bool IsZeroVal,
                                        bool MemcpyStrSrc,
                                        MachineFunction &MF) const {
-  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
-  // linux. This is because the stack realignment code can't handle certain
-  // cases like PR2962. This should be removed when PR2962 is fixed.
   const Function *F = MF.getFunction();
   if (IsZeroVal &&
       !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
     if (Size >= 16 &&
         (Subtarget->isUnalignedMemAccessFast() ||
          ((DstAlign == 0 || DstAlign >= 16) &&
-          (SrcAlign == 0 || SrcAlign >= 16))) &&
-        Subtarget->getStackAlignment() >= 16) {
-      if (Subtarget->getStackAlignment() >= 32) {
+          (SrcAlign == 0 || SrcAlign >= 16)))) {
+      if (Size >= 32) {
         if (Subtarget->hasAVX2())
           return MVT::v8i32;
         if (Subtarget->hasAVX())
@@ -1385,7 +1381,6 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
       return MVT::v4f32;
     } else if (!MemcpyStrSrc && Size >= 8 &&
                !Subtarget->is64Bit() &&
-               Subtarget->getStackAlignment() >= 8 &&
                Subtarget->hasSSE2()) {
       // Do not use f64 to lower memcpy if source is string constant. It's
       // better to use i32 to avoid the loads.
diff --git a/test/CodeGen/X86/2008-10-27-StackRealignment.ll b/test/CodeGen/X86/2008-10-27-StackRealignment.ll
deleted file mode 100644
index a57f7166cadc..000000000000
--- a/test/CodeGen/X86/2008-10-27-StackRealignment.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; Linux doesn't support stack realignment for functions with allocas (PR2888).
-; Until it does, we shouldn't use movaps to access the stack. On targets with
-; sufficiently aligned stack (e.g. darwin) we should.
-; PR8969 - make 32-bit linux have a 16-byte aligned stack
-; RUN: llc < %s -mtriple=i386-pc-linux-gnu -mcpu=yonah | grep movaps | count 2
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mcpu=yonah | grep movaps | count 2
-
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
-target triple = "i386-pc-linux-gnu"
-
-define void @foo(i32 %t) nounwind {
-  %tmp1210 = alloca i8, i32 32, align 4
-  call void @llvm.memset.p0i8.i64(i8* %tmp1210, i8 0, i64 32, i32 4, i1 false)
-  %x = alloca i8, i32 %t
-  call void @dummy(i8* %x)
-  ret void
-}
-
-declare void @dummy(i8*)
-
-declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
diff --git a/test/CodeGen/X86/memcpy-2.ll b/test/CodeGen/X86/memcpy-2.ll
index eae2e708349c..7a2bbc4ef0fb 100644
--- a/test/CodeGen/X86/memcpy-2.ll
+++ b/test/CodeGen/X86/memcpy-2.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mattr=+sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE2
+; RUN: llc < %s -mattr=+sse2 -mtriple=i686-pc-mingw32 -mcpu=core2 | FileCheck %s -check-prefix=SSE2
 ; RUN: llc < %s -mattr=+sse,-sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE1
 ; RUN: llc < %s -mattr=-sse -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=NOSSE
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=X86-64
diff --git a/test/CodeGen/X86/memset-sse-stack-realignment.ll b/test/CodeGen/X86/memset-sse-stack-realignment.ll
new file mode 100644
index 000000000000..df9de5dfaf22
--- /dev/null
+++ b/test/CodeGen/X86/memset-sse-stack-realignment.ll
@@ -0,0 +1,77 @@
+; Make sure that we realign the stack. Mingw32 uses 4-byte stack alignment; we
+; need 16 bytes for SSE and 32 bytes for AVX.
+ +; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=pentium2 | FileCheck %s -check-prefix=NOSSE +; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=pentium3 | FileCheck %s -check-prefix=SSE1 +; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=yonah | FileCheck %s -check-prefix=SSE2 +; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX1 +; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=core-avx2 | FileCheck %s -check-prefix=AVX2 + +define void @test1(i32 %t) nounwind { + %tmp1210 = alloca i8, i32 32, align 4 + call void @llvm.memset.p0i8.i64(i8* %tmp1210, i8 0, i64 32, i32 4, i1 false) + %x = alloca i8, i32 %t + call void @dummy(i8* %x) + ret void + +; NOSSE: test1: +; NOSSE-NOT: and +; NOSSE: movl $0 + +; SSE1: test1: +; SSE1: andl $-16 +; SSE1: movl %esp, %esi +; SSE1: movaps + +; SSE2: test1: +; SSE2: andl $-16 +; SSE2: movl %esp, %esi +; SSE2: movaps + +; AVX1: test1: +; AVX1: andl $-32 +; AVX1: movl %esp, %esi +; AVX1: vmovaps %ymm + +; AVX2: test1: +; AVX2: andl $-32 +; AVX2: movl %esp, %esi +; AVX2: vmovaps %ymm + +} + +define void @test2(i32 %t) nounwind { + %tmp1210 = alloca i8, i32 16, align 4 + call void @llvm.memset.p0i8.i64(i8* %tmp1210, i8 0, i64 16, i32 4, i1 false) + %x = alloca i8, i32 %t + call void @dummy(i8* %x) + ret void + +; NOSSE: test2: +; NOSSE-NOT: and +; NOSSE: movl $0 + +; SSE1: test2: +; SSE1: andl $-16 +; SSE1: movl %esp, %esi +; SSE1: movaps + +; SSE2: test2: +; SSE2: andl $-16 +; SSE2: movl %esp, %esi +; SSE2: movaps + +; AVX1: test2: +; AVX1: andl $-16 +; AVX1: movl %esp, %esi +; AVX1: vmovaps %xmm + +; AVX2: test2: +; AVX2: andl $-16 +; AVX2: movl %esp, %esi +; AVX2: vmovaps %xmm +} + +declare void @dummy(i8*) + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind diff --git a/test/CodeGen/X86/memset64-on-x86-32.ll b/test/CodeGen/X86/memset64-on-x86-32.ll index e20fce172f27..8cfa032797f7 100644 --- a/test/CodeGen/X86/memset64-on-x86-32.ll +++ b/test/CodeGen/X86/memset64-on-x86-32.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 5 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movl | count 20 +; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=core2 | grep movl | count 20 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | grep movq | count 10 define void @bork() nounwind {
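
For readers following the logic change, the post-patch selection in
getOptimalMemOpType() boils down to the standalone sketch below. This is a
simplified model, not the actual LLVM code: the Subtarget struct, pickType()
and the driver are invented for illustration, and the v8f32/v4i32/v4f32
returns that fall outside the hunk context above are filled in from the
surrounding function as it stood at this revision. A DstAlign of 0 models the
"alignment may be raised" case SelectionDAG passes for fresh stack objects,
which is what lets a 4-byte-aligned mingw32 slot still get vector stores.

#include <cstdint>
#include <cstdio>

// Illustrative stand-in for the relevant X86Subtarget queries.
struct Subtarget {
  bool UnalignedFast; // isUnalignedMemAccessFast()
  bool AVX2, AVX, SSE2, SSE1;
};

// Distilled zero-val (memset) path. The old getStackAlignment() >= 16/32
// guards are gone: the 16- vs. 32-byte vector choice now keys off Size and
// the available ISA, and the stack gets realigned later if a slot needs
// more alignment than the ABI guarantees.
const char *pickType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
                     const Subtarget &ST) {
  if (Size >= 16 &&
      (ST.UnalignedFast ||
       ((DstAlign == 0 || DstAlign >= 16) &&
        (SrcAlign == 0 || SrcAlign >= 16)))) {
    if (Size >= 32) { // was: getStackAlignment() >= 32
      if (ST.AVX2) return "v8i32";
      if (ST.AVX)  return "v8f32";
    }
    if (ST.SSE2) return "v4i32";
    if (ST.SSE1) return "v4f32";
  }
  return "i32"; // scalar fallback, as in the NOSSE test above
}

int main() {
  Subtarget yonah = {false, false, false, true, true};
  Subtarget corei7avx = {true, false, true, true, true};
  // 32-byte memset into a realignable stack slot (DstAlign == 0):
  std::printf("yonah:      %s\n", pickType(32, 0, 0, yonah));     // v4i32
  std::printf("corei7-avx: %s\n", pickType(32, 0, 0, corei7avx)); // v8f32
}

If either AVX assumption stated in the commit message turns out to be false
on some CPU, the Size >= 32 branch is where a tweaked heuristic would go.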