X86: Enable SSE memory intrinsics even when stack alignment is less than 16 bytes.

The stack realignment code was fixed to work when both stack realignment and
a dynamic alloca are present, so this shouldn't cause correctness issues anymore.

Note that this also enables generation of AVX instructions for memset
under the following assumptions:
- Unaligned loads/stores are always fast on CPUs supporting AVX
- AVX is not slower than SSE
We may need some tweaked heuristics if one of those assumptions turns out not to
be true.

Effectively reverts r58317. Part of PR2962.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167967 91177308-0d34-0410-b5e6-96231b3b80d8
d0k committed Nov 14, 2012
1 parent 97d19eb commit 2dbe929
Showing 5 changed files with 81 additions and 29 deletions.
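
For context, the kind of source that exercises this change looks roughly like the following. This is a hypothetical illustration written for this page, not code from the commit; the function and names are invented:

// Hypothetical illustration (not from the commit). On i386-pc-mingw32 the
// incoming stack is only 4-byte aligned; before this change the backend
// refused to lower a memset like this one with 16/32-byte vector stores.
// After it, the prologue realigns the stack (andl $-16 or $-32, %esp) and
// the zeroing can become movaps/vmovaps stores, as the new test checks.
void zero_buffer(void (*use)(char *)) {
  char buf[32] = {};  // lowered to a 32-byte memset
  use(buf);           // escape the buffer so the stores aren't removed
}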
9 changes: 2 additions & 7 deletions lib/Target/X86/X86ISelLowering.cpp
@@ -1362,18 +1362,14 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                        bool IsZeroVal,
                                        bool MemcpyStrSrc,
                                        MachineFunction &MF) const {
-  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
-  // linux. This is because the stack realignment code can't handle certain
-  // cases like PR2962. This should be removed when PR2962 is fixed.
   const Function *F = MF.getFunction();
   if (IsZeroVal &&
       !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
     if (Size >= 16 &&
         (Subtarget->isUnalignedMemAccessFast() ||
          ((DstAlign == 0 || DstAlign >= 16) &&
-          (SrcAlign == 0 || SrcAlign >= 16))) &&
-        Subtarget->getStackAlignment() >= 16) {
-      if (Subtarget->getStackAlignment() >= 32) {
+          (SrcAlign == 0 || SrcAlign >= 16)))) {
+      if (Size >= 32) {
         if (Subtarget->hasAVX2())
           return MVT::v8i32;
         if (Subtarget->hasAVX())
@@ -1385,7 +1381,6 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
         return MVT::v4f32;
     } else if (!MemcpyStrSrc && Size >= 8 &&
                !Subtarget->is64Bit() &&
-               Subtarget->getStackAlignment() >= 8 &&
                Subtarget->hasSSE2()) {
       // Do not use f64 to lower memcpy if source is string constant. It's
       // better to use i32 to avoid the loads.
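
Taken together, the hunks above leave a decision that depends only on the operation size, the alignment situation, and the subtarget features. A minimal sketch of the post-patch logic, with invented names and plain parameters standing in for the Subtarget queries (the 256-bit AVX1 return type is elided in the diff, so both AVX paths are merged here):

#include <cstdint>

// Sketch of the post-patch type choice, not the actual LLVM source: all
// Subtarget->getStackAlignment() checks are gone, and the 256-bit path is
// gated on Size >= 32 rather than on a 32-byte-aligned stack.
enum class StoreWidth { Scalar, Vec128, Vec256 };

StoreWidth pickMemsetWidth(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
                           bool FastUnaligned, bool HasSSE1, bool HasAVX) {
  bool AlignOK = (DstAlign == 0 || DstAlign >= 16) &&
                 (SrcAlign == 0 || SrcAlign >= 16);
  if (Size >= 16 && (FastUnaligned || AlignOK)) {
    if (Size >= 32 && HasAVX)
      return StoreWidth::Vec256;  // v8i32 on AVX2; a 256-bit type on AVX1
    if (HasSSE1)
      return StoreWidth::Vec128;  // v4i32 / v4f32, as in the elided branch
  }
  return StoreWidth::Scalar;      // fall back to integer stores
}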
22 changes: 0 additions & 22 deletions test/CodeGen/X86/2008-10-27-StackRealignment.ll

This file was deleted.

1 change: 1 addition & 0 deletions test/CodeGen/X86/memcpy-2.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mattr=+sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE2
+; RUN: llc < %s -mattr=+sse2 -mtriple=i686-pc-mingw32 -mcpu=core2 | FileCheck %s -check-prefix=SSE2
 ; RUN: llc < %s -mattr=+sse,-sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE1
 ; RUN: llc < %s -mattr=-sse -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=NOSSE
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=X86-64
77 changes: 77 additions & 0 deletions test/CodeGen/X86/memset-sse-stack-realignment.ll
@@ -0,0 +1,77 @@
; Make sure that we realign the stack. Mingw32 uses 4 byte stack alignment, we
; need 16 bytes for SSE and 32 bytes for AVX.

; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=pentium2 | FileCheck %s -check-prefix=NOSSE
; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=pentium3 | FileCheck %s -check-prefix=SSE1
; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=yonah | FileCheck %s -check-prefix=SSE2
; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX1
; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=core-avx2 | FileCheck %s -check-prefix=AVX2

define void @test1(i32 %t) nounwind {
%tmp1210 = alloca i8, i32 32, align 4
call void @llvm.memset.p0i8.i64(i8* %tmp1210, i8 0, i64 32, i32 4, i1 false)
%x = alloca i8, i32 %t
call void @dummy(i8* %x)
ret void

; NOSSE: test1:
; NOSSE-NOT: and
; NOSSE: movl $0

; SSE1: test1:
; SSE1: andl $-16
; SSE1: movl %esp, %esi
; SSE1: movaps

; SSE2: test1:
; SSE2: andl $-16
; SSE2: movl %esp, %esi
; SSE2: movaps

; AVX1: test1:
; AVX1: andl $-32
; AVX1: movl %esp, %esi
; AVX1: vmovaps %ymm

; AVX2: test1:
; AVX2: andl $-32
; AVX2: movl %esp, %esi
; AVX2: vmovaps %ymm

}

define void @test2(i32 %t) nounwind {
%tmp1210 = alloca i8, i32 16, align 4
call void @llvm.memset.p0i8.i64(i8* %tmp1210, i8 0, i64 16, i32 4, i1 false)
%x = alloca i8, i32 %t
call void @dummy(i8* %x)
ret void

; NOSSE: test2:
; NOSSE-NOT: and
; NOSSE: movl $0

; SSE1: test2:
; SSE1: andl $-16
; SSE1: movl %esp, %esi
; SSE1: movaps

; SSE2: test2:
; SSE2: andl $-16
; SSE2: movl %esp, %esi
; SSE2: movaps

; AVX1: test2:
; AVX1: andl $-16
; AVX1: movl %esp, %esi
; AVX1: vmovaps %xmm

; AVX2: test2:
; AVX2: andl $-16
; AVX2: movl %esp, %esi
; AVX2: vmovaps %xmm
}

declare void @dummy(i8*)

declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
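
A note on the andl $-16 / andl $-32 instructions the CHECK lines above match: they are the realignment itself. Masking the stack pointer with the negated alignment clears its low bits and rounds it down to the required boundary. A small illustrative sketch of that arithmetic (not code from the commit):

#include <cassert>
#include <cstdint>

// andl $-16, %esp is this operation with align == 16: in two's complement
// -16 is ~15, so the AND clears the low four bits and rounds the stack
// pointer down to the next 16-byte boundary. The AVX cases use -32.
uintptr_t alignDown(uintptr_t sp, uintptr_t align) {
  assert((align & (align - 1)) == 0 && "alignment must be a power of two");
  return sp & ~(align - 1);
}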
1 change: 1 addition & 0 deletions test/CodeGen/X86/memset64-on-x86-32.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 5
 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movl | count 20
+; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=core2 | grep movl | count 20
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | grep movq | count 10
 
 define void @bork() nounwind {
