Skip to content

Commit

Permalink
LoadStoreVectorizer: Split even sized illegal chains properly
Browse files Browse the repository at this point in the history
Implement isLegalToVectorizeLoadChain for AMDGPU to avoid
producing private address spaces accesses that will need to be
split up later. This was doing the wrong thing in the case
where the queried chain was an even number of elements.

A possible <4 x i32> store was being split into
store <2 x i32>
store i32
store i32

rather than
store <2 x i32>
store <2 x i32>

when legal.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295933 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
arsenm committed Feb 23, 2017
1 parent 5990032 commit 210095c
Show file tree
Hide file tree
Showing 5 changed files with 227 additions and 9 deletions.
25 changes: 25 additions & 0 deletions lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,31 @@ unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
}
}

bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
unsigned Alignment,
unsigned AddrSpace) const {
// We allow vectorization of flat stores, even though we may need to decompose
// them later if they may access private memory. We don't have enough context
// here, and legalization can handle it.
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
ChainSizeInBytes <= ST->getMaxPrivateElementSize();
}
return true;
}

bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
unsigned Alignment,
unsigned AddrSpace) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
unsigned Alignment,
unsigned AddrSpace) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
// Semi-arbitrary large amount.
return 64;
Expand Down
11 changes: 11 additions & 0 deletions lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,17 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector);
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;

bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
unsigned Alignment,
unsigned AddrSpace) const;
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
unsigned Alignment,
unsigned AddrSpace) const;
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
unsigned Alignment,
unsigned AddrSpace) const;

unsigned getMaxInterleaveFactor(unsigned VF);

int getArithmeticInstrCost(
Expand Down
9 changes: 6 additions & 3 deletions lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -432,9 +432,12 @@ Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
unsigned ElementSizeBytes = ElementSizeBits / 8;
unsigned SizeBytes = ElementSizeBytes * Chain.size();
unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
if (NumLeft == Chain.size())
--NumLeft;
else if (NumLeft == 0)
if (NumLeft == Chain.size()) {
if ((NumLeft & 1) == 0)
NumLeft /= 2; // Split even in half
else
--NumLeft; // Split off last element
} else if (NumLeft == 0)
NumLeft = 1;
return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,10 @@ define void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32
; ALL: alloca [128 x i32], align 16

; UNALIGNED: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 1{{$}}
; ALIGNED: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 4{{$}}

; FIXME: Should change alignment
; ALIGNED: load i32
; ALIGNED: load i32
define void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
%alloca = alloca [128 x i32], align 16
%ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
Expand Down
186 changes: 181 additions & 5 deletions test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ALIGNED,ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ALIGNED,ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT8-UNALIGNED -check-prefix=ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ALIGNED,ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT16-UNALIGNED -check-prefix=ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-ALIGNED,ALIGNED,ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-ALIGNED,ALIGNED,ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-ALIGNED,ALIGNED,ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-UNALIGNED,UNALIGNED,ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-UNALIGNED,UNALIGNED,ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-UNALIGNED,UNALIGNED,ALL %s

target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

Expand All @@ -28,6 +29,60 @@ define void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 {
ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align1(
; ALIGNED: store i32 9, i32* %out, align 1
; ALIGNED: store i32 1, i32* %out.gep.1, align 1
; ALIGNED: store i32 23, i32* %out.gep.2, align 1
; ALIGNED: store i32 19, i32* %out.gep.3, align 1

; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32>* %1, align 1

; ELT8-UNALIGNED: store <2 x i32> <i32 9, i32 1>, <2 x i32>* %1, align 1
; ELT8-UNALIGNED: store <2 x i32> <i32 23, i32 19>, <2 x i32>* %2, align 1

; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32
define void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32* %out) #0 {
%out.gep.1 = getelementptr i32, i32* %out, i32 1
%out.gep.2 = getelementptr i32, i32* %out, i32 2
%out.gep.3 = getelementptr i32, i32* %out, i32 3

store i32 9, i32* %out, align 1
store i32 1, i32* %out.gep.1, align 1
store i32 23, i32* %out.gep.2, align 1
store i32 19, i32* %out.gep.3, align 1
ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align2(
; ALIGNED: store i32 9, i32* %out, align 2
; ALIGNED: store i32 1, i32* %out.gep.1, align 2
; ALIGNED: store i32 23, i32* %out.gep.2, align 2
; ALIGNED: store i32 19, i32* %out.gep.3, align 2

; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32>* %1, align 2

; ELT8-UNALIGNED: store <2 x i32>
; ELT8-UNALIGNED: store <2 x i32>

; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32
define void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32* %out) #0 {
%out.gep.1 = getelementptr i32, i32* %out, i32 1
%out.gep.2 = getelementptr i32, i32* %out, i32 2
%out.gep.3 = getelementptr i32, i32* %out, i32 3

store i32 9, i32* %out, align 2
store i32 1, i32* %out.gep.1, align 2
store i32 23, i32* %out.gep.2, align 2
store i32 19, i32* %out.gep.3, align 2
ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
; ALL: store <4 x i8>
define void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 {
Expand All @@ -42,6 +97,25 @@ define void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 {
ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8_align1(
; ALIGNED: store i8
; ALIGNED: store i8
; ALIGNED: store i8
; ALIGNED: store i8

; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8>* %1, align 1
define void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8* %out) #0 {
%out.gep.1 = getelementptr i8, i8* %out, i32 1
%out.gep.2 = getelementptr i8, i8* %out, i32 2
%out.gep.3 = getelementptr i8, i8* %out, i32 3

store i8 9, i8* %out, align 1
store i8 1, i8* %out.gep.1, align 1
store i8 23, i8* %out.gep.2, align 1
store i8 19, i8* %out.gep.3, align 1
ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16(
; ALL: store <2 x i16>
define void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 {
Expand All @@ -52,4 +126,106 @@ define void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 {
ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align2(
; ALIGNED: store i16
; ALIGNED: store i16

; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 2
define void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16* %out) #0 {
%out.gep.1 = getelementptr i16, i16* %out, i32 1

store i16 9, i16* %out, align 2
store i16 12, i16* %out.gep.1, align 2
ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align1(
; ALIGNED: store i16
; ALIGNED: store i16

; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 1
define void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16* %out) #0 {
%out.gep.1 = getelementptr i16, i16* %out, i32 1

store i16 9, i16* %out, align 1
store i16 12, i16* %out.gep.1, align 1
ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align8(
; ALL: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 8
define void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16* %out) #0 {
%out.gep.1 = getelementptr i16, i16* %out, i32 1

store i16 9, i16* %out, align 8
store i16 12, i16* %out.gep.1, align 2
ret void
}

; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32
; ELT4: store i32
; ELT4: store i32
; ELT4: store i32

; ELT8-ALIGNED: store i32
; ELT8-ALIGNED: store i32
; ELT8-ALIGNED: store i32

; ELT8-UNALIGNED: store <2 x i32>
; ELT8-UNALIGNED: store i32

; ELT16-ALIGNED: store i32
; ELT16-ALIGNED: store i32
; ELT16-ALIGNED: store i32

; ELT16-UNALIGNED: store <3 x i32>
define void @merge_private_store_3_vector_elts_loads_v4i32(i32* %out) #0 {
%out.gep.1 = getelementptr i32, i32* %out, i32 1
%out.gep.2 = getelementptr i32, i32* %out, i32 2

store i32 9, i32* %out
store i32 1, i32* %out.gep.1
store i32 23, i32* %out.gep.2
ret void
}

; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32_align1(
; ALIGNED: store i32
; ALIGNED: store i32
; ALIGNED: store i32

; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32

; ELT8-UNALIGNED: store <2 x i32>
; ELT8-UNALIGNED: store i32

; ELT16-UNALIGNED: store <3 x i32>
define void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32* %out) #0 {
%out.gep.1 = getelementptr i32, i32* %out, i32 1
%out.gep.2 = getelementptr i32, i32* %out, i32 2

store i32 9, i32* %out, align 1
store i32 1, i32* %out.gep.1, align 1
store i32 23, i32* %out.gep.2, align 1
ret void
}

; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i8_align1(
; ALIGNED: store i8
; ALIGNED: store i8
; ALIGNED: store i8

; UNALIGNED: store <3 x i8>
define void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8* %out) #0 {
%out.gep.1 = getelementptr i8, i8* %out, i8 1
%out.gep.2 = getelementptr i8, i8* %out, i8 2

store i8 9, i8* %out, align 1
store i8 1, i8* %out.gep.1, align 1
store i8 23, i8* %out.gep.2, align 1
ret void
}

attributes #0 = { nounwind }

0 comments on commit 210095c

Please sign in to comment.