Skip to content

Commit

Permalink
AMDGPU/R600: Use correct number of vector elements when lowering private loads
Browse files Browse the repository at this point in the history

Reviewer: tstellardAMD, arsenm

Subscribers: arsenm, kzhuravl, llvm-commits

Differential Revision: http://reviews.llvm.org/D20032

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@269725 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
jvesely committed May 16, 2016
1 parent dd6e786 commit 350e40f
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 5 deletions.
8 changes: 3 additions & 5 deletions lib/Target/AMDGPU/R600ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1679,6 +1679,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
EVT ElemVT = VT.getVectorElementType();
SDValue Loads[4];

assert(NumElemVT <= 4);
assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
"vector width in load");

Expand All @@ -1692,11 +1693,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
DAG.getTargetConstant(Channel, DL, MVT::i32),
Op.getOperand(2));
}
for (unsigned i = NumElemVT; i < 4; ++i) {
Loads[i] = DAG.getUNDEF(ElemVT);
}
EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
LoweredLoad = DAG.getBuildVector(TargetVT, DL, Loads);
EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
} else {
LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
Chain, Ptr,
Expand Down
105 changes: 105 additions & 0 deletions test/CodeGen/AMDGPU/amdgpu.private-memory.ll
Original file line number Diff line number Diff line change
Expand Up @@ -401,4 +401,109 @@ define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {

; NOHSAOPT: !0 = !{i32 0, i32 2048}


; FUNC-LABEL: v16i32_stack:

; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT

; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword

; Test input that loads a whole <16 x i32> vector out of a stack object and
; stores it through the addrspace(1) pointer %out.
; The array element is selected by the runtime argument %a, so the address of
; the load is not a compile-time constant.
; NOTE(review): nothing in this function stores to %alloca, so the loaded
; value is unspecified; presumably only the emitted instruction sequence is
; of interest to the checks above - confirm.
define void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
; Stack object holding two 16-element i32 vectors.
%alloca = alloca [2 x <16 x i32>]
; The leading index 0 steps through the alloca pointer itself; %a then picks
; one of the two vectors.
%tmp0 = getelementptr [2 x <16 x i32>], [2 x <16 x i32>]* %alloca, i32 0, i32 %a
; Read all 16 lanes with a single vector load, then write them out unchanged.
%tmp5 = load <16 x i32>, <16 x i32>* %tmp0
store <16 x i32> %tmp5, <16 x i32> addrspace(1)* %out
ret void
}

; FUNC-LABEL: v16float_stack:

; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT
; R600: MOVA_INT

; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword

; Test input that loads a whole <16 x float> vector out of a stack object and
; stores it through the addrspace(1) pointer %out.
; Same shape as a 16-lane integer load, but with float elements; the array
; element is selected by the runtime argument %a.
; NOTE(review): nothing in this function stores to %alloca, so the loaded
; value is unspecified; presumably only the emitted instruction sequence is
; of interest to the checks above - confirm.
define void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
; Stack object holding two 16-element float vectors.
%alloca = alloca [2 x <16 x float>]
; The leading index 0 steps through the alloca pointer itself; %a then picks
; one of the two vectors.
%tmp0 = getelementptr [2 x <16 x float>], [2 x <16 x float>]* %alloca, i32 0, i32 %a
; Read all 16 lanes with a single vector load, then write them out unchanged.
%tmp5 = load <16 x float>, <16 x float>* %tmp0
store <16 x float> %tmp5, <16 x float> addrspace(1)* %out
ret void
}

; FUNC-LABEL: v2float_stack:

; R600: MOVA_INT
; R600: MOVA_INT

; SI: buffer_load_dword
; SI: buffer_load_dword

; Test input that loads a <2 x float> vector out of a stack object and stores
; it through the addrspace(1) pointer %out.
; Here the vector has only two lanes and the array has sixteen entries; the
; entry is selected by the runtime argument %a.
; NOTE(review): nothing in this function stores to %alloca, so the loaded
; value is unspecified; presumably only the emitted instruction sequence is
; of interest to the checks above - confirm.
define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
; Stack object holding sixteen 2-element float vectors.
%alloca = alloca [16 x <2 x float>]
; The leading index 0 steps through the alloca pointer itself; %a then picks
; one of the sixteen vectors.
%tmp0 = getelementptr [16 x <2 x float>], [16 x <2 x float>]* %alloca, i32 0, i32 %a
; Read both lanes with a single vector load, then write them out unchanged.
%tmp5 = load <2 x float>, <2 x float>* %tmp0
store <2 x float> %tmp5, <2 x float> addrspace(1)* %out
ret void
}

attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="2" }

0 comments on commit 350e40f

Please sign in to comment.