R600: Add dag combine for copy of an illegal type.
This helps avoid redundant instructions to unpack and repack
the vectors. Ideally we could recognize that pattern and eliminate
it. Currently v4i8 and other small element type vectors are scalarized,
so this has the added bonus of avoiding that.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213031 91177308-0d34-0410-b5e6-96231b3b80d8
arsenm committed Jul 15, 2014
1 parent 832e3ff commit 5fbf09a
Showing 5 changed files with 240 additions and 13 deletions.
56 changes: 55 additions & 1 deletion lib/Target/R600/AMDGPUISelLowering.cpp
@@ -360,6 +360,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :

setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::STORE);

setSchedulingPreference(Sched::RegPressure);
setJumpIsExpensive(true);
@@ -1896,6 +1897,56 @@ static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
return DAG.getConstant(Src0 >> Offset, MVT::i32);
}

// Returns true if every user of this load is a normal (unindexed,
// non-truncating) store.
static bool usesAllNormalStores(SDNode *LoadVal) {
for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) {
if (!ISD::isNormalStore(*I))
return false;
}

return true;
}

// If we have a copy of an illegal type, replace it with a load / store of an
// equivalently sized legal type. This avoids intermediate bit pack / unpack
// instructions emitted when handling extloads and truncstores. Ideally we could
// recognize the pack / unpack pattern to eliminate it.
SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (!DCI.isBeforeLegalize())
return SDValue();

StoreSDNode *SN = cast<StoreSDNode>(N);
SDValue Value = SN->getValue();
EVT VT = Value.getValueType();

if (isTypeLegal(VT) || SN->isVolatile() || !ISD::isNormalLoad(Value.getNode()))
return SDValue();

LoadSDNode *LoadVal = cast<LoadSDNode>(Value);
if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal))
return SDValue();

EVT MemVT = LoadVal->getMemoryVT();

SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;
EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT);

SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
LoadVT, SL,
LoadVal->getChain(),
LoadVal->getBasePtr(),
LoadVal->getOffset(),
LoadVT,
LoadVal->getMemOperand());

SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0));
DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false);

return DAG.getStore(SN->getChain(), SL, NewLoad,
SN->getBasePtr(), SN->getMemOperand());
}

SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
@@ -1928,7 +1979,7 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
}

SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);

@@ -2026,6 +2077,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,

break;
}

case ISD::STORE:
return performStoreCombine(N, DCI);
}
return SDValue();
}
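The combine above calls getEquivalentMemType, which is defined elsewhere in AMDGPUISelLowering.cpp and does not appear in this diff. A minimal, hypothetical sketch of the mapping such a helper has to perform — returning a legal type with the same store size as the illegal memory type, assumed here to be a single integer for widths up to 32 bits and a vector of i32 otherwise — might look like this:

// Hypothetical sketch only: the real getEquivalentMemType is not part of
// this change. It must map an illegal memory type to an equally sized
// legal type so the load/store pair above can be rewritten.
static EVT getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();

  // Small illegal types (e.g. v4i8, 32 bits) become a single integer.
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  // Wider types become a vector of i32 with the same total width.
  assert(StoreSize % 32 == 0 && "store size expected to be a multiple of 32");
  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}

With a mapping like this, a v4i8 copy is rewritten as an i32 load and store, which is what the BUFFER_LOAD_DWORD / BUFFER_STORE_DWORD checks in copy-illegal-type.ll below expect.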
1 change: 1 addition & 0 deletions lib/Target/R600/AMDGPUISelLowering.h
@@ -64,6 +64,7 @@ class AMDGPUTargetLowering : public TargetLowering {
SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;

SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;

protected:
166 changes: 166 additions & 0 deletions test/CodeGen/R600/copy-illegal-type.ll
@@ -0,0 +1,166 @@
; RUN: llc -march=r600 -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s

; FUNC-LABEL: @test_copy_v4i8
; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]]
; SI: BUFFER_STORE_DWORD [[REG]]
; SI: S_ENDPGM
define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
ret void
}

; FUNC-LABEL: @test_copy_v4i8_x2
; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]]
; SI: BUFFER_STORE_DWORD [[REG]]
; SI: BUFFER_STORE_DWORD [[REG]]
; SI: S_ENDPGM
define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
ret void
}

; FUNC-LABEL: @test_copy_v4i8_x3
; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]]
; SI: BUFFER_STORE_DWORD [[REG]]
; SI: BUFFER_STORE_DWORD [[REG]]
; SI: BUFFER_STORE_DWORD [[REG]]
; SI: S_ENDPGM
define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
ret void
}

; FUNC-LABEL: @test_copy_v4i8_x4
; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]]
; SI: BUFFER_STORE_DWORD [[REG]]
; SI: BUFFER_STORE_DWORD [[REG]]
; SI: BUFFER_STORE_DWORD [[REG]]
; SI: BUFFER_STORE_DWORD [[REG]]
; SI: S_ENDPGM
define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4
ret void
}

; FUNC-LABEL: @test_copy_v4i8_extra_use
; SI: BUFFER_LOAD_UBYTE
; SI: BUFFER_LOAD_UBYTE
; SI: BUFFER_LOAD_UBYTE
; SI: BUFFER_LOAD_UBYTE
; SI-DAG: V_ADD
; SI-DAG: V_ADD
; SI-DAG: V_ADD
; SI-DAG: V_ADD
; SI-DAG: BUFFER_STORE_BYTE
; SI-DAG: BUFFER_STORE_BYTE
; SI-DAG: BUFFER_STORE_BYTE
; SI-DAG: BUFFER_STORE_BYTE
; SI-DAG: BUFFER_STORE_BYTE
; SI-DAG: BUFFER_STORE_BYTE
; SI-DAG: BUFFER_STORE_BYTE
; SI-DAG: BUFFER_STORE_BYTE

; Once the scalarization of v4i8 loads is fixed, these should apply:
; XSI: BUFFER_LOAD_DWORD
; XSI: V_BFE
; XSI: V_ADD
; XSI: V_ADD
; XSI: V_ADD
; XSI: BUFFER_STORE_DWORD
; XSI: BUFFER_STORE_DWORD

; SI: S_ENDPGM
define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8> addrspace(1)* %in, align 4
%add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
ret void
}

; FUNC-LABEL: @test_copy_v4i8_x2_extra_use
; SI: BUFFER_LOAD_UBYTE
; SI: BUFFER_LOAD_UBYTE
; SI: BUFFER_LOAD_UBYTE
; SI: BUFFER_LOAD_UBYTE
; SI-DAG: V_ADD
; SI-DAG: V_ADD
; SI-DAG: V_ADD
; SI-DAG: V_ADD
; SI-DAG: BUFFER_STORE_BYTE
; SI-DAG: BUFFER_STORE_BYTE
; SI-DAG: BUFFER_STORE_BYTE
; SI-DAG: BUFFER_STORE_BYTE
; SI-DAG: BUFFER_STORE_BYTE
; SI-DAG: BUFFER_STORE_BYTE
; SI-DAG: BUFFER_STORE_BYTE
; SI-DAG: BUFFER_STORE_BYTE
; SI-DAG: BUFFER_STORE_BYTE
; SI-DAG: BUFFER_STORE_BYTE
; SI-DAG: BUFFER_STORE_BYTE
; SI-DAG: BUFFER_STORE_BYTE

; XSI: BUFFER_LOAD_DWORD
; XSI: BFE
; XSI: BUFFER_STORE_DWORD
; XSI: V_ADD
; XSI: BUFFER_STORE_DWORD
; XSI-NEXT: BUFFER_STORE_DWORD

; SI: S_ENDPGM
define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8> addrspace(1)* %in, align 4
%add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
ret void
}

; FUNC-LABEL: @test_copy_v3i8
; SI-NOT: BFE
; SI-NOT: BFI
; SI: S_ENDPGM
define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
%val = load <3 x i8> addrspace(1)* %in, align 4
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
ret void
}

; FUNC-LABEL: @test_copy_v4i8_volatile_load
; SI: BUFFER_LOAD_UBYTE
; SI: BUFFER_LOAD_UBYTE
; SI: BUFFER_LOAD_UBYTE
; SI: BUFFER_LOAD_UBYTE
; SI: S_ENDPGM
define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%val = load volatile <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
ret void
}

; FUNC-LABEL: @test_copy_v4i8_volatile_store
; SI: BUFFER_LOAD_UBYTE
; SI: BUFFER_LOAD_UBYTE
; SI: BUFFER_LOAD_UBYTE
; SI: BUFFER_LOAD_UBYTE
; SI: BUFFER_STORE_BYTE
; SI: BUFFER_STORE_BYTE
; SI: BUFFER_STORE_BYTE
; SI: BUFFER_STORE_BYTE
; SI: S_ENDPGM
define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8> addrspace(1)* %in, align 4
store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
ret void
}
24 changes: 16 additions & 8 deletions test/CodeGen/R600/indirect-private-64.ll
@@ -31,10 +31,14 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32

; SI-PROMOTE: DS_WRITE_B64
; SI-PROMOTE: DS_WRITE_B64
; SI-PROMOTE: DS_READ_B64
; SI-PROMOTE: DS_READ_B64
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_READ_B32
; SI-PROMOTE: DS_READ_B32
; SI-PROMOTE: DS_READ_B32
; SI-PROMOTE: DS_READ_B32
define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
%val = load <2 x double> addrspace(1)* %in, align 16
%array = alloca <2 x double>, i32 16, align 16
@@ -77,10 +81,14 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32

; SI-PROMOTE: DS_WRITE_B64
; SI-PROMOTE: DS_WRITE_B64
; SI-PROMOTE: DS_READ_B64
; SI-PROMOTE: DS_READ_B64
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_READ_B32
; SI-PROMOTE: DS_READ_B32
; SI-PROMOTE: DS_READ_B32
; SI-PROMOTE: DS_READ_B32
define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
%val = load <2 x i64> addrspace(1)* %in, align 16
%array = alloca <2 x i64>, i32 16, align 16
6 changes: 2 additions & 4 deletions test/CodeGen/R600/load.ll
@@ -254,8 +254,8 @@ entry:

; load a v2f32 value from the global address space
; FUNC-LABEL: @load_v2f32
; R600-CHECK: MEM_RAT
; R600-CHECK: VTX_READ_64

; SI-CHECK: BUFFER_LOAD_DWORDX2
define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
entry:
@@ -265,9 +265,7 @@ entry:
}

; FUNC-LABEL: @load_i64
; R600-CHECK: MEM_RAT
; R600-CHECK: MEM_RAT

; R600-CHECK: VTX_READ_64
; SI-CHECK: BUFFER_LOAD_DWORDX2
define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
entry:
