Skip to content

Commit

Permalink
AMDGPU: Always allocate emergency stack slot at offset 0
Browse files Browse the repository at this point in the history
This allows us to ensure that 0 is never a valid pointer
to a user object, and ensures that the offset is always legal
without needing a register to access it. This comes at the cost
of usable offsets and wasted stack space.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295877 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
arsenm committed Feb 22, 2017
1 parent ca81270 commit 138d429
Show file tree
Hide file tree
Showing 19 changed files with 222 additions and 174 deletions.
24 changes: 19 additions & 5 deletions lib/Target/AMDGPU/SIFrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -391,11 +391,25 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
if (!MFI.hasStackObjects())
return;

assert(RS && "RegScavenger required if spilling");
int ScavengeFI = MFI.CreateStackObject(
AMDGPU::SGPR_32RegClass.getSize(),
AMDGPU::SGPR_32RegClass.getAlignment(), false);
RS->addScavengingFrameIndex(ScavengeFI);
bool MayNeedScavengingEmergencySlot = MFI.hasStackObjects();
if (MayNeedScavengingEmergencySlot) {
// We force this to be at offset 0 so no user object ever has 0 as an
// address, so we may use 0 as an invalid pointer value. This is because
// LLVM assumes 0 is an invalid pointer in address space 0. Because alloca
// is required to be address space 0, we are forced to accept this for
// now. Ideally we could have the stack in another address space with 0 as a
// valid pointer, and -1 as the null value.
//
// This will also waste additional space when user stack objects require > 4
// byte alignment.
//
// The main cost here is losing the offset for addressing modes. However
// this also ensures we shouldn't need a register for the offset when
// emergency scavenging.
int ScavengeFI = MFI.CreateFixedObject(
AMDGPU::SGPR_32RegClass.getSize(), 0, false);
RS->addScavengingFrameIndex(ScavengeFI);
}

const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
Expand Down
12 changes: 6 additions & 6 deletions test/CodeGen/AMDGPU/amdgpu.private-memory.ll
Original file line number Diff line number Diff line change
Expand Up @@ -227,8 +227,8 @@ for.end:

; R600: MOVA_INT

; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} ; encoding: [0x00,0x00,0x68,0xe0
; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:2 ; encoding: [0x02,0x00,0x68,0xe0
; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:6 ; encoding: [0x06,0x00,0x68,0xe0
; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x68,0xe0
; Loaded value is 0 or 1, so sext will become zext, so we get buffer_load_ushort instead of buffer_load_sshort.
; SI-ALLOCA: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}

Expand All @@ -253,11 +253,11 @@ entry:

; R600: MOVA_INT

; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} ; encoding:
; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:1 ; encoding:
; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding:
; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding:

; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} ; encoding: [0x00,0x00,0x60,0xe0
; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:1 ; encoding: [0x01,0x00,0x60,0xe0
; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0
; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0
define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%0 = alloca [2 x i8]
Expand Down
6 changes: 1 addition & 5 deletions test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,7 @@ declare void @llvm.amdgcn.s.barrier() #2

; SI-LABEL: {{^}}test_private_array_ptr_calc:

; FIXME: We end up with zero argument for ADD, because
; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index
; with the appropriate offset. We should fold this into the store.

; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 0, v{{[0-9]+}}
; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 16, v{{[0-9]+}}
; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64
; SI-ALLOCA: s_barrier
; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64
Expand Down
50 changes: 25 additions & 25 deletions test/CodeGen/AMDGPU/captured-frame-index.ll
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=amdgcn-- -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; GCN-LABEL: {{^}}store_fi_lifetime:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
; GCN: buffer_store_dword [[FI]]
define void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 {
entry:
Expand All @@ -15,7 +15,7 @@ entry:
; GCN-LABEL: {{^}}stored_fi_to_lds:
; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
; GCN: buffer_store_dword v{{[0-9]+}}, off,
; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 4{{$}}
; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
Expand All @@ -27,16 +27,16 @@ define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {

; Offset is applied
; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects:
; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}

; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]]

; GCN-DAG: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO]]

; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
; GCN: ds_write_b32 [[VLDSPTR]], [[FI1]]
define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
%tmp0 = alloca float
Expand All @@ -51,9 +51,9 @@ define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
; Same frame index is used multiple times in the store
; GCN-LABEL: {{^}}stored_fi_to_self:
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4d2{{$}}
; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
define void @stored_fi_to_self() #0 {
%tmp = alloca i32*

Expand All @@ -66,13 +66,13 @@ define void @stored_fi_to_self() #0 {

; GCN-LABEL: {{^}}stored_fi_to_self_offset:
; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 32{{$}}
; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}

; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x4d2{{$}}
; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2048{{$}}
; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}}

; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x800{{$}}
; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2048{{$}}
; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}}
; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}}
define void @stored_fi_to_self_offset() #0 {
%tmp0 = alloca [512 x i32]
%tmp1 = alloca i32*
Expand All @@ -89,15 +89,15 @@ define void @stored_fi_to_self_offset() #0 {
}

; GCN-LABEL: {{^}}stored_fi_to_fi:
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}}

; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}}

; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
define void @stored_fi_to_fi() #0 {
%tmp0 = alloca i32*
%tmp1 = alloca i32*
Expand All @@ -115,8 +115,8 @@ define void @stored_fi_to_fi() #0 {
}

; GCN-LABEL: {{^}}stored_fi_to_global:
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
; GCN: buffer_store_dword [[FI]]
define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
%tmp = alloca float
Expand All @@ -127,14 +127,14 @@ define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {

; Offset is applied
; GCN-LABEL: {{^}}stored_fi_to_global_2_small_objects:
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}}

; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}

; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
%tmp0 = alloca float
Expand All @@ -150,10 +150,10 @@ define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {

; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset:
; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}

; FIXME: Re-initialize
; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 4{{$}}

; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; GCN-DAG: v_add_i32_e32 [[BASE_1_OFF_1:v[0-9]+]], vcc, 0x3ffc, [[BASE_0_1]]
Expand Down Expand Up @@ -184,7 +184,7 @@ define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN: s_add_u32 s{{[0-9]+}}, s[[PC_LO]], g1@gotpcrel32@lo+4
; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC_HI]], g1@gotpcrel32@hi+4
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
; GCN: buffer_store_dword [[FI]]
define void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 {
entry:
Expand Down
45 changes: 43 additions & 2 deletions test/CodeGen/AMDGPU/cgp-addressing-modes.ll
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,47 @@ done:
; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
; GCN: {{^}}BB4_2:
define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
%alloca = alloca [512 x i32], align 4
%out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%add.arg = add i32 %arg, 8
%alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1022
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if

if:
store volatile i32 123, i32* %alloca.gep
%tmp1 = load volatile i32, i32* %alloca.gep
br label %endif

endif:
%x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
store i32 %x, i32 addrspace(1)* %out.gep.0
%load = load volatile i32, i32* %alloca.gep
store i32 %load, i32 addrspace(1)* %out.gep.1
br label %done

done:
ret void
}

; This ends up not fitting due to the reserved 4 bytes at offset 0
; OPT-LABEL: @test_sink_scratch_small_offset_i32_reserved(
; OPT-NOT: getelementptr [512 x i32]
; OPT: br i1
; OPT: ptrtoint

; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved:
; GCN: s_and_saveexec_b64
; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4
; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4
; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
; GCN: {{^BB[0-9]+}}_2:

define void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
%alloca = alloca [512 x i32], align 4
%out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
Expand Down Expand Up @@ -165,7 +206,7 @@ done:
; GCN: s_and_saveexec_b64
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; GCN: {{^}}BB5_2:
; GCN: {{^BB[0-9]+}}_2:
define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
%alloca = alloca [512 x i32], align 4
Expand Down Expand Up @@ -197,7 +238,7 @@ done:
; GCN: s_and_saveexec_b64
; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GCN: {{^}}BB6_2:
; GCN: {{^BB[0-9]+}}_2:
define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
entry:
%offset.ext = zext i32 %offset to i64
Expand Down
2 changes: 1 addition & 1 deletion test/CodeGen/AMDGPU/commute-compares.ll
Original file line number Diff line number Diff line change
Expand Up @@ -701,7 +701,7 @@ define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_frameindex:
; XGCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}

; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}}
define void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 {
entry:
Expand Down
32 changes: 15 additions & 17 deletions test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,12 @@


; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 ; 4-byte Folded Spill
; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:8 ; 4-byte Folded Spill

; Spill load
; VMEM: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; VGPR: buffer_store_dword [[LOAD0]], off, s[0:3], s7 ; 4-byte Folded Spill
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}

Expand All @@ -44,8 +43,7 @@
; GCN: {{^}}BB{{[0-9]+}}_1: ; %if
; GCN: s_mov_b32 m0, -1
; GCN: ds_read_b32 [[LOAD1:v[0-9]+]]
; VMEM: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; VGPR: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
; GCN: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; GCN: s_waitcnt vmcnt(0)

; Spill val register
Expand All @@ -60,11 +58,11 @@



; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]

; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:8 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]

Expand Down Expand Up @@ -107,17 +105,17 @@ endif:
; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}

; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 ; 4-byte Folded Spill
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill

; Spill saved exec
; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]


; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:16 ; 4-byte Folded Spill
; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill
; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:24 ; 4-byte Folded Spill

; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}

Expand All @@ -127,7 +125,7 @@ endif:


; GCN: [[LOOP:BB[0-9]+_[0-9]+]]:
; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]]
; GCN: v_cmp_ne_u32_e32 vcc,
; GCN: s_and_b64 vcc, exec, vcc
Expand All @@ -140,11 +138,11 @@ endif:
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]

; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:16 ; 4-byte Folded Reload
; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]

; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload
; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:24 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]

Expand Down Expand Up @@ -187,7 +185,7 @@ end:
; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}

; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 ; 4-byte Folded Spill
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill

; Spill saved exec
; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
Expand Down Expand Up @@ -244,14 +242,14 @@ end:

; GCN: BB{{[0-9]+}}_2: ; %if
; GCN: ds_read_b32
; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill
; GCN: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_branch [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: [[ELSE]]: ; %else
; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: s_waitcnt vmcnt(0) expcnt(0)
Expand Down
Loading

0 comments on commit 138d429

Please sign in to comment.