Skip to content

Commit

Permalink
Merging r260658:
Browse files Browse the repository at this point in the history
------------------------------------------------------------------------
r260658 | Matthew.Arsenault | 2016-02-11 22:31:30 -0800 (Thu, 11 Feb 2016) | 12 lines

AMDGPU: Set flat_scratch from flat_scratch_init reg

This was hardcoded to the static private size, but this
would be missing the offset and additional size for someday
when we have dynamic sizing.

Also stops always initializing flat_scratch even when unused.

In the future we should stop emitting this unless flat instructions
are used to access private memory. For example this will initialize
it almost always on VI because flat is used for global access.

------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@271684 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
tstellarAMD committed Jun 3, 2016
1 parent 737edaf commit b4989f0
Show file tree
Hide file tree
Showing 14 changed files with 142 additions and 155 deletions.
57 changes: 42 additions & 15 deletions lib/Target/AMDGPU/SIFrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,8 @@ using namespace llvm;

// Return true when the only users of the stack are SGPR spills: the function
// must have at least one spilled SGPR, no spilled VGPRs, and no stack objects
// other than spill slots. (FrameInfo is retained for interface compatibility;
// the non-spill-object check is now cached in SIMachineFunctionInfo.)
//
// NOTE(review): the scraped text concatenated the old implementation (a loop
// over frame objects ending in `return true;`) with the new one-line form,
// leaving unreachable code after the first return — only the final
// implementation is kept here.
static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
                              const MachineFrameInfo *FrameInfo) {
  return FuncInfo->hasSpilledSGPRs() &&
         (!FuncInfo->hasSpilledVGPRs() &&
          !FuncInfo->hasNonSpillStackObjects());
}

static ArrayRef<MCPhysReg> getAllSGPR128() {
Expand Down Expand Up @@ -67,6 +56,8 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
MachineBasicBlock::iterator I = MBB.begin();

// We need to insert initialization of the scratch resource descriptor.
unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
Expand All @@ -84,6 +75,44 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
}

if (MFI->hasFlatScratchInit()) {
// We don't need this if we only have spills since there is no user-facing
// scratch.

// TODO: If we know we don't have flat instructions earlier, we can omit
// this from the input registers.
//
// TODO: We only need to know if we access scratch space through a flat
// pointer. Because we only detect if flat instructions are used at all,
// this will be used more often than necessary on VI.

DebugLoc DL;

unsigned FlatScratchInitReg
= TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);

MRI.addLiveIn(FlatScratchInitReg);
MBB.addLiveIn(FlatScratchInitReg);

// Copy the size in bytes.
unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::FLAT_SCR_LO)
.addReg(FlatScrInitHi, RegState::Kill);

unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);

// Add wave offset in bytes to private base offset.
// See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
.addReg(FlatScrInitLo)
.addReg(ScratchWaveOffsetReg);

// Convert offset to 256-byte units.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
.addReg(FlatScrInitLo, RegState::Kill)
.addImm(8);
}

// If we reserved the original input registers, we don't need to copy to the
// reserved registers.
if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
Expand All @@ -96,7 +125,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,

// We added live-ins during argument lowering, but since they were not used
// they were deleted. We're adding the uses now, so add them back.
MachineRegisterInfo &MRI = MF.getRegInfo();
MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);

Expand Down Expand Up @@ -160,7 +188,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));

const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
MachineBasicBlock::iterator I = MBB.begin();
DebugLoc DL;

if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
Expand Down
11 changes: 10 additions & 1 deletion lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,12 @@ SDValue SITargetLowering::LowerFormalArguments(
CCInfo.AllocateReg(InputPtrReg);
}

if (Info->hasFlatScratchInit()) {
unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass);
CCInfo.AllocateReg(FlatScratchInitReg);
}

AnalyzeFormalArguments(CCInfo, Splits);

SmallVector<SDValue, 16> Chains;
Expand Down Expand Up @@ -812,8 +818,11 @@ SDValue SITargetLowering::LowerFormalArguments(

// Now that we've figured out where the scratch register inputs are, see if
// should reserve the arguments and use them directly.

bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
// Record that we know we have non-spill stack objects so we don't need to
// check all stack objects later.
if (HasStackObjects)
Info->setHasNonSpillStackObjects(true);

if (ST.isAmdHsaOS()) {
// TODO: Assume we will spill without optimizations.
Expand Down
38 changes: 3 additions & 35 deletions lib/Target/AMDGPU/SILowerControlFlow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -572,43 +572,11 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
AMDGPU::EXEC).addReg(AMDGPU::EXEC);
}

// FIXME: This seems inappropriate to do here.
if (NeedFlat && MFI->IsKernel) {
// Insert the prologue initializing the SGPRs pointing to the scratch space
// for flat accesses.
const MachineFrameInfo *FrameInfo = MF.getFrameInfo();

// TODO: What to use with function calls?

// FIXME: This is reporting stack size that is used in a scratch buffer
// rather than registers as well.
uint64_t StackSizeBytes = FrameInfo->getStackSize();

int IndirectBegin
= static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
// Convert register index to 256-byte unit.
uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);

assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff &&
"Stack limits should be smaller than 16-bits");

// Initialize the flat scratch register pair.
// TODO: Can we use one s_mov_b64 here?

// Offset is in units of 256-bytes.
MachineBasicBlock &MBB = MF.front();
DebugLoc NoDL;
MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);

assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes));

BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
.addImm(StackOffset);

// Documentation says size is "per-thread scratch size in bytes"
BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
.addImm(StackSizeBytes);
// We will need to initialize the flat scratch register pair.
if (NeedFlat)
MFI->setHasFlatInstructions(true);
}

return true;
Expand Down
24 changes: 20 additions & 4 deletions lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
NumSystemSGPRs(0),
HasSpilledSGPRs(false),
HasSpilledVGPRs(false),
HasNonSpillStackObjects(false),
HasFlatInstructions(false),
PrivateSegmentBuffer(false),
DispatchPtr(false),
QueuePtr(false),
Expand Down Expand Up @@ -93,6 +95,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (F->hasFnAttribute("amdgpu-work-item-id-z"))
WorkItemIDZ = true;

// X, XY, and XYZ are the only supported combinations, so make sure Y is
// enabled if Z is.
if (WorkItemIDZ)
WorkItemIDY = true;

bool MaySpill = ST.isVGPRSpillingEnabled(this);
bool HasStackObjects = FrameInfo->hasStackObjects();

Expand All @@ -107,10 +114,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
DispatchPtr = true;
}

// X, XY, and XYZ are the only supported combinations, so make sure Y is
// enabled if Z is.
if (WorkItemIDZ)
WorkItemIDY = true;
// We don't need to worry about accessing spills with flat instructions.
// TODO: On VI where we must use flat for global, we should be able to omit
// this if it is never used for generic access.
if (HasStackObjects && ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS &&
ST.isAmdHsaOS())
FlatScratchInit = true;
}

unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
Expand Down Expand Up @@ -142,6 +151,13 @@ unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI)
return KernargSegmentPtrUserSGPR;
}

unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
  // Claim the next sub0-aligned SGPR pair as the user SGPRs that receive the
  // flat scratch initialization value, record it, and account for the two
  // user SGPRs consumed.
  const unsigned PairReg = TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
  FlatScratchInitUserSGPR = PairReg;
  NumUserSGPRs += 2;
  return PairReg;
}

SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
MachineFunction *MF,
unsigned FrameIndex,
Expand Down
19 changes: 19 additions & 0 deletions lib/Target/AMDGPU/SIMachineFunctionInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction {
private:
bool HasSpilledSGPRs;
bool HasSpilledVGPRs;
bool HasNonSpillStackObjects;
bool HasFlatInstructions;

// Feature bits required for inputs passed in user SGPRs.
bool PrivateSegmentBuffer : 1;
Expand Down Expand Up @@ -129,6 +131,7 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction {
unsigned addDispatchPtr(const SIRegisterInfo &TRI);
unsigned addQueuePtr(const SIRegisterInfo &TRI);
unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
unsigned addFlatScratchInit(const SIRegisterInfo &TRI);

// Add system SGPRs.
unsigned addWorkGroupIDX() {
Expand Down Expand Up @@ -277,6 +280,22 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction {
HasSpilledVGPRs = Spill;
}

// True when the function has stack objects other than register spill slots.
bool hasNonSpillStackObjects() const {
return HasNonSpillStackObjects;
}

// Record whether non-spill stack objects exist (defaults to marking true).
void setHasNonSpillStackObjects(bool StackObject = true) {
HasNonSpillStackObjects = StackObject;
}

// True when the function uses flat memory instructions.
bool hasFlatInstructions() const {
return HasFlatInstructions;
}

// Record whether flat instructions are used (defaults to marking true).
void setHasFlatInstructions(bool UseFlat = true) {
HasFlatInstructions = UseFlat;
}

unsigned getPSInputAddr() const {
return PSInputAddr;
}
Expand Down
5 changes: 5 additions & 0 deletions lib/Target/AMDGPU/SIRegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,11 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
case SIRegisterInfo::KERNARG_SEGMENT_PTR:
assert(MFI->hasKernargSegmentPtr());
return MFI->KernargSegmentPtrUserSGPR;
case SIRegisterInfo::DISPATCH_ID:
llvm_unreachable("unimplemented");
case SIRegisterInfo::FLAT_SCRATCH_INIT:
assert(MFI->hasFlatScratchInit());
return MFI->FlatScratchInitUserSGPR;
case SIRegisterInfo::DISPATCH_PTR:
assert(MFI->hasDispatchPtr());
return MFI->DispatchPtrUserSGPR;
Expand Down
4 changes: 3 additions & 1 deletion lib/Target/AMDGPU/SIRegisterInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,12 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {

enum PreloadedValue {
// SGPRS:
PRIVATE_SEGMENT_BUFFER = 0,
PRIVATE_SEGMENT_BUFFER = 0,
DISPATCH_PTR = 1,
QUEUE_PTR = 2,
KERNARG_SEGMENT_PTR = 3,
DISPATCH_ID = 4,
FLAT_SCRATCH_INIT = 5,
WORKGROUP_ID_X = 10,
WORKGROUP_ID_Y = 11,
WORKGROUP_ID_Z = 12,
Expand Down
2 changes: 0 additions & 2 deletions test/CodeGen/AMDGPU/cgp-addressing-modes.ll
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,6 @@ done:
}

; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32:
; VI-DAG: s_movk_i32 flat_scratch_lo, 0x0
; VI-DAG: s_movk_i32 flat_scratch_hi, 0x0
; GCN: s_and_saveexec_b64
; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
Expand Down
19 changes: 15 additions & 4 deletions test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
Original file line number Diff line number Diff line change
@@ -1,11 +1,22 @@
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck -check-prefix=HSA-DEFAULT %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck -check-prefix=HSA-NODEFAULT %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri | FileCheck -check-prefix=NOHSA-DEFAULT %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri -mattr=+flat-for-global | FileCheck -check-prefix=NOHSA-NODEFAULT %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-DEFAULT -check-prefix=ALL %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-NODEFAULT -check-prefix=ALL %s
; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=NOHSA-DEFAULT -check-prefix=ALL %s
; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=NOHSA-NODEFAULT -check-prefix=ALL %s


; There are no stack objects even though flat is used by default, so
; flat_scratch_init should be disabled.

; ALL-LABEL: {{^}}test:
; HSA: .amd_kernel_code_t
; HSA: enable_sgpr_flat_scratch_init = 0
; HSA: .end_amd_kernel_code_t

; ALL-NOT: flat_scr

; HSA-DEFAULT: flat_store_dword
; HSA-NODEFAULT: buffer_store_dword

; NOHSA-DEFAULT: buffer_store_dword
; NOHSA-NODEFAULT: flat_store_dword
define void @test(i32 addrspace(1)* %out) {
Expand Down
Loading

0 comments on commit b4989f0

Please sign in to comment.