Skip to content

Commit

Permalink
AMDGPU: Refactor frame lowering
Browse files Browse the repository at this point in the history
This will make future changes easier.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@280296 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
arsenm committed Aug 31, 2016
1 parent 4d1149d commit 8cf15c6
Show file tree
Hide file tree
Showing 2 changed files with 181 additions and 121 deletions.
279 changes: 158 additions & 121 deletions lib/Target/AMDGPU/SIFrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,145 @@ static ArrayRef<MCPhysReg> getAllSGPRs() {
AMDGPU::SGPR_32RegClass.getNumRegs());
}

void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII,
const SIRegisterInfo* TRI,
MachineFunction &MF,
MachineBasicBlock &MBB) const {
// We don't need this if we only have spills since there is no user facing
// scratch.

// TODO: If we know we don't have flat instructions earlier, we can omit
// this from the input registers.
//
// TODO: We only need to know if we access scratch space through a flat
// pointer. Because we only detect if flat instructions are used at all,
// this will be used more often than necessary on VI.

// Debug location must be unknown since the first debug location is used to
// determine the end of the prologue.
DebugLoc DL;
MachineBasicBlock::iterator I = MBB.begin();

unsigned FlatScratchInitReg
= TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);

MachineRegisterInfo &MRI = MF.getRegInfo();
MRI.addLiveIn(FlatScratchInitReg);
MBB.addLiveIn(FlatScratchInitReg);

// Copy the size in bytes.
unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::FLAT_SCR_LO)
.addReg(FlatScrInitHi, RegState::Kill);

unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);

const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();


// Add wave offset in bytes to private base offset.
// See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
.addReg(FlatScrInitLo)
.addReg(ScratchWaveOffsetReg);

// Convert offset to 256-byte units.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
.addReg(FlatScrInitLo, RegState::Kill)
.addImm(8);
}

unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
const SISubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
MachineFunction &MF) const {

// We need to insert initialization of the scratch resource descriptor.
unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
assert(ScratchRsrcReg != AMDGPU::NoRegister);

if (ST.hasSGPRInitBug() ||
ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
return ScratchRsrcReg;

// We reserved the last registers for this. Shift it down to the end of those
// which were actually used.
//
// FIXME: It might be safer to use a pseudoregister before replacement.

// FIXME: We should be able to eliminate unused input registers. We only
// cannot do this for the resources required for scratch access. For now we
// skip over user SGPRs and may leave unused holes.

// We find the resource first because it has an alignment requirement.

MachineRegisterInfo &MRI = MF.getRegInfo();

unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
// Skip the last 2 elements because the last one is reserved for VCC, and
// this is the 2nd to last element already.
for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
// Pick the first unallocated one. Make sure we don't clobber the other
// reserved input we needed.
if (!MRI.isPhysRegUsed(Reg)) {
assert(MRI.isAllocatable(Reg));
MRI.replaceRegWith(ScratchRsrcReg, Reg);
MFI->setScratchRSrcReg(Reg);
return Reg;
}
}

return ScratchRsrcReg;
}

unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
const SISubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
MachineFunction &MF) const {
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
if (ST.hasSGPRInitBug() ||
ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF))
return ScratchWaveOffsetReg;

unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();

// We need to drop register from the end of the list that we cannot use
// for the scratch wave offset.
// + 2 s102 and s103 do not exist on VI.
// + 2 for vcc
// + 2 for xnack_mask
// + 2 for flat_scratch
// + 4 for registers reserved for scratch resource register
// + 1 for register reserved for scratch wave offset. (By exluding this
// register from the list to consider, it means that when this
// register is being used for the scratch wave offset and there
// are no other free SGPRs, then the value will stay in this register.
// ----
// 13
for (MCPhysReg Reg : getAllSGPRs().drop_back(13).slice(NumPreloaded)) {
// Pick the first unallocated SGPR. Be careful not to pick an alias of the
// scratch descriptor, since we haven’t added its uses yet.
if (!MRI.isPhysRegUsed(Reg)) {
if (!MRI.isAllocatable(Reg) ||
TRI->isSubRegisterEq(ScratchRsrcReg, Reg))
continue;

MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
MFI->setScratchWaveOffsetReg(Reg);
return Reg;
}
}

return ScratchWaveOffsetReg;
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
// Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
Expand All @@ -63,15 +202,19 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
MachineBasicBlock::iterator I = MBB.begin();

// We need to insert initialization of the scratch resource descriptor.
unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
unsigned ScratchRsrcReg
= getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
unsigned ScratchWaveOffsetReg
= getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
assert(ScratchRsrcReg != AMDGPU::NoRegister);

unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
assert(ScratchWaveOffsetReg != AMDGPU::NoRegister);
assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));

if (MFI->hasFlatScratchInit())
emitFlatScratchInit(TII, TRI, MF, MBB);

// We need to insert initialization of the scratch resource descriptor.
unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

Expand All @@ -81,46 +224,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
}

if (MFI->hasFlatScratchInit()) {
// We don't need this if we only have spills since there is no user facing
// scratch.

// TODO: If we know we don't have flat instructions earlier, we can omit
// this from the input registers.
//
// TODO: We only need to know if we access scratch space through a flat
// pointer. Because we only detect if flat instructions are used at all,
// this will be used more often than necessary on VI.

// Debug location must be unknown since the first debug location is used to
// determine the end of the prologue.
DebugLoc DL;

unsigned FlatScratchInitReg
= TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);

MRI.addLiveIn(FlatScratchInitReg);
MBB.addLiveIn(FlatScratchInitReg);

// Copy the size in bytes.
unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::FLAT_SCR_LO)
.addReg(FlatScrInitHi, RegState::Kill);

unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);

// Add wave offset in bytes to private base offset.
// See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
.addReg(FlatScrInitLo)
.addReg(ScratchWaveOffsetReg);

// Convert offset to 256-byte units.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
.addReg(FlatScrInitLo, RegState::Kill)
.addImm(8);
}

// If we reserved the original input registers, we don't need to copy to the
// reserved registers.
if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
Expand All @@ -130,7 +233,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
return;
}


// We added live-ins during argument lowering, but since they were not used
// they were deleted. We're adding the uses now, so add them back.
MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
Expand All @@ -141,79 +243,23 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
MBB.addLiveIn(PreloadedPrivateBufferReg);
}

if (!ST.hasSGPRInitBug()) {
// We reserved the last registers for this. Shift it down to the end of those
// which were actually used.
//
// FIXME: It might be safer to use a pseudoregister before replacement.

// FIXME: We should be able to eliminate unused input registers. We only
// cannot do this for the resources required for scratch access. For now we
// skip over user SGPRs and may leave unused holes.

// We find the resource first because it has an alignment requirement.
if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) {
MachineRegisterInfo &MRI = MF.getRegInfo();

unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
// Skip the last 2 elements because the last one is reserved for VCC, and
// this is the 2nd to last element already.
for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
// Pick the first unallocated one. Make sure we don't clobber the other
// reserved input we needed.
if (!MRI.isPhysRegUsed(Reg)) {
assert(MRI.isAllocatable(Reg));
MRI.replaceRegWith(ScratchRsrcReg, Reg);
ScratchRsrcReg = Reg;
MFI->setScratchRSrcReg(ScratchRsrcReg);
break;
}
}
}
// Make the register selected live throughout the function.
for (MachineBasicBlock &OtherBB : MF) {
if (&OtherBB == &MBB)
continue;

if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();

// We need to drop register from the end of the list that we cannot use
// for the scratch wave offset.
// + 2 s102 and s103 do not exist on VI.
// + 2 for vcc
// + 2 for xnack_mask
// + 2 for flat_scratch
// + 4 for registers reserved for scratch resource register
// + 1 for register reserved for scratch wave offset. (By exluding this
// register from the list to consider, it means that when this
// register is being used for the scratch wave offset and there
// are no other free SGPRs, then the value will stay in this register.
// ----
// 13
for (MCPhysReg Reg : getAllSGPRs().drop_back(13).slice(NumPreloaded)) {
// Pick the first unallocated SGPR. Be careful not to pick an alias of the
// scratch descriptor, since we haven’t added its uses yet.
if (!MRI.isPhysRegUsed(Reg)) {
if (!MRI.isAllocatable(Reg) ||
TRI->isSubRegisterEq(ScratchRsrcReg, Reg))
continue;

MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
ScratchWaveOffsetReg = Reg;
MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
break;
}
}
}
OtherBB.addLiveIn(ScratchRsrcReg);
OtherBB.addLiveIn(ScratchWaveOffsetReg);
}


assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));

const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
DebugLoc DL;
MachineBasicBlock::iterator I = MBB.begin();

if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
// Make sure we emit the copy for the offset first. We may have chosen to copy
// the buffer resource into a register that aliases the input offset register.
// Make sure we emit the copy for the offset first. We may have chosen to
// copy the buffer resource into a register that aliases the input offset
// register.
BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg)
.addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
}
Expand Down Expand Up @@ -260,15 +306,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
.addImm(Rsrc23 >> 32)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
}

// Make the register selected live throughout the function.
for (MachineBasicBlock &OtherBB : MF) {
if (&OtherBB == &MBB)
continue;

OtherBB.addLiveIn(ScratchRsrcReg);
OtherBB.addLiveIn(ScratchWaveOffsetReg);
}
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
Expand Down
23 changes: 23 additions & 0 deletions lib/Target/AMDGPU/SIFrameLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
#include "AMDGPUFrameLowering.h"

namespace llvm {
class SIInstrInfo;
class SIMachineFunctionInfo;
class SIRegisterInfo;
class SISubtarget;

class SIFrameLowering final : public AMDGPUFrameLowering {
public:
Expand All @@ -31,6 +35,25 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
RegScavenger *RS = nullptr) const override;

private:
void emitFlatScratchInit(const SIInstrInfo *TII,
const SIRegisterInfo* TRI,
MachineFunction &MF,
MachineBasicBlock &MBB) const;

unsigned getReservedPrivateSegmentBufferReg(
const SISubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
MachineFunction &MF) const;

unsigned getReservedPrivateSegmentWaveByteOffsetReg(
const SISubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
MachineFunction &MF) const;

/// \brief Emits debugger prologue.
void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
};
Expand Down

0 comments on commit 8cf15c6

Please sign in to comment.