Skip to content

Commit

Permalink
[AMDGPU] Emit debugger prologue and emit the rest of the debugger fie…
Browse files Browse the repository at this point in the history
…lds in the kernel code header

Debugger prologue is emitted if -mattr=+amdgpu-debugger-emit-prologue.

The debugger prologue writes work group IDs and work item IDs to scratch memory at a fixed location in the following format:
  - offset 0: work group ID x
  - offset 4: work group ID y
  - offset 8: work group ID z
  - offset 16: work item ID x
  - offset 20: work item ID y
  - offset 24: work item ID z

Set
  - amd_kernel_code_t::debug_wavefront_private_segment_offset_sgpr to scratch wave offset reg
  - amd_kernel_code_t::debug_private_segment_buffer_sgpr to scratch rsrc reg
  - amd_kernel_code_t::is_debug_supported to true if all debugger features are enabled

Differential Revision: http://reviews.llvm.org/D20335


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@273769 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
kzhuravl committed Jun 25, 2016
1 parent 005f32a commit 20c7a48
Show file tree
Hide file tree
Showing 12 changed files with 287 additions and 5 deletions.
7 changes: 7 additions & 0 deletions lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,13 @@ def FeatureDebuggerReserveRegs : SubtargetFeature<
"Reserve registers for debugger usage"
>;

// Subtarget feature selected with -mattr=+amdgpu-debugger-emit-prologue.
// Enabling it sets the subtarget's DebuggerEmitPrologue field to "true", which
// requests emission of the debugger prologue.
def FeatureDebuggerEmitPrologue : SubtargetFeature<
"amdgpu-debugger-emit-prologue",
"DebuggerEmitPrologue",
"true",
"Emit debugger prologue"
>;

//===----------------------------------------------------------------------===//

def AMDGPUInstrInfo : InstrInfo {
Expand Down
27 changes: 27 additions & 0 deletions lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,13 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount),
false);

if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" +
Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false);
}

OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
false);
Expand Down Expand Up @@ -444,6 +451,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
MaxVGPR += MFI->getDebuggerReservedVGPRCount();
}

// Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
// DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
// attribute was specified.
if (STM.debuggerEmitPrologue()) {
ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
ProgInfo.DebuggerPrivateSegmentBufferSGPR =
RI->getHWRegIndex(MFI->getScratchRSrcReg());
}

// We found the maximum register index. They start at 0, so add one to get the
// number of registers.
ProgInfo.NumVGPR = MaxVGPR + 1;
Expand Down Expand Up @@ -670,6 +687,9 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
if (MFI->hasDispatchPtr())
header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;

if (STM.debuggerSupported())
header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;

if (STM.isXNACKEnabled())
header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;

Expand All @@ -681,6 +701,13 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;

if (STM.debuggerEmitPrologue()) {
header.debug_wavefront_private_segment_offset_sgpr =
KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
header.debug_private_segment_buffer_sgpr =
KernelInfo.DebuggerPrivateSegmentBufferSGPR;
}

AMDGPUTargetStreamer *TS =
static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());

Expand Down
10 changes: 10 additions & 0 deletions lib/Target/AMDGPU/AMDGPUAsmPrinter.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
FlatUsed(false),
ReservedVGPRFirst(0),
ReservedVGPRCount(0),
DebuggerWavefrontPrivateSegmentOffsetSGPR((uint16_t)-1),
DebuggerPrivateSegmentBufferSGPR((uint16_t)-1),
VCCUsed(false),
CodeLen(0) {}

Expand Down Expand Up @@ -75,6 +77,14 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
// The number of consecutive VGPRs reserved.
uint16_t ReservedVGPRCount;

// Fixed SGPR number used to hold wave scratch offset for entire kernel
// execution, or uint16_t(-1) if the register is not used or not known.
uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR;
// Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
// kernel execution, or uint16_t(-1) if the register is not used or not
// known.
uint16_t DebuggerPrivateSegmentBufferSGPR;

// Bonus information for debugging.
bool VCCUsed;
uint64_t CodeLen;
Expand Down
1 change: 1 addition & 0 deletions lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
EnableXNACK(false),
DebuggerInsertNops(false),
DebuggerReserveRegs(false),
DebuggerEmitPrologue(false),

EnableVGPRSpilling(false),
EnablePromoteAlloca(false),
Expand Down
10 changes: 10 additions & 0 deletions lib/Target/AMDGPU/AMDGPUSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
bool EnableXNACK;
bool DebuggerInsertNops;
bool DebuggerReserveRegs;
bool DebuggerEmitPrologue;

// Used as options.
bool EnableVGPRSpilling;
Expand Down Expand Up @@ -402,6 +403,11 @@ class SISubtarget final : public AMDGPUSubtarget {
return EnableSIScheduler;
}

/// \returns true only when all three debugger features are enabled: nop
/// insertion, register reservation, and prologue emission.
bool debuggerSupported() const {
  // Checked in the same order as a short-circuiting conjunction would.
  if (!debuggerInsertNops())
    return false;
  if (!debuggerReserveRegs())
    return false;
  return debuggerEmitPrologue();
}

/// \returns The value of the DebuggerInsertNops subtarget flag.
bool debuggerInsertNops() const {
return DebuggerInsertNops;
}
Expand All @@ -410,6 +416,10 @@ class SISubtarget final : public AMDGPUSubtarget {
return DebuggerReserveRegs;
}

/// \returns The value of the DebuggerEmitPrologue subtarget flag, i.e. whether
/// the "amdgpu-debugger-emit-prologue" feature was requested.
bool debuggerEmitPrologue() const {
return DebuggerEmitPrologue;
}

/// \returns The value of the EnableLoadStoreOpt subtarget flag.
bool loadStoreOptEnabled() const {
return EnableLoadStoreOpt;
}
Expand Down
50 changes: 49 additions & 1 deletion lib/Target/AMDGPU/SIFrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@ static ArrayRef<MCPhysReg> getAllSGPRs() {

void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
// Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
// specified.
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
if (ST.debuggerEmitPrologue())
emitDebuggerPrologue(MF, MBB);

if (!MF.getFrameInfo()->hasStackObjects())
return;

Expand All @@ -54,7 +60,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
if (hasOnlySGPRSpills(MFI, MF.getFrameInfo()))
return;

const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
Expand Down Expand Up @@ -87,6 +92,8 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// pointer. Because we only detect if flat instructions are used at all,
// this will be used more often than necessary on VI.

// Debug location must be unknown since the first debug location is used to
// determine the end of the prologue.
DebugLoc DL;

unsigned FlatScratchInitReg
Expand Down Expand Up @@ -289,3 +296,44 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
RS->addScavengingFrameIndex(ScavengeFI);
}
}

/// Emits the debugger prologue ahead of the first instruction of \p MBB.
///
/// For each of the three dimensions, the work group ID and the work item ID
/// are stored to the stack objects whose indices are recorded in
/// SIMachineFunctionInfo.
void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
                                           MachineBasicBlock &MBB) const {
  const SISubtarget &Subtarget = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *InstrInfo = Subtarget.getInstrInfo();
  const SIRegisterInfo *RegInfo = &InstrInfo->getRegisterInfo();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // All new instructions are inserted before the block's current first
  // instruction, using a default-constructed debug location.
  MachineBasicBlock::iterator InsertPt = MBB.begin();
  DebugLoc NoLoc;

  const auto *VGPR32RC = &AMDGPU::VGPR_32RegClass;

  // Re-registers an ID register as live-in for both the function and the
  // block.
  auto MarkLiveIn = [&MRI, &MBB](unsigned Reg) {
    MRI.addLiveIn(Reg);
    MBB.addLiveIn(Reg);
  };

  for (unsigned Dim = 0; Dim != 3; ++Dim) {
    // Work group ID: arrives in an SGPR.
    unsigned GroupIDSGPR = FuncInfo->getWorkGroupIDSGPR(Dim);
    MarkLiveIn(GroupIDSGPR);

    // SGPRs are spilled into VGPRs, so the work group ID is first copied to a
    // fresh VGPR, which can then be stored to scratch.
    unsigned GroupIDCopy = MRI.createVirtualRegister(VGPR32RC);
    BuildMI(MBB, InsertPt, NoLoc, InstrInfo->get(AMDGPU::V_MOV_B32_e32),
            GroupIDCopy)
        .addReg(GroupIDSGPR);

    int GroupIDSlot = FuncInfo->getDebuggerWorkGroupIDStackObjectIndex(Dim);
    InstrInfo->storeRegToStackSlot(MBB, InsertPt, GroupIDCopy, false,
                                   GroupIDSlot, VGPR32RC, RegInfo);

    // Work item ID: already in a VGPR, so it is stored directly.
    unsigned ItemIDVGPR = FuncInfo->getWorkItemIDVGPR(Dim);
    MarkLiveIn(ItemIDVGPR);

    int ItemIDSlot = FuncInfo->getDebuggerWorkItemIDStackObjectIndex(Dim);
    InstrInfo->storeRegToStackSlot(MBB, InsertPt, ItemIDVGPR, false,
                                   ItemIDSlot, VGPR32RC, RegInfo);
  }
}
4 changes: 4 additions & 0 deletions lib/Target/AMDGPU/SIFrameLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
void processFunctionBeforeFrameFinalized(
MachineFunction &MF,
RegScavenger *RS = nullptr) const override;

private:
/// \brief Emits debugger prologue.
///
/// Inserts, at the beginning of \p MBB, stores of the x/y/z work group IDs and
/// work item IDs to the debugger stack objects tracked by
/// SIMachineFunctionInfo. emitPrologue() calls this only when the subtarget's
/// debuggerEmitPrologue() is set.
void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
};

}
Expand Down
31 changes: 31 additions & 0 deletions lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,11 @@ SDValue SITargetLowering::LowerFormalArguments(
return DAG.getEntryNode();
}

// Create stack objects that are used for emitting debugger prologue if
// "amdgpu-debugger-emit-prologue" attribute was specified.
if (ST.debuggerEmitPrologue())
createDebuggerPrologueStackObjects(MF);

SmallVector<ISD::InputArg, 16> Splits;
BitVector Skipped(Ins.size());

Expand Down Expand Up @@ -1258,6 +1263,32 @@ bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
}
}

/// Creates the fixed stack objects that the debugger prologue stores into and
/// records their indices in \p MF's SIMachineFunctionInfo.
///
/// The debugger prologue writes work group IDs and work item IDs to scratch
/// memory at a fixed location, laid out as follows:
///   offset 0:  work group ID x
///   offset 4:  work group ID y
///   offset 8:  work group ID z
///   offset 16: work item ID x
///   offset 20: work item ID y
///   offset 24: work item ID z
void SITargetLowering::createDebuggerPrologueStackObjects(
    MachineFunction &MF) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  for (unsigned Dim = 0; Dim != 3; ++Dim) {
    // The work group ID slot is created first so that object indices are
    // handed out in the same order for every dimension.
    const int GroupIDSlot =
        MF.getFrameInfo()->CreateFixedObject(4, Dim * 4, /*Immutable=*/true);
    FuncInfo->setDebuggerWorkGroupIDStackObjectIndex(Dim, GroupIDSlot);

    // The work item ID slots start at offset 16.
    const int ItemIDSlot = MF.getFrameInfo()->CreateFixedObject(
        4, Dim * 4 + 16, /*Immutable=*/true);
    FuncInfo->setDebuggerWorkItemIDStackObjectIndex(Dim, ItemIDSlot);
  }
}

/// This transforms the control flow intrinsics to get the branch destination as
/// last parameter, also switches branch target with BR if the need arise
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
Expand Down
2 changes: 2 additions & 0 deletions lib/Target/AMDGPU/SIISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;

bool isCFIntrinsic(const SDNode *Intr) const;

/// \brief Creates the fixed stack objects used by the debugger prologue (one
/// per dimension for work group IDs and for work item IDs) and records their
/// indices in \p MF's SIMachineFunctionInfo.
void createDebuggerPrologueStackObjects(MachineFunction &MF) const;
public:
SITargetLowering(const TargetMachine &tm, const SISubtarget &STI);

Expand Down
10 changes: 6 additions & 4 deletions lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
ReturnsVoid(true),
MaximumWorkGroupSize(0),
DebuggerReservedVGPRCount(0),
DebuggerWorkGroupIDStackObjectIndices{0, 0, 0},
DebuggerWorkItemIDStackObjectIndices{0, 0, 0},
LDSWaveSpillSize(0),
PSInputEna(0),
NumUserSGPRs(0),
Expand Down Expand Up @@ -92,16 +94,16 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkItemIDX = true;
}

if (F->hasFnAttribute("amdgpu-work-group-id-y"))
if (F->hasFnAttribute("amdgpu-work-group-id-y") || ST.debuggerEmitPrologue())
WorkGroupIDY = true;

if (F->hasFnAttribute("amdgpu-work-group-id-z"))
if (F->hasFnAttribute("amdgpu-work-group-id-z") || ST.debuggerEmitPrologue())
WorkGroupIDZ = true;

if (F->hasFnAttribute("amdgpu-work-item-id-y"))
if (F->hasFnAttribute("amdgpu-work-item-id-y") || ST.debuggerEmitPrologue())
WorkItemIDY = true;

if (F->hasFnAttribute("amdgpu-work-item-id-z"))
if (F->hasFnAttribute("amdgpu-work-item-id-z") || ST.debuggerEmitPrologue())
WorkItemIDZ = true;

// X, XY, and XYZ are the only supported combinations, so make sure Y is
Expand Down
60 changes: 60 additions & 0 deletions lib/Target/AMDGPU/SIMachineFunctionInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {

// Number of reserved VGPRs for debugger usage.
unsigned DebuggerReservedVGPRCount;
// Stack object indices for work group IDs.
int DebuggerWorkGroupIDStackObjectIndices[3];
// Stack object indices for work item IDs.
int DebuggerWorkItemIDStackObjectIndices[3];

public:
// FIXME: Make private
Expand Down Expand Up @@ -334,6 +338,62 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
return DebuggerReservedVGPRCount;
}

/// \returns Stack object index for \p Dim's work group ID.
///
/// \p Dim selects the dimension and must be less than 3 (0 = x, 1 = y, 2 = z).
int getDebuggerWorkGroupIDStackObjectIndex(unsigned Dim) const {
assert(Dim < 3);
return DebuggerWorkGroupIDStackObjectIndices[Dim];
}

/// \brief Sets stack object index for \p Dim's work group ID to \p ObjectIdx.
///
/// \p Dim selects the dimension and must be less than 3 (0 = x, 1 = y, 2 = z).
void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
assert(Dim < 3);
DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx;
}

/// \returns Stack object index for \p Dim's work item ID.
///
/// \p Dim selects the dimension and must be less than 3 (0 = x, 1 = y, 2 = z).
int getDebuggerWorkItemIDStackObjectIndex(unsigned Dim) const {
assert(Dim < 3);
return DebuggerWorkItemIDStackObjectIndices[Dim];
}

/// \brief Sets stack object index for \p Dim's work item ID to \p ObjectIdx.
///
/// \p Dim selects the dimension and must be less than 3 (0 = x, 1 = y, 2 = z).
void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
assert(Dim < 3);
DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx;
}

/// \returns The system SGPR that holds the work group ID for dimension
/// \p Dim (0 = x, 1 = y, 2 = z). Any other value of \p Dim is unreachable.
unsigned getWorkGroupIDSGPR(unsigned Dim) const {
  if (Dim == 0) {
    assert(hasWorkGroupIDX());
    return WorkGroupIDXSystemSGPR;
  }
  if (Dim == 1) {
    assert(hasWorkGroupIDY());
    return WorkGroupIDYSystemSGPR;
  }
  if (Dim == 2) {
    assert(hasWorkGroupIDZ());
    return WorkGroupIDZSystemSGPR;
  }
  llvm_unreachable("unexpected dimension");
}

/// \returns The VGPR used for \p Dim's work item ID: VGPR0 for x (0), VGPR1
/// for y (1), and VGPR2 for z (2). Any other value of \p Dim is unreachable.
unsigned getWorkItemIDVGPR(unsigned Dim) const {
  if (Dim == 0) {
    assert(hasWorkItemIDX());
    return AMDGPU::VGPR0;
  }
  if (Dim == 1) {
    assert(hasWorkItemIDY());
    return AMDGPU::VGPR1;
  }
  if (Dim == 2) {
    assert(hasWorkItemIDZ());
    return AMDGPU::VGPR2;
  }
  llvm_unreachable("unexpected dimension");
}

unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const;
};

Expand Down
Loading

0 comments on commit 20c7a48

Please sign in to comment.