Skip to content

Commit

Permalink
AMDGPU/SI: Handle hazard with > 8 byte VMEM stores
Browse files Browse the repository at this point in the history
Reviewers: arsenm

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, tony-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D25577

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@285359 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
tstellarAMD committed Oct 27, 2016
1 parent f95be0d commit 5480a24
Show file tree
Hide file tree
Showing 7 changed files with 201 additions and 24 deletions.
4 changes: 4 additions & 0 deletions lib/Target/AMDGPU/AMDGPUSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -556,6 +556,10 @@ class SISubtarget final : public AMDGPUSubtarget {
return SGPRInitBug;
}

/// \returns true for every generation except SOUTHERN_ISLANDS, i.e. for the
/// subtargets on which the hazard recognizer checks the VMEM store-data
/// hazard.
bool has12DWordStoreHazard() const {
  const bool IsSouthernIslands =
      getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS;
  return !IsSouthernIslands;
}

unsigned getKernArgSegmentSize(unsigned ExplictArgBytes) const;

/// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
Expand Down
1 change: 1 addition & 0 deletions lib/Target/AMDGPU/BUFInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1164,6 +1164,7 @@ defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomic_si <0x5a>;
defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_si <0x5b>;
defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_si <0x5c>;
defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_si <0x5d>;
// FIXME: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on CI.
//defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomic_si <0x5e>; // isn't on VI
//defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomic_si <0x5f>; // isn't on VI
//defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomic_si <0x60>; // isn't on VI
Expand Down
127 changes: 107 additions & 20 deletions lib/Target/AMDGPU/GCNHazardRecognizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
return NoopHazard;

if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
return NoopHazard;

if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
return NoopHazard;

Expand All @@ -90,14 +93,20 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
if (SIInstrInfo::isSMRD(*MI))
return std::max(0, checkSMRDHazards(MI));

if (SIInstrInfo::isVMEM(*MI))
return std::max(0, checkVMEMHazards(MI));
if (SIInstrInfo::isVALU(*MI)) {
int WaitStates = std::max(0, checkVALUHazards(MI));

if (SIInstrInfo::isDPP(*MI))
return std::max(0, checkDPPHazards(MI));
if (SIInstrInfo::isVMEM(*MI))
WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

if (isDivFMas(MI->getOpcode()))
return std::max(0, checkDivFMasHazards(MI));
if (SIInstrInfo::isDPP(*MI))
WaitStates = std::max(WaitStates, checkDPPHazards(MI));

if (isDivFMas(MI->getOpcode()))
WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

return WaitStates;
}

if (isSGetReg(MI->getOpcode()))
return std::max(0, checkGetRegHazards(MI));
Expand Down Expand Up @@ -149,32 +158,38 @@ void GCNHazardRecognizer::RecedeCycle() {
// Helper Functions
//===----------------------------------------------------------------------===//

int GCNHazardRecognizer::getWaitStatesSinceDef(
unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
const SIRegisterInfo *TRI = ST.getRegisterInfo();
/// Walk EmittedInstrs in iteration order, counting one wait state per entry
/// (null entries are counted too), and return the zero-based count at the
/// first non-null entry accepted by \p IsHazard. Returns
/// std::numeric_limits<int>::max() when no entry matches.
// NOTE(review): this listing is taken from a rendered diff, so the removed
// IsHazardDef / modifiesRegister lines appear next to the lines that replaced
// them; the description above covers the IsHazard-based version.
int GCNHazardRecognizer::getWaitStatesSince(
function_ref<bool(MachineInstr *)> IsHazard) {

// Starts at -1 so that the first entry visited yields 0.
int WaitStates = -1;
for (MachineInstr *MI : EmittedInstrs) {
++WaitStates;
if (!MI || !IsHazardDef(MI))
if (!MI || !IsHazard(MI))
continue;
if (MI->modifiesRegister(Reg, TRI))
return WaitStates;
return WaitStates;
}
return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(
unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
const SIRegisterInfo *TRI = ST.getRegisterInfo();

auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
};

return getWaitStatesSince(IsHazardFn);
}

/// Number of wait states since the closest emitted instruction whose opcode
/// satisfies isSSetReg() and which is also accepted by \p IsHazard.
// NOTE(review): this listing is taken from a rendered diff; the hand-written
// loop below is the removed implementation and the lambda forwarding to
// getWaitStatesSince() is its replacement.
int GCNHazardRecognizer::getWaitStatesSinceSetReg(
function_ref<bool(MachineInstr *)> IsHazard) {

int WaitStates = -1;
for (MachineInstr *MI : EmittedInstrs) {
++WaitStates;
if (!MI || !isSSetReg(MI->getOpcode()) || !IsHazard(MI))
continue;
return WaitStates;
}
return std::numeric_limits<int>::max();
// Combine the opcode test with the caller's predicate into one filter.
auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
return isSSetReg(MI->getOpcode()) && IsHazard(MI);
};

return getWaitStatesSince(IsHazardFn);
}

//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -350,3 +365,75 @@ int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
return SetRegWaitStates - WaitStatesNeeded;
}

/// Decide whether \p MI is a store whose data register can be overwritten by
/// a VALU instruction issued too soon after it.
///
/// \returns the operand index of the store-data register when \p MI creates
/// the hazard, or -1 when it does not.
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    // Without a vdata operand there is no data register to protect. Bail out
    // here instead of querying the width of the -1 sentinel class ID and
    // possibly handing -1 back as if it were a hazard operand index.
    if (VDataIdx == -1)
      return -1;

    int VDataRCID = Desc.OpInfo[VDataIdx].RegClass;

    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for
  // them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
    // SRsrcIdx is only read by the assert; keep NDEBUG builds warning-free.
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::data);
    // Only index OpInfo when the named operand actually exists.
    if (DataIdx != -1 &&
        AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

/// Compute how many wait states must precede \p VALU so that it does not
/// overwrite the store data of an earlier VMEM instruction that stores more
/// than 8 bytes.
///
/// \returns 0 on subtargets without the hazard; otherwise the largest
/// per-def requirement, never less than 0.
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  if (!ST.has12DWordStoreHazard())
    return 0;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = VALU->getParent()->getParent()->getRegInfo();

  // One wait state is required between the store and the overwriting VALU.
  const int VALUWaitStates = 1;
  int MaxWaitStates = 0;

  for (const MachineOperand &Def : VALU->defs()) {
    unsigned DefReg = Def.getReg();
    if (TRI->isVGPR(MRI, DefReg)) {
      // Matches an earlier hazard-creating store whose data register
      // overlaps the VGPR written by this def.
      auto StoresDefReg = [this, DefReg, TRI](MachineInstr *MI) {
        int DataIdx = createsVALUHazard(*MI);
        if (DataIdx < 0)
          return false;
        return TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), DefReg);
      };

      int NeededForDef = VALUWaitStates - getWaitStatesSince(StoresDefReg);
      if (NeededForDef > MaxWaitStates)
        MaxWaitStates = NeededForDef;
    }
  }
  return MaxWaitStates;
}
3 changes: 3 additions & 0 deletions lib/Target/AMDGPU/GCNHazardRecognizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
const MachineFunction &MF;
const SISubtarget &ST;

int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard);
int getWaitStatesSinceDef(unsigned Reg,
function_ref<bool(MachineInstr *)> IsHazardDef =
[](MachineInstr *) { return true; });
Expand All @@ -47,6 +48,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
int checkDivFMasHazards(MachineInstr *DivFMas);
int checkGetRegHazards(MachineInstr *GetRegInstr);
int checkSetRegHazards(MachineInstr *SetRegInstr);
int createsVALUHazard(const MachineInstr &MI);
int checkVALUHazards(MachineInstr *VALU);
public:
GCNHazardRecognizer(const MachineFunction &MF);
// We can only issue one instruction per cycle.
Expand Down
8 changes: 6 additions & 2 deletions lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -352,8 +352,8 @@ bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {

// Avoid using MCRegisterClass::getSize, since that function will go away
// (move from MC* level to Target* level). Return size in bits.
unsigned getRegBitWidth(const MCRegisterClass &RC) {
switch (RC.getID()) {
unsigned getRegBitWidth(unsigned RCID) {
switch (RCID) {
case AMDGPU::SGPR_32RegClassID:
case AMDGPU::VGPR_32RegClassID:
case AMDGPU::VS_32RegClassID:
Expand Down Expand Up @@ -382,6 +382,10 @@ unsigned getRegBitWidth(const MCRegisterClass &RC) {
}
}

// Convenience overload: look up the width through the class-ID based
// getRegBitWidth() using the ID of \p RC.
unsigned getRegBitWidth(const MCRegisterClass &RC) {
  const unsigned ClassID = RC.getID();
  return getRegBitWidth(ClassID);
}

unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
unsigned OpNo) {
unsigned RCID = Desc.OpInfo[OpNo].RegClass;
Expand Down
3 changes: 3 additions & 0 deletions lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,9 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo);
/// \brief Does this operand support only inlinable literals?
bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo);

/// \brief Get the size in bits of a register from the register class with ID \p RCID.
unsigned getRegBitWidth(unsigned RCID);

/// \brief Get the size in bits of a register from the register class \p RC.
unsigned getRegBitWidth(const MCRegisterClass &RC);

Expand Down
79 changes: 77 additions & 2 deletions test/CodeGen/MIR/AMDGPU/inserted-wait-states.mir
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN
# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN
# RUN: llc -march=amdgcn -mcpu=fiji -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,VI
# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI
# RUN: llc -march=amdgcn -mcpu=fiji -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI

--- |
define void @div_fmas() { ret void }
define void @s_getreg() { ret void }
define void @s_setreg() { ret void }
define void @vmem_gt_8dw_store() { ret void }
...
---
# GCN-LABEL: name: div_fmas
Expand Down Expand Up @@ -159,3 +160,77 @@ body: |
S_SETREG_B32 %sgpr1, 0
S_ENDPGM
...

...
---
# GCN-LABEL: name: vmem_gt_8dw_store

# GCN-LABEL: bb.0:
# GCN: BUFFER_STORE_DWORD_OFFSET
# GCN-NEXT: V_MOV_B32
# GCN: BUFFER_STORE_DWORDX3_OFFSET
# CIVI: S_NOP
# GCN-NEXT: V_MOV_B32
# GCN: BUFFER_STORE_DWORDX4_OFFSET
# GCN-NEXT: V_MOV_B32
# GCN: BUFFER_STORE_DWORDX4_OFFSET
# CIVI: S_NOP
# GCN-NEXT: V_MOV_B32
# GCN: BUFFER_STORE_FORMAT_XYZ_OFFSET
# CIVI: S_NOP
# GCN-NEXT: V_MOV_B32
# GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET
# CIVI: S_NOP
# GCN-NEXT: V_MOV_B32

# GCN-LABEL: bb.1:
# GCN: FLAT_STORE_DWORDX2
# GCN-NEXT: V_MOV_B32
# GCN: FLAT_STORE_DWORDX3
# CIVI: S_NOP
# GCN-NEXT: V_MOV_B32
# GCN: FLAT_STORE_DWORDX4
# CIVI: S_NOP
# GCN-NEXT: V_MOV_B32
# GCN: FLAT_ATOMIC_CMPSWAP_X2
# CIVI: S_NOP
# GCN-NEXT: V_MOV_B32
# GCN: FLAT_ATOMIC_FCMPSWAP_X2
# CIVI: S_NOP
# GCN: V_MOV_B32

name: vmem_gt_8dw_store

body: |
bb.0:
successors: %bb.1
BUFFER_STORE_DWORD_OFFSET %vgpr3, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
BUFFER_STORE_DWORDX3_OFFSET %vgpr2_vgpr3_vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
BUFFER_STORE_DWORDX4_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
BUFFER_STORE_DWORDX4_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
BUFFER_STORE_FORMAT_XYZ_OFFSET %vgpr2_vgpr3_vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
BUFFER_STORE_FORMAT_XYZW_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
BUFFER_ATOMIC_CMPSWAP_X2_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit %exec
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
S_BRANCH %bb.1
bb.1:
FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
FLAT_STORE_DWORDX4 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
FLAT_ATOMIC_CMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
FLAT_ATOMIC_FCMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
S_ENDPGM
...

0 comments on commit 5480a24

Please sign in to comment.