Skip to content

Commit

Permalink
[AArch64][GlobalISel] CodeGen for Armv8.8/9.3 MOPS
Browse files Browse the repository at this point in the history
This implements codegen for Armv8.8/9.3 Memory Operations extension (MOPS).
Any memcpy/memset/memmove intrinsics will always be emitted as a series
of three consecutive instructions P, M and E which perform the
operation. The SelectionDAG implementation is split into a separate
patch.

AArch64LegalizerInfo will now consider the following generic opcodes
if +mops is available, instead of legalising by expanding them to
libcalls: G_BZERO, G_MEMCPY_INLINE, G_MEMCPY, G_MEMMOVE, G_MEMSET
The s8 value of memset is legalised to s64 to match the pseudos.

AArch64O0PreLegalizerCombinerInfo will still be able to combine
G_MEMCPY_INLINE even if +mops is present, as it is unclear whether it is
better to generate fixed length copies or MOPS instructions for the
inline code of small or zero-sized memory operations, so we choose to be
conservative for now.

AArch64InstructionSelector will select the above as new pseudo
instructions: AArch64::MOPSMemory{Copy/Move/Set/SetTagging} These are
each expanded to a series of three instructions (e.g. SETP/SETM/SETE)
which must be emitted together during code emission to avoid scheduler
reordering.

This is part 3/4 of a series of patches split from
https://reviews.llvm.org/D117405 to facilitate reviewing.

Patch by Tomas Matheson and Son Tuan Vu

Differential Revision: https://reviews.llvm.org/D117763
  • Loading branch information
tyb-arm committed Jan 31, 2022
1 parent 6ec9fd2 commit 78fd413
Show file tree
Hide file tree
Showing 10 changed files with 1,592 additions and 12 deletions.
31 changes: 21 additions & 10 deletions llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -432,16 +432,6 @@ class LegalizeRuleSet {
return TypeIdx;
}

unsigned immIdx(unsigned ImmIdx) {
assert(ImmIdx <= (MCOI::OPERAND_LAST_GENERIC_IMM -
MCOI::OPERAND_FIRST_GENERIC_IMM) &&
"Imm Index is out of bounds");
#ifndef NDEBUG
ImmIdxsCovered.set(ImmIdx);
#endif
return ImmIdx;
}

void markAllIdxsAsCovered() {
#ifndef NDEBUG
TypeIdxsCovered.set();
Expand Down Expand Up @@ -568,6 +558,16 @@ class LegalizeRuleSet {
}
unsigned getAlias() const { return AliasOf; }

/// Mark the immediate operand index \p ImmIdx as covered by a legalization
/// rule, so rule-coverage checking knows it was handled. In debug builds the
/// index is recorded in ImmIdxsCovered; it must fit within the generic
/// immediate operand range.
/// \returns \p ImmIdx unchanged, so the call can be chained inline.
unsigned immIdx(unsigned ImmIdx) {
assert(ImmIdx <= (MCOI::OPERAND_LAST_GENERIC_IMM -
MCOI::OPERAND_FIRST_GENERIC_IMM) &&
"Imm Index is out of bounds");
#ifndef NDEBUG
ImmIdxsCovered.set(ImmIdx);
#endif
return ImmIdx;
}

/// The instruction is legal if predicate is true.
LegalizeRuleSet &legalIf(LegalityPredicate Predicate) {
// We have no choice but conservatively assume that the free-form
Expand Down Expand Up @@ -824,11 +824,22 @@ class LegalizeRuleSet {
LegalizeRuleSet &customForCartesianProduct(std::initializer_list<LLT> Types) {
return actionForCartesianProduct(LegalizeAction::Custom, Types);
}
/// The instruction is custom when type indexes 0 and 1 are both in their
/// respective lists.
LegalizeRuleSet &
customForCartesianProduct(std::initializer_list<LLT> Types0,
std::initializer_list<LLT> Types1) {
return actionForCartesianProduct(LegalizeAction::Custom, Types0, Types1);
}
/// The instruction is custom when type indexes 0, 1, and 2 are all in
/// their respective lists.
LegalizeRuleSet &
customForCartesianProduct(std::initializer_list<LLT> Types0,
std::initializer_list<LLT> Types1,
std::initializer_list<LLT> Types2) {
return actionForCartesianProduct(LegalizeAction::Custom, Types0, Types1,
Types2);
}

/// Unconditionally custom lower.
LegalizeRuleSet &custom() {
Expand Down
46 changes: 46 additions & 0 deletions llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ class AArch64AsmPrinter : public AsmPrinter {

void LowerJumpTableDest(MCStreamer &OutStreamer, const MachineInstr &MI);

void LowerMOPS(MCStreamer &OutStreamer, const MachineInstr &MI);

void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
const MachineInstr &MI);
void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
Expand Down Expand Up @@ -936,6 +938,43 @@ void AArch64AsmPrinter::LowerJumpTableDest(llvm::MCStreamer &OutStreamer,
.addImm(Size == 4 ? 0 : 2));
}

/// Lower one of the MOPSMemory*Pseudo instructions into its sequence of three
/// consecutive real instructions (P, M and E — e.g. SETP/SETM/SETE), emitted
/// back-to-back so nothing can be scheduled between them.
void AArch64AsmPrinter::LowerMOPS(llvm::MCStreamer &OutStreamer,
                                  const llvm::MachineInstr &MI) {
  const unsigned PseudoOpc = MI.getOpcode();
  assert(STI->hasMOPS());
  assert(STI->hasMTE() || PseudoOpc != AArch64::MOPSMemorySetTaggingPseudo);

  // Map the pseudo onto its prologue/main/epilogue real opcodes.
  std::array<unsigned, 3> RealOpcodes;
  switch (PseudoOpc) {
  case AArch64::MOPSMemoryCopyPseudo:
    RealOpcodes = {AArch64::CPYFP, AArch64::CPYFM, AArch64::CPYFE};
    break;
  case AArch64::MOPSMemoryMovePseudo:
    RealOpcodes = {AArch64::CPYP, AArch64::CPYM, AArch64::CPYE};
    break;
  case AArch64::MOPSMemorySetPseudo:
    RealOpcodes = {AArch64::SETP, AArch64::SETM, AArch64::SETE};
    break;
  case AArch64::MOPSMemorySetTaggingPseudo:
    RealOpcodes = {AArch64::SETGP, AArch64::SETGM, AArch64::MOPSSETGE};
    break;
  default:
    llvm_unreachable("Unhandled memory operation pseudo");
  }

  // SET variants have one destination register fewer than CPY variants.
  const bool IsSet = PseudoOpc == AArch64::MOPSMemorySetPseudo ||
                     PseudoOpc == AArch64::MOPSMemorySetTaggingPseudo;

  for (const unsigned RealOpc : RealOpcodes) {
    unsigned OpIdx = 0;
    MCInstBuilder Builder(RealOpc);
    // Destination registers.
    Builder.addReg(MI.getOperand(OpIdx++).getReg());
    Builder.addReg(MI.getOperand(OpIdx++).getReg());
    if (!IsSet)
      Builder.addReg(MI.getOperand(OpIdx++).getReg());
    // Input registers.
    Builder.addReg(MI.getOperand(OpIdx++).getReg());
    Builder.addReg(MI.getOperand(OpIdx++).getReg());
    Builder.addReg(MI.getOperand(OpIdx++).getReg());

    EmitToStreamer(OutStreamer, Builder);
  }
}

void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
const MachineInstr &MI) {
unsigned NumNOPBytes = StackMapOpers(&MI).getNumPatchBytes();
Expand Down Expand Up @@ -1363,6 +1402,13 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
emitFMov0(*MI);
return;

case AArch64::MOPSMemoryCopyPseudo:
case AArch64::MOPSMemoryMovePseudo:
case AArch64::MOPSMemorySetPseudo:
case AArch64::MOPSMemorySetTaggingPseudo:
LowerMOPS(*OutStreamer, *MI);
return;

case TargetOpcode::STACKMAP:
return LowerSTACKMAP(*OutStreamer, SM, *MI);

Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,10 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
case AArch64::JumpTableDest32:
case AArch64::JumpTableDest16:
case AArch64::JumpTableDest8:
case AArch64::MOPSMemoryCopyPseudo:
case AArch64::MOPSMemoryMovePseudo:
case AArch64::MOPSMemorySetPseudo:
case AArch64::MOPSMemorySetTaggingPseudo:
NumBytes = 12;
break;
case AArch64::SPACE:
Expand Down
21 changes: 21 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -8362,6 +8362,27 @@ let Predicates = [HasMOPS, HasMTE] in {
}
}

// MOPS memory-operation pseudos. Code emission expands each one into three
// consecutive real instructions (P, M and E), hence Size = 12 (3 x 4 bytes);
// those instructions clobber NZCV. The $*_wb outputs are tied to the
// corresponding inputs because the underlying instructions update their
// register operands.
let Predicates = [HasMOPS], Defs = [NZCV], Size = 12, mayStore = 1 in {
// Copy/move read from the source buffer as well as store to the destination.
let mayLoad = 1 in {
def MOPSMemoryCopyPseudo : Pseudo<(outs GPR64common:$Rd_wb, GPR64common:$Rs_wb, GPR64:$Rn_wb),
(ins GPR64common:$Rd, GPR64common:$Rs, GPR64:$Rn),
[], "$Rd = $Rd_wb,$Rs = $Rs_wb,$Rn = $Rn_wb">, Sched<[]>;
def MOPSMemoryMovePseudo : Pseudo<(outs GPR64common:$Rd_wb, GPR64common:$Rs_wb, GPR64:$Rn_wb),
(ins GPR64common:$Rd, GPR64common:$Rs, GPR64:$Rn),
[], "$Rd = $Rd_wb,$Rs = $Rs_wb,$Rn = $Rn_wb">, Sched<[]>;
}
// Set only writes memory; the $Rm value operand is not updated.
let mayLoad = 0 in {
def MOPSMemorySetPseudo : Pseudo<(outs GPR64common:$Rd_wb, GPR64:$Rn_wb),
(ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm),
[], "$Rd = $Rd_wb,$Rn = $Rn_wb">, Sched<[]>;
}
}
// The tagging variant additionally requires MTE.
let Predicates = [HasMOPS, HasMTE], Defs = [NZCV], Size = 12, mayLoad = 0, mayStore = 1 in {
def MOPSMemorySetTaggingPseudo : Pseudo<(outs GPR64common:$Rd_wb, GPR64:$Rn_wb),
(ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm),
[], "$Rd = $Rd_wb,$Rn = $Rn_wb">, Sched<[]>;
}

let Defs = [X16, X17], mayStore = 1, isCodeGenOnly = 1 in
def StoreSwiftAsyncContext
: Pseudo<(outs), (ins GPR64:$ctx, GPR64sp:$base, simm9:$offset),
Expand Down
95 changes: 95 additions & 0 deletions llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ class AArch64InstructionSelector : public InstructionSelector {
bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);

unsigned emitConstantPoolEntry(const Constant *CPVal,
Expand Down Expand Up @@ -3424,6 +3425,12 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_VECREDUCE_FADD:
case TargetOpcode::G_VECREDUCE_ADD:
return selectReduction(I, MRI);
case TargetOpcode::G_MEMCPY:
case TargetOpcode::G_MEMCPY_INLINE:
case TargetOpcode::G_MEMMOVE:
case TargetOpcode::G_MEMSET:
assert(STI.hasMOPS() && "Shouldn't get here without +mops feature");
return selectMOPS(I, MRI);
}

return false;
Expand Down Expand Up @@ -3481,6 +3488,64 @@ bool AArch64InstructionSelector::selectReduction(MachineInstr &I,
return false;
}

/// Select G_MEMCPY / G_MEMCPY_INLINE / G_MEMMOVE / G_MEMSET into the
/// corresponding MOPSMemory*Pseudo.
///
/// The pseudos update their register operands, so fresh copies of the
/// incoming vregs are made first and constrained to the register classes the
/// pseudo expects. The pseudo's defs have no counterpart in the G_MEM*
/// opcodes and are left unused, but they remain tied to the uses.
bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI,
                                            MachineRegisterInfo &MRI) {
  unsigned Mopcode;
  switch (GI.getOpcode()) {
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMCPY_INLINE:
    Mopcode = AArch64::MOPSMemoryCopyPseudo;
    break;
  case TargetOpcode::G_MEMMOVE:
    Mopcode = AArch64::MOPSMemoryMovePseudo;
    break;
  case TargetOpcode::G_MEMSET:
    // For tagged memset see llvm.aarch64.mops.memset.tag
    Mopcode = AArch64::MOPSMemorySetPseudo;
    break;
  default:
    // Guard against Mopcode being read uninitialized if a new opcode is ever
    // routed here without being handled above.
    llvm_unreachable("Unexpected opcode in selectMOPS");
  }

  auto &DstPtr = GI.getOperand(0);
  auto &SrcOrVal = GI.getOperand(1);
  auto &Size = GI.getOperand(2);

  // Create copies of the registers that can be clobbered.
  const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg());
  const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg());
  const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg());

  // For set, operand 1 is the s64 value; for copy/move it is the source
  // pointer, which must be a GPR64common register.
  const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo;
  const auto &SrcValRegClass =
      IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass;

  // Constrain to specific registers
  RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI);
  RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI);
  RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI);

  MIB.buildCopy(DstPtrCopy, DstPtr);
  MIB.buildCopy(SrcValCopy, SrcOrVal);
  MIB.buildCopy(SizeCopy, Size);

  // New instruction uses the copied registers because it must update them.
  // The defs are not used since they don't exist in G_MEM*. They are still
  // tied.
  // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE
  Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
  Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
  if (IsSet) {
    MIB.buildInstr(Mopcode, {DefDstPtr, DefSize},
                   {DstPtrCopy, SizeCopy, SrcValCopy});
  } else {
    Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass);
    MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize},
                   {DstPtrCopy, SrcValCopy, SizeCopy});
  }

  GI.eraseFromParent();
  return true;
}

bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
Expand Down Expand Up @@ -5375,6 +5440,36 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
break;
}
case Intrinsic::aarch64_mops_memset_tag: {
// Transform
// %dst:gpr(p0) = \
// G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag),
// \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
// where %dst is updated, into
// %Rd:GPR64common, %Rn:GPR64) = \
// MOPSMemorySetTaggingPseudo \
// %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
// where Rd and Rn are tied.
// It is expected that %val has been extended to s64 in legalization.
// Note that the order of the size/value operands are swapped.

Register DstDef = I.getOperand(0).getReg();
// I.getOperand(1) is the intrinsic function
Register DstUse = I.getOperand(2).getReg();
Register ValUse = I.getOperand(3).getReg();
Register SizeUse = I.getOperand(4).getReg();

// MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
// Therefore an additional virtual register is required for the updated size
// operand. This value is not accessible via the semantics of the intrinsic.
Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64));

auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo,
{DstDef, SizeDef}, {DstUse, SizeUse, ValUse});
Memset.cloneMemRefs(I);
constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI);
break;
}
}

I.eraseFromParent();
Expand Down
55 changes: 53 additions & 2 deletions llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -699,8 +699,28 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)

getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower();

getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
.libcall();
if (ST.hasMOPS()) {
// G_BZERO is not supported. Currently it is only emitted by
// PreLegalizerCombiner for G_MEMSET with zero constant.
getActionDefinitionsBuilder(G_BZERO).unsupported();

getActionDefinitionsBuilder(G_MEMSET)
.legalForCartesianProduct({p0}, {s64}, {s64})
.customForCartesianProduct({p0}, {s8}, {s64})
.immIdx(0); // Inform verifier imm idx 0 is handled.

getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE})
.legalForCartesianProduct({p0}, {p0}, {s64})
.immIdx(0); // Inform verifier imm idx 0 is handled.

// G_MEMCPY_INLINE does not have a tailcall immediate
getActionDefinitionsBuilder(G_MEMCPY_INLINE)
.legalForCartesianProduct({p0}, {p0}, {s64});

} else {
getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
.libcall();
}

// FIXME: Legal types are only legal with NEON.
getActionDefinitionsBuilder(G_ABS)
Expand Down Expand Up @@ -832,6 +852,11 @@ bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
return legalizeAtomicCmpxchg128(MI, MRI, Helper);
case TargetOpcode::G_CTTZ:
return legalizeCTTZ(MI, Helper);
case TargetOpcode::G_BZERO:
case TargetOpcode::G_MEMCPY:
case TargetOpcode::G_MEMMOVE:
case TargetOpcode::G_MEMSET:
return legalizeMemOps(MI, Helper);
}

llvm_unreachable("expected switch to return");
Expand Down Expand Up @@ -989,6 +1014,15 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MI.eraseFromParent();
return true;
}
case Intrinsic::aarch64_mops_memset_tag: {
assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
// Any-extend the value to 64 bits (only the low byte is meaningful)
MachineIRBuilder MIB(MI);
auto &Value = MI.getOperand(3);
Register ZExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
Value.setReg(ZExtValueReg);
return true;
}
}

return true;
Expand Down Expand Up @@ -1359,3 +1393,20 @@ bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
MI.eraseFromParent();
return true;
}

/// Custom-legalize the G_MEM* opcodes routed here by the MOPS rules.
///
/// Only G_MEMSET needs work: its value operand is widened to s64 (via
/// any-extend, since only the low byte is meaningful) to match the MOPS set
/// pseudo. The tagged variant (llvm.aarch64.mops.memset.tag) is legalized in
/// legalizeIntrinsic instead.
/// \returns true if the instruction was legalized, false otherwise.
bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
                                          LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;

  if (MI.getOpcode() != TargetOpcode::G_MEMSET)
    return false;

  // Widen the value operand to 64 bits in place.
  auto &ValOp = MI.getOperand(1);
  const Register WideValReg =
      MIRBuilder.buildAnyExt(LLT::scalar(64), ValOp).getReg(0);
  ValOp.setReg(WideValReg);
  return true;
}
1 change: 1 addition & 0 deletions llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ class AArch64LegalizerInfo : public LegalizerInfo {
bool legalizeAtomicCmpxchg128(MachineInstr &MI, MachineRegisterInfo &MRI,
LegalizerHelper &Helper) const;
bool legalizeCTTZ(MachineInstr &MI, LegalizerHelper &Helper) const;
bool legalizeMemOps(MachineInstr &MI, LegalizerHelper &Helper) const;
const AArch64Subtarget *ST;
};
} // End llvm namespace.
Expand Down
Loading

0 comments on commit 78fd413

Please sign in to comment.