From 528943f1535b925ce175afb2438cec79513cfc2b Mon Sep 17 00:00:00 2001 From: Dinar Temirbulatov Date: Tue, 9 Apr 2024 17:27:46 +0100 Subject: [PATCH] [AArch64][SME] Allow memory operations lowering to custom SME functions. (#79263) This change allows memcpy, memset, and memmove to be lowered to the custom SME versions provided by LibRT. --- .../AArch64/AArch64SelectionDAGInfo.cpp | 87 +++++- .../Target/AArch64/AArch64SelectionDAGInfo.h | 5 + .../AArch64/Utils/AArch64SMEAttributes.cpp | 3 + .../streaming-compatible-memory-ops.ll | 289 ++++++++++++++++++ 4 files changed, 380 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 9e43f206efcf78..19ef6f4fb32e74 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -15,6 +15,12 @@ using namespace llvm; #define DEBUG_TYPE "aarch64-selectiondag-info" +static cl::opt<bool> + LowerToSMERoutines("aarch64-lower-to-sme-routines", cl::Hidden, + cl::desc("Enable AArch64 SME memory operations " + "to lower to librt functions"), + cl::init(true)); + SDValue AArch64SelectionDAGInfo::EmitMOPS(AArch64ISD::NodeType SDOpcode, SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, @@ -76,15 +82,79 @@ SDValue AArch64SelectionDAGInfo::EmitMOPS(AArch64ISD::NodeType SDOpcode, } } +SDValue AArch64SelectionDAGInfo::EmitStreamingCompatibleMemLibCall( + SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, RTLIB::Libcall LC) const { + const AArch64Subtarget &STI = + DAG.getMachineFunction().getSubtarget<AArch64Subtarget>(); + const AArch64TargetLowering *TLI = STI.getTargetLowering(); + SDValue Symbol; + TargetLowering::ArgListEntry DstEntry; + DstEntry.Ty = PointerType::getUnqual(*DAG.getContext()); + DstEntry.Node = Dst; + TargetLowering::ArgListTy Args; + Args.push_back(DstEntry); + 
EVT PointerVT = TLI->getPointerTy(DAG.getDataLayout()); + + switch (LC) { + case RTLIB::MEMCPY: { + TargetLowering::ArgListEntry Entry; + Entry.Ty = PointerType::getUnqual(*DAG.getContext()); + Symbol = DAG.getExternalSymbol("__arm_sc_memcpy", PointerVT); + Entry.Node = Src; + Args.push_back(Entry); + break; + } + case RTLIB::MEMMOVE: { + TargetLowering::ArgListEntry Entry; + Entry.Ty = PointerType::getUnqual(*DAG.getContext()); + Symbol = DAG.getExternalSymbol("__arm_sc_memmove", PointerVT); + Entry.Node = Src; + Args.push_back(Entry); + break; + } + case RTLIB::MEMSET: { + TargetLowering::ArgListEntry Entry; + Entry.Ty = Type::getInt32Ty(*DAG.getContext()); + Symbol = DAG.getExternalSymbol("__arm_sc_memset", PointerVT); + Src = DAG.getZExtOrTrunc(Src, DL, MVT::i32); + Entry.Node = Src; + Args.push_back(Entry); + break; + } + default: + return SDValue(); + } + + TargetLowering::ArgListEntry SizeEntry; + SizeEntry.Node = Size; + SizeEntry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + Args.push_back(SizeEntry); + assert(Symbol->getOpcode() == ISD::ExternalSymbol && + "Function name is not set"); + + TargetLowering::CallLoweringInfo CLI(DAG); + PointerType *RetTy = PointerType::getUnqual(*DAG.getContext()); + CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( + TLI->getLibcallCallingConv(LC), RetTy, Symbol, std::move(Args)); + return TLI->LowerCallTo(CLI).second; +} + SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemcpy( SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { const AArch64Subtarget &STI = DAG.getMachineFunction().getSubtarget<AArch64Subtarget>(); + if (STI.hasMOPS()) return EmitMOPS(AArch64ISD::MOPS_MEMCOPY, DAG, DL, Chain, Dst, Src, Size, Alignment, isVolatile, DstPtrInfo, SrcPtrInfo); + + SMEAttrs Attrs(DAG.getMachineFunction().getFunction()); + if (LowerToSMERoutines && 
!Attrs.hasNonStreamingInterfaceAndBody()) + return EmitStreamingCompatibleMemLibCall(DAG, DL, Chain, Dst, Src, Size, + RTLIB::MEMCPY); return SDValue(); } @@ -95,10 +165,14 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( const AArch64Subtarget &STI = DAG.getMachineFunction().getSubtarget<AArch64Subtarget>(); - if (STI.hasMOPS()) { + if (STI.hasMOPS()) return EmitMOPS(AArch64ISD::MOPS_MEMSET, DAG, dl, Chain, Dst, Src, Size, Alignment, isVolatile, DstPtrInfo, MachinePointerInfo{}); - } + + SMEAttrs Attrs(DAG.getMachineFunction().getFunction()); + if (LowerToSMERoutines && !Attrs.hasNonStreamingInterfaceAndBody()) + return EmitStreamingCompatibleMemLibCall(DAG, dl, Chain, Dst, Src, Size, + RTLIB::MEMSET); return SDValue(); } @@ -108,10 +182,15 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove( MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { const AArch64Subtarget &STI = DAG.getMachineFunction().getSubtarget<AArch64Subtarget>(); - if (STI.hasMOPS()) { + + if (STI.hasMOPS()) return EmitMOPS(AArch64ISD::MOPS_MEMMOVE, DAG, dl, Chain, Dst, Src, Size, Alignment, isVolatile, DstPtrInfo, SrcPtrInfo); - } + + SMEAttrs Attrs(DAG.getMachineFunction().getFunction()); + if (LowerToSMERoutines && !Attrs.hasNonStreamingInterfaceAndBody()) + return EmitStreamingCompatibleMemLibCall(DAG, dl, Chain, Dst, Src, Size, + RTLIB::MEMMOVE); return SDValue(); } diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h index 73f93724d6fc73..514de44778630e 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -47,6 +47,11 @@ class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo { SDValue Chain, SDValue Op1, SDValue Op2, MachinePointerInfo DstPtrInfo, bool ZeroData) const override; + + SDValue EmitStreamingCompatibleMemLibCall(SelectionDAG &DAG, const SDLoc &DL, + SDValue Chain, SDValue Dst, + SDValue Src, SDValue Size, + RTLIB::Libcall LC) const; }; } 
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp index d399e0ac0794f6..015ca4cb92b25e 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp @@ -53,6 +53,9 @@ SMEAttrs::SMEAttrs(StringRef FuncName) : Bitmask(0) { if (FuncName == "__arm_tpidr2_restore") Bitmask |= SMEAttrs::SM_Compatible | encodeZAState(StateValue::In) | SMEAttrs::SME_ABI_Routine; + if (FuncName == "__arm_sc_memcpy" || FuncName == "__arm_sc_memset" || + FuncName == "__arm_sc_memmove" || FuncName == "__arm_sc_memchr") + Bitmask |= SMEAttrs::SM_Compatible; } SMEAttrs::SMEAttrs(const AttributeList &Attrs) { diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll new file mode 100644 index 00000000000000..c39894c27d9d4d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll @@ -0,0 +1,289 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs -aarch64-lower-to-sme-routines=false < %s | FileCheck %s -check-prefixes=CHECK-NO-SME-ROUTINES +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+mops -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK-MOPS + +@dst = global [512 x i8] zeroinitializer, align 1 +@src = global [512 x i8] zeroinitializer, align 1 + +define void @se_memcpy(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind { +; CHECK-LABEL: se_memcpy: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: adrp x0, :got:dst +; CHECK-NEXT: adrp x1, :got:src +; CHECK-NEXT: ldr x0, [x0, :got_lo12:dst] +; CHECK-NEXT: ldr x1, [x1, :got_lo12:src] +; CHECK-NEXT: bl __arm_sc_memcpy +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; +; CHECK-NO-SME-ROUTINES-LABEL: se_memcpy: +; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry +; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0 +; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst +; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src +; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst] +; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src] +; CHECK-NO-SME-ROUTINES-NEXT: smstop sm +; CHECK-NO-SME-ROUTINES-NEXT: bl memcpy +; CHECK-NO-SME-ROUTINES-NEXT: smstart sm +; CHECK-NO-SME-ROUTINES-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ret +; +; CHECK-MOPS-LABEL: se_memcpy: +; CHECK-MOPS: // %bb.0: // %entry +; CHECK-MOPS-NEXT: adrp x8, :got:src +; CHECK-MOPS-NEXT: adrp x9, :got:dst +; CHECK-MOPS-NEXT: ldr x8, [x8, :got_lo12:src] +; CHECK-MOPS-NEXT: ldr x9, [x9, :got_lo12:dst] +; CHECK-MOPS-NEXT: cpyfp [x9]!, [x8]!, x0! +; CHECK-MOPS-NEXT: cpyfm [x9]!, [x8]!, x0! +; CHECK-MOPS-NEXT: cpyfe [x9]!, [x8]!, x0! 
+; CHECK-MOPS-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false) + ret void +} + +define void @se_memset(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind { +; CHECK-LABEL: se_memset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: adrp x0, :got:dst +; CHECK-NEXT: mov w1, #2 // =0x2 +; CHECK-NEXT: ldr x0, [x0, :got_lo12:dst] +; CHECK-NEXT: bl __arm_sc_memset +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; +; CHECK-NO-SME-ROUTINES-LABEL: se_memset: +; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry +; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0 +; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst +; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst] +; CHECK-NO-SME-ROUTINES-NEXT: smstop sm +; CHECK-NO-SME-ROUTINES-NEXT: mov w1, #2 // =0x2 +; CHECK-NO-SME-ROUTINES-NEXT: bl memset +; CHECK-NO-SME-ROUTINES-NEXT: smstart sm +; CHECK-NO-SME-ROUTINES-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ret +; +; CHECK-MOPS-LABEL: se_memset: +; CHECK-MOPS: // %bb.0: // %entry +; CHECK-MOPS-NEXT: adrp x8, :got:dst +; CHECK-MOPS-NEXT: mov w9, #2 // =0x2 +; CHECK-MOPS-NEXT: ldr x8, 
[x8, :got_lo12:dst] +; CHECK-MOPS-NEXT: setp [x8]!, x0!, x9 +; CHECK-MOPS-NEXT: setm [x8]!, x0!, x9 +; CHECK-MOPS-NEXT: sete [x8]!, x0!, x9 +; CHECK-MOPS-NEXT: ret +entry: + tail call void @llvm.memset.p0.i64(ptr align 1 @dst, i8 2, i64 %n, i1 false) + ret void +} + +define void @se_memmove(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind { +; CHECK-LABEL: se_memmove: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: adrp x0, :got:dst +; CHECK-NEXT: adrp x1, :got:src +; CHECK-NEXT: ldr x0, [x0, :got_lo12:dst] +; CHECK-NEXT: ldr x1, [x1, :got_lo12:src] +; CHECK-NEXT: bl __arm_sc_memmove +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; +; CHECK-NO-SME-ROUTINES-LABEL: se_memmove: +; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry +; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0 +; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst +; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src +; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst] +; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src] +; CHECK-NO-SME-ROUTINES-NEXT: smstop sm +; CHECK-NO-SME-ROUTINES-NEXT: bl memmove +; CHECK-NO-SME-ROUTINES-NEXT: smstart sm +; CHECK-NO-SME-ROUTINES-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #80 // 16-byte 
Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ret +; +; CHECK-MOPS-LABEL: se_memmove: +; CHECK-MOPS: // %bb.0: // %entry +; CHECK-MOPS-NEXT: adrp x8, :got:src +; CHECK-MOPS-NEXT: adrp x9, :got:dst +; CHECK-MOPS-NEXT: ldr x8, [x8, :got_lo12:src] +; CHECK-MOPS-NEXT: ldr x9, [x9, :got_lo12:dst] +; CHECK-MOPS-NEXT: cpyp [x9]!, [x8]!, x0! +; CHECK-MOPS-NEXT: cpym [x9]!, [x8]!, x0! +; CHECK-MOPS-NEXT: cpye [x9]!, [x8]!, x0! +; CHECK-MOPS-NEXT: ret +entry: + tail call void @llvm.memmove.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false) + ret void +} + +define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind { +; CHECK-LABEL: sc_memcpy: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: adrp x0, :got:dst +; CHECK-NEXT: adrp x1, :got:src +; CHECK-NEXT: ldr x0, [x0, :got_lo12:dst] +; CHECK-NEXT: ldr x1, [x1, :got_lo12:src] +; CHECK-NEXT: bl __arm_sc_memcpy +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; +; CHECK-NO-SME-ROUTINES-LABEL: sc_memcpy: +; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry +; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0 +; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: bl __arm_sme_state +; CHECK-NO-SME-ROUTINES-NEXT: adrp x8, :got:dst +; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src +; CHECK-NO-SME-ROUTINES-NEXT: and x19, x0, #0x1 +; CHECK-NO-SME-ROUTINES-NEXT: ldr x8, [x8, :got_lo12:dst] +; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src] +; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB3_2 +; CHECK-NO-SME-ROUTINES-NEXT: // %bb.1: // %entry +; CHECK-NO-SME-ROUTINES-NEXT: smstop sm +; CHECK-NO-SME-ROUTINES-NEXT: .LBB3_2: // %entry +; CHECK-NO-SME-ROUTINES-NEXT: mov x0, x8 +; CHECK-NO-SME-ROUTINES-NEXT: bl memcpy +; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB3_4 +; CHECK-NO-SME-ROUTINES-NEXT: // %bb.3: // %entry +; CHECK-NO-SME-ROUTINES-NEXT: smstart sm +; CHECK-NO-SME-ROUTINES-NEXT: .LBB3_4: // %entry +; CHECK-NO-SME-ROUTINES-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ret +; +; CHECK-MOPS-LABEL: sc_memcpy: +; CHECK-MOPS: // %bb.0: // %entry +; CHECK-MOPS-NEXT: adrp x8, :got:src +; CHECK-MOPS-NEXT: adrp x9, :got:dst +; CHECK-MOPS-NEXT: ldr x8, [x8, :got_lo12:src] +; CHECK-MOPS-NEXT: ldr x9, [x9, :got_lo12:dst] +; CHECK-MOPS-NEXT: cpyfp [x9]!, [x8]!, x0! +; CHECK-MOPS-NEXT: cpyfm [x9]!, [x8]!, x0! +; CHECK-MOPS-NEXT: cpyfe [x9]!, [x8]!, x0! 
+; CHECK-MOPS-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false) + ret void +} + +define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: sb_memcpy: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: adrp x0, :got:dst +; CHECK-NEXT: adrp x1, :got:src +; CHECK-NEXT: ldr x0, [x0, :got_lo12:dst] +; CHECK-NEXT: ldr x1, [x1, :got_lo12:src] +; CHECK-NEXT: bl __arm_sc_memcpy +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret +; +; CHECK-NO-SME-ROUTINES-LABEL: sb_memcpy: +; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry +; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0 +; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: smstart sm +; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst +; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src +; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst] +; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src] +; CHECK-NO-SME-ROUTINES-NEXT: smstop sm +; CHECK-NO-SME-ROUTINES-NEXT: bl memcpy +; CHECK-NO-SME-ROUTINES-NEXT: smstart sm +; CHECK-NO-SME-ROUTINES-NEXT: smstop sm +; CHECK-NO-SME-ROUTINES-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ret +; +; CHECK-MOPS-LABEL: sb_memcpy: +; CHECK-MOPS: // %bb.0: // %entry +; CHECK-MOPS-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill +; CHECK-MOPS-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-MOPS-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-MOPS-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-MOPS-NEXT: smstart sm +; CHECK-MOPS-NEXT: adrp x8, :got:src +; CHECK-MOPS-NEXT: adrp x9, :got:dst +; CHECK-MOPS-NEXT: ldr x8, [x8, :got_lo12:src] +; CHECK-MOPS-NEXT: ldr x9, [x9, :got_lo12:dst] +; CHECK-MOPS-NEXT: cpyfp [x9]!, [x8]!, x0! +; CHECK-MOPS-NEXT: cpyfm [x9]!, [x8]!, x0! +; CHECK-MOPS-NEXT: cpyfe [x9]!, [x8]!, x0! 
+; CHECK-MOPS-NEXT: smstop sm +; CHECK-MOPS-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-MOPS-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-MOPS-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-MOPS-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; CHECK-MOPS-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false) + ret void +} + +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) +declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) +declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg)