Skip to content

Commit

Permalink
Re-apply r214881: Fix return sequence on armv4 thumb
Browse files Browse the repository at this point in the history
This reverts r214893, re-applying r214881 with the test case relaxed a bit to
satiate the build bots.

POP on armv4t cannot be used to change thumb state (unlike later non-M-class
architectures), therefore we need a different return sequence that uses 'bx'
instead:

  POP {r3}
  ADD sp, #offset
  BX r3

This patch also fixes an issue where the return value in r3 would get clobbered
for functions that return 128 bits of data. In that case, we generate this
sequence instead:

  MOV ip, r3
  POP {r3}
  ADD sp, #offset
  MOV lr, r3
  MOV r3, ip
  BX lr

http://reviews.llvm.org/D4748



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214928 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
jroelofs committed Aug 5, 2014
1 parent 416ea4b commit b23c2d9
Show file tree
Hide file tree
Showing 4 changed files with 283 additions and 20 deletions.
4 changes: 4 additions & 0 deletions lib/Target/ARM/ARMISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2113,6 +2113,10 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
bool isLittleEndian = Subtarget->isLittle();

MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
AFI->setReturnRegsCount(RVLocs.size());

// Copy the result values into the output registers.
for (unsigned i = 0, realRVLocIdx = 0;
i != RVLocs.size();
Expand Down
9 changes: 8 additions & 1 deletion lib/Target/ARM/ARMMachineFunctionInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ class ARMFunctionInfo : public MachineFunctionInfo {
///
unsigned ArgRegsSaveSize;

/// ReturnRegsCount - Number of registers used up in the return.
unsigned ReturnRegsCount;

/// HasStackFrame - True if this function has a stack frame. Set by
/// processFunctionBeforeCalleeSavedScan().
bool HasStackFrame;
Expand Down Expand Up @@ -127,7 +130,8 @@ class ARMFunctionInfo : public MachineFunctionInfo {
ARMFunctionInfo() :
isThumb(false),
hasThumb2(false),
ArgRegsSaveSize(0), HasStackFrame(false), RestoreSPFromFP(false),
ArgRegsSaveSize(0), ReturnRegsCount(0), HasStackFrame(false),
RestoreSPFromFP(false),
LRSpilledForFarJump(false),
FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
Expand All @@ -151,6 +155,9 @@ class ARMFunctionInfo : public MachineFunctionInfo {
}
/// Sets ArgRegsSaveSize to \p s.
void setArgRegsSaveSize(unsigned s) { ArgRegsSaveSize = s; }

/// Returns ReturnRegsCount, the number of registers used up in the return.
unsigned getReturnRegsCount() const { return ReturnRegsCount; }
/// Sets ReturnRegsCount (number of registers used up in the return) to \p s.
void setReturnRegsCount(unsigned s) { ReturnRegsCount = s; }

/// Returns HasStackFrame: true if this function has a stack frame.
bool hasStackFrame() const { return HasStackFrame; }
/// Sets HasStackFrame to \p s.
void setHasStackFrame(bool s) { HasStackFrame = s; }

Expand Down
78 changes: 59 additions & 19 deletions lib/Target/ARM/Thumb1FrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -388,28 +388,65 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
}
}

if (ArgRegsSaveSize) {
// Unlike T2 and ARM mode, the T1 pop instruction cannot restore
// to LR, and we can't pop the value directly to the PC since
// we need to update the SP after popping the value. Therefore, we
// pop the old LR into R3 as a temporary.

bool IsV4PopReturn = false;
for (const CalleeSavedInfo &CSI : MFI->getCalleeSavedInfo())
if (CSI.getReg() == ARM::LR)
IsV4PopReturn = true;
IsV4PopReturn &= STI.hasV4TOps() && !STI.hasV5TOps();

// Unlike T2 and ARM mode, the T1 pop instruction cannot restore
// to LR, and we can't pop the value directly to the PC since
// we need to update the SP after popping the value. So instead
// we have to emit:
// POP {r3}
// ADD sp, #offset
// BX r3
// If this would clobber a return value, then generate this sequence instead:
// MOV ip, r3
// POP {r3}
// ADD sp, #offset
// MOV lr, r3
// MOV r3, ip
// BX lr
if (ArgRegsSaveSize || IsV4PopReturn) {
// Get the last instruction, tBX_RET
MBBI = MBB.getLastNonDebugInstr();
assert (MBBI->getOpcode() == ARM::tBX_RET);
// Epilogue for vararg functions: pop LR to R3 and branch off it.
AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
.addReg(ARM::R3, RegState::Define);

emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);

MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg))
.addReg(ARM::R3, RegState::Kill);
AddDefaultPred(MIB);
MIB.copyImplicitOps(&*MBBI);
// erase the old tBX_RET instruction
MBB.erase(MBBI);
DebugLoc dl = MBBI->getDebugLoc();

if (AFI->getReturnRegsCount() <= 3) {
// Epilogue: pop saved LR to R3 and branch off it.
AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
.addReg(ARM::R3, RegState::Define);

emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);

MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX))
.addReg(ARM::R3, RegState::Kill);
AddDefaultPred(MIB);
MIB.copyImplicitOps(&*MBBI);
// erase the old tBX_RET instruction
MBB.erase(MBBI);
} else {
AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
.addReg(ARM::R12, RegState::Define)
.addReg(ARM::R3, RegState::Kill));

AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
.addReg(ARM::R3, RegState::Define);

emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);

AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
.addReg(ARM::LR, RegState::Define)
.addReg(ARM::R3, RegState::Kill));

AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
.addReg(ARM::R3, RegState::Define)
.addReg(ARM::R12, RegState::Kill));
// Keep the tBX_RET instruction
}
}
}

Expand Down Expand Up @@ -476,6 +513,9 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
// Special epilogue for vararg functions. See emitEpilogue
if (isVarArg)
continue;
// ARMv4T requires BX, see emitEpilogue
if (STI.hasV4TOps() && !STI.hasV5TOps())
continue;
Reg = ARM::PC;
(*MIB).setDesc(TII.get(ARM::tPOP_RET));
MIB.copyImplicitOps(&*MI);
Expand Down
212 changes: 212 additions & 0 deletions test/CodeGen/ARM/thumb1_return_sequence.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
; RUN: llc -mtriple=thumbv4t-none--eabi < %s | FileCheck %s --check-prefix=CHECK-V4T
; RUN: llc -mtriple=thumbv5t-none--eabi < %s | FileCheck %s --check-prefix=CHECK-V5T

; CHECK-V4T-LABEL: clobberframe
; CHECK-V5T-LABEL: clobberframe
; Non-variadic function returning <4 x i32> (128 bits of data) with two
; 16-byte-aligned stack slots. The epilogue expectations below are: on v4t,
; r3 is stashed in r12 around the pop of the saved return address and the
; function returns with 'bx lr'; on v5t, the return is a pop into pc.
define <4 x i32> @clobberframe() #0 {
entry:
; Prologue
; --------
; CHECK-V4T: push {[[SAVED:(r[4567](, )?)+]], lr}
; CHECK-V4T: sub sp,
; CHECK-V5T: push {[[SAVED:(r[4567](, )?)+]], lr}

%b = alloca <4 x i32>, align 16
%a = alloca <4 x i32>, align 16
store <4 x i32> <i32 42, i32 42, i32 42, i32 42>, <4 x i32>* %b, align 16
store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %a, align 16
%0 = load <4 x i32>* %a, align 16
ret <4 x i32> %0

; Epilogue
; --------
; CHECK-V4T: add sp,
; CHECK-V4T-NEXT: pop {[[SAVED]]}
; CHECK-V4T-NEXT: mov r12, r3
; CHECK-V4T-NEXT: pop {r3}
; CHECK-V4T-NEXT: mov lr, r3
; CHECK-V4T-NEXT: mov r3, r12
; CHECK-V4T: bx lr
; CHECK-V5T: pop {[[SAVED]], pc}
}

; CHECK-V4T-LABEL: clobbervariadicframe
; CHECK-V5T-LABEL: clobbervariadicframe
; Variadic counterpart of the 128-bit-return case: returns <4 x i32> and takes
; (i32, ...). For both targets the expectations below are that r3 is stashed
; in r12 while the saved return address is popped into r3, sp is adjusted,
; the address is moved to lr, r3 is restored, and the return is 'bx lr'.
define <4 x i32> @clobbervariadicframe(i32 %i, ...) #0 {
entry:
; Prologue
; --------
; CHECK-V4T: sub sp,
; CHECK-V4T: push {[[SAVED:(r[4567](, )?)+]], lr}
; CHECK-V5T: sub sp,
; CHECK-V5T: push {[[SAVED:(r[4567](, )?)+]], lr}

%b = alloca <4 x i32>, align 16
%a = alloca <4 x i32>, align 16
store <4 x i32> <i32 42, i32 42, i32 42, i32 42>, <4 x i32>* %b, align 16
store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %a, align 16
%0 = load <4 x i32>* %a, align 16
ret <4 x i32> %0

; Epilogue
; --------
; CHECK-V4T: pop {[[SAVED]]}
; CHECK-V4T-NEXT: mov r12, r3
; CHECK-V4T-NEXT: pop {r3}
; CHECK-V4T-NEXT: add sp,
; CHECK-V4T-NEXT: mov lr, r3
; CHECK-V4T-NEXT: mov r3, r12
; CHECK-V4T: bx lr
; CHECK-V5T: add sp,
; CHECK-V5T-NEXT: pop {[[SAVED]]}
; CHECK-V5T-NEXT: mov r12, r3
; CHECK-V5T-NEXT: pop {r3}
; CHECK-V5T-NEXT: add sp,
; CHECK-V5T-NEXT: mov lr, r3
; CHECK-V5T-NEXT: mov r3, r12
; CHECK-V5T-NEXT: bx lr
}

; CHECK-V4T-LABEL: simpleframe
; CHECK-V5T-LABEL: simpleframe
; Non-variadic function returning a single i32 computed from four i32 stack
; slots. The epilogue expectations below are: on v4t, the saved return address
; is popped into r3 and the function returns with 'bx r3'; on v5t, the return
; is a pop into pc.
define i32 @simpleframe() #0 {
entry:
; Prologue
; --------
; CHECK-V4T: push {[[SAVED:(r[4567](, )?)+]], lr}
; CHECK-V5T: push {[[SAVED:(r[4567](, )?)+]], lr}

%a = alloca i32, align 4
%b = alloca i32, align 4
%c = alloca i32, align 4
%d = alloca i32, align 4
store i32 1, i32* %a, align 4
store i32 2, i32* %b, align 4
store i32 3, i32* %c, align 4
store i32 4, i32* %d, align 4
%0 = load i32* %a, align 4
%inc = add nsw i32 %0, 1
store i32 %inc, i32* %a, align 4
%1 = load i32* %b, align 4
%inc1 = add nsw i32 %1, 1
store i32 %inc1, i32* %b, align 4
%2 = load i32* %c, align 4
%inc2 = add nsw i32 %2, 1
store i32 %inc2, i32* %c, align 4
%3 = load i32* %d, align 4
%inc3 = add nsw i32 %3, 1
store i32 %inc3, i32* %d, align 4
%4 = load i32* %a, align 4
%5 = load i32* %b, align 4
%add = add nsw i32 %4, %5
%6 = load i32* %c, align 4
%add4 = add nsw i32 %add, %6
%7 = load i32* %d, align 4
%add5 = add nsw i32 %add4, %7
ret i32 %add5

; Epilogue
; --------
; CHECK-V4T: pop {[[SAVED]]}
; CHECK-V4T: pop {r3}
; CHECK-V4T: bx r3
; CHECK-V5T: pop {[[SAVED]], pc}
}

; CHECK-V4T-LABEL: simplevariadicframe
; CHECK-V5T-LABEL: simplevariadicframe
; Variadic (i32, ...) function returning a single i32 computed from four i32
; stack slots plus %i. For both targets the epilogue expectations below are
; that the saved return address is popped into r3, sp is adjusted, and the
; function returns with 'bx r3'.
define i32 @simplevariadicframe(i32 %i, ...) #0 {
entry:
; Prologue
; --------
; CHECK-V4T: sub sp,
; CHECK-V4T: push {[[SAVED:(r[4567](, )?)+]], lr}
; CHECK-V4T: sub sp,
; CHECK-V5T: sub sp,
; CHECK-V5T: push {[[SAVED:(r[4567](, )?)+]], lr}
; CHECK-V5T: sub sp,

%a = alloca i32, align 4
%b = alloca i32, align 4
%c = alloca i32, align 4
%d = alloca i32, align 4
store i32 1, i32* %a, align 4
store i32 2, i32* %b, align 4
store i32 3, i32* %c, align 4
store i32 4, i32* %d, align 4
%0 = load i32* %a, align 4
%inc = add nsw i32 %0, 1
store i32 %inc, i32* %a, align 4
%1 = load i32* %b, align 4
%inc1 = add nsw i32 %1, 1
store i32 %inc1, i32* %b, align 4
%2 = load i32* %c, align 4
%inc2 = add nsw i32 %2, 1
store i32 %inc2, i32* %c, align 4
%3 = load i32* %d, align 4
%inc3 = add nsw i32 %3, 1
store i32 %inc3, i32* %d, align 4
%4 = load i32* %a, align 4
%5 = load i32* %b, align 4
%add = add nsw i32 %4, %5
%6 = load i32* %c, align 4
%add4 = add nsw i32 %add, %6
%7 = load i32* %d, align 4
%add5 = add nsw i32 %add4, %7
%add6 = add nsw i32 %add5, %i
ret i32 %add6

; Epilogue
; --------
; CHECK-V4T: add sp,
; CHECK-V4T-NEXT: pop {[[SAVED]]}
; CHECK-V4T-NEXT: pop {r3}
; CHECK-V4T-NEXT: add sp,
; CHECK-V4T-NEXT: bx r3
; CHECK-V5T: add sp,
; CHECK-V5T-NEXT: pop {[[SAVED]]}
; CHECK-V5T-NEXT: pop {r3}
; CHECK-V5T-NEXT: add sp,
; CHECK-V5T-NEXT: bx r3
}

; CHECK-V4T-LABEL: noframe
; CHECK-V5T-LABEL: noframe
; Non-variadic function that just returns the constant 0. The expectations
; below are that neither target emits any push or pop and both return with
; 'bx lr'. (The ';' after the ret operand merely begins an empty IR comment.)
define i32 @noframe() #0 {
entry:
; Prologue
; --------
; CHECK-V4T-NOT: push
; CHECK-V5T-NOT: push
ret i32 0;
; Epilogue
; --------
; CHECK-V4T-NOT: pop
; CHECK-V5T-NOT: pop
; CHECK-V4T: bx lr
; CHECK-V5T: bx lr
}

; CHECK-V4T-LABEL: novariadicframe
; CHECK-V5T-LABEL: novariadicframe
; Variadic (i32, ...) function that just returns its first argument. For both
; targets the expectations below are a 'sub sp,' followed by a push including
; lr in the prologue, and an epilogue that pops the saved return address into
; r3, adjusts sp, and returns with 'bx r3'.
; (The ';' after the ret operand merely begins an empty IR comment.)
define i32 @novariadicframe(i32 %i, ...) #0 {
entry:
; Prologue
; --------
; CHECK-V4T: sub sp,
; CHECK-V4T: push {[[SAVED:(r[4567](, )?)+]], lr}
; CHECK-V5T: sub sp,
; CHECK-V5T: push {[[SAVED:(r[4567](, )?)+]], lr}

ret i32 %i;
; Epilogue
; --------
; CHECK-V4T: pop {[[SAVED]]}
; CHECK-V4T-NEXT: pop {r3}
; CHECK-V4T-NEXT: add sp,
; CHECK-V4T-NEXT: bx r3
; CHECK-V5T: pop {[[SAVED]]}
; CHECK-V5T-NEXT: pop {r3}
; CHECK-V5T-NEXT: add sp,
; CHECK-V5T-NEXT: bx r3
}

0 comments on commit b23c2d9

Please sign in to comment.