Pad Short Functions for Intel Atom
The current Intel Atom microarchitecture has a feature whereby, when a
function returns early, it is slightly faster to execute a sequence of NOP
instructions to wait until the return address is ready than to stall on the
ret instruction until the return address is ready.

When compiling for X86 Atom only, this patch runs a pass called
"X86PadShortFunction", which adds NOP instructions where fewer than four
cycles elapse between function entry and return.
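
In rough terms, the amount of padding requested for a given return block is
Threshold - Cycles, with a threshold of four cycles. A minimal standalone
sketch of that rule (the helper name is illustrative, not code from the
patch):

// Hypothetical helper mirroring the padding amount the pass requests for a
// return block: if the estimated cycle count from function entry to the
// return is below the 4-cycle threshold, request the difference.
static unsigned paddingFor(unsigned CyclesToReturn, unsigned Threshold = 4) {
  return CyclesToReturn < Threshold ? Threshold - CyclesToReturn : 0;
}

In the pass itself the per-block cycle count comes from the target's
instruction itinerary data, and the padding is inserted immediately before
the return instruction.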

It includes tests.

This patch has been updated to address Nadav's review comments:
- Optimize only at >= O1 and do not run the optimization when -Os is set
  (see the gating sketch after this list)
- Stores MachineBasicBlock* instead of BBNum
- Uses DenseMap instead of std::map
- Fixes placement of braces
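
For reference, a minimal standalone sketch of the gating described in the
first bullet (the predicate name and parameters are mine, for illustration;
in the patch the checks live in X86PassConfig::addPreEmitPass and
PadShortFunc::runOnMachineFunction):

// Hypothetical predicate: pad only above -O0, only when the subtarget
// requests it (the Atom "pad-short-functions" feature), and never for
// functions compiled for size (-Os / the optsize attribute).
static bool shouldPadShortFunctions(bool AboveO0, bool SubtargetWantsPadding,
                                    bool OptForSize) {
  return AboveO0 && SubtargetWantsPadding && !OptForSize;
}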

Patch by Andy Zhang.




git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171879 91177308-0d34-0410-b5e6-96231b3b80d8
pgurd committed Jan 8, 2013
1 parent 2a92c10 commit c7b902e
Showing 11 changed files with 277 additions and 8 deletions.
1 change: 1 addition & 0 deletions lib/Target/X86/CMakeLists.txt
@@ -25,6 +25,7 @@ set(sources
  X86JITInfo.cpp
  X86MCInstLower.cpp
  X86MachineFunctionInfo.cpp
  X86PadShortFunction.cpp
  X86RegisterInfo.cpp
  X86SelectionDAGInfo.cpp
  X86Subtarget.cpp
4 changes: 4 additions & 0 deletions lib/Target/X86/X86.h
@@ -66,6 +66,10 @@ FunctionPass *createEmitX86CodeToMemory();
/// \brief Creates an X86-specific Target Transformation Info pass.
ImmutablePass *createX86TargetTransformInfoPass(const X86TargetMachine *TM);

/// createX86PadShortFunctions - Return a pass that pads short functions
/// with NOOPs. This will prevent a stall when returning on the Atom.
FunctionPass *createX86PadShortFunctions();

} // End llvm namespace

#endif
9 changes: 6 additions & 3 deletions lib/Target/X86/X86.td
@@ -123,8 +123,11 @@ def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true",
def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
                          "Use LEA for adjusting the stack pointer">;
def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
                          "HasSlowDivide", "true",
                          "Use small divide for positive values less than 256">;
def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
                          "PadShortFunctions", "true",
                          "Pad short functions">;

//===----------------------------------------------------------------------===//
// X86 processors supported.
@@ -167,7 +170,7 @@ def : Proc<"penryn", [FeatureSSE41, FeatureCMPXCHG16B,
                       FeatureSlowBTMem]>;
 def : AtomProc<"atom", [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B,
                         FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP,
-                        FeatureSlowDivide]>;
+                        FeatureSlowDivide, FeaturePadShortFunctions]>;
 // "Arrandale" along with corei3 and corei5
 def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B,
                       FeatureSlowBTMem, FeatureFastUAMem,
171 changes: 171 additions & 0 deletions lib/Target/X86/X86PadShortFunction.cpp
@@ -0,0 +1,171 @@
//===-------- X86PadShortFunction.cpp - pad short functions -----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the pass which will pad short functions to prevent
// a stall if a function returns before the return address is ready. This
// is needed for some Intel Atom processors.
//
//===----------------------------------------------------------------------===//

#include <algorithm>

#define DEBUG_TYPE "x86-pad-short-functions"
#include "X86.h"
#include "X86InstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"

using namespace llvm;

STATISTIC(NumBBsPadded, "Number of basic blocks padded");

namespace {
  struct PadShortFunc : public MachineFunctionPass {
    static char ID;
    PadShortFunc() : MachineFunctionPass(ID)
                   , Threshold(4), TM(0), TII(0) {}

    virtual bool runOnMachineFunction(MachineFunction &MF);

    virtual const char *getPassName() const {
      return "X86 Atom pad short functions";
    }

  private:
    void findReturns(MachineBasicBlock *MBB,
                     unsigned int Cycles = 0);

    bool cyclesUntilReturn(MachineBasicBlock *MBB,
                           unsigned int &Cycles,
                           MachineBasicBlock::iterator *Location = 0);

    void addPadding(MachineBasicBlock *MBB,
                    MachineBasicBlock::iterator &MBBI,
                    unsigned int NOOPsToAdd);

    const unsigned int Threshold;
    DenseMap<MachineBasicBlock*, unsigned int> ReturnBBs;

    const TargetMachine *TM;
    const TargetInstrInfo *TII;
  };

  char PadShortFunc::ID = 0;
}

FunctionPass *llvm::createX86PadShortFunctions() {
  return new PadShortFunc();
}

/// runOnMachineFunction - Loop over all of the basic blocks, inserting
/// NOOP instructions before early exits.
bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
  bool OptForSize = MF.getFunction()->getAttributes().
    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);

  if (OptForSize)
    return false;

  TM = &MF.getTarget();
  TII = TM->getInstrInfo();

  // Search through basic blocks and mark the ones that have early returns
  ReturnBBs.clear();
  findReturns(MF.begin());

  bool MadeChange = false;

  MachineBasicBlock::iterator ReturnLoc;
  MachineBasicBlock *MBB;
  unsigned int Cycles = 0;
  unsigned int BBCycles;

  // Pad the identified basic blocks with NOOPs
  for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin();
       I != ReturnBBs.end(); ++I) {
    MBB = I->first;
    Cycles = I->second;

    if (Cycles < Threshold) {
      if (!cyclesUntilReturn(MBB, BBCycles, &ReturnLoc))
        continue;

      addPadding(MBB, ReturnLoc, Threshold - Cycles);
      NumBBsPadded++;
      MadeChange = true;
    }
  }

  return MadeChange;
}

/// findReturns - Starting at MBB, follow control flow and add all
/// basic blocks that contain a return to ReturnBBs.
void PadShortFunc::findReturns(MachineBasicBlock *MBB, unsigned int Cycles) {
  // If this BB has a return, note how many cycles it takes to get there.
  bool hasReturn = cyclesUntilReturn(MBB, Cycles);
  if (Cycles >= Threshold)
    return;

  if (hasReturn) {
    ReturnBBs[MBB] = std::max(ReturnBBs[MBB], Cycles);
    return;
  }

  // Follow branches in BB and look for returns
  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin();
       I != MBB->succ_end(); ++I) {
    findReturns(*I, Cycles);
  }
}

/// cyclesUntilReturn - if the MBB has a return instruction, set Location
/// to the instruction and return true. Return false otherwise.
/// Cycles will be incremented by the number of cycles taken to reach the
/// return or the end of the BB, whichever occurs first.
bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB,
                                     unsigned int &Cycles,
                                     MachineBasicBlock::iterator *Location) {
  for (MachineBasicBlock::iterator MBBI = MBB->begin();
       MBBI != MBB->end(); ++MBBI) {
    MachineInstr *MI = MBBI;
    // Mark basic blocks with a return instruction. Calls to other
    // functions do not count because the called function will be padded,
    // if necessary.
    if (MI->isReturn() && !MI->isCall()) {
      if (Location)
        *Location = MBBI;
      return true;
    }

    Cycles += TII->getInstrLatency(TM->getInstrItineraryData(), MI);
  }

  return false;
}

/// addPadding - Add the given number of NOOP instructions to the function
/// just prior to the return at MBBI
void PadShortFunc::addPadding(MachineBasicBlock *MBB,
                              MachineBasicBlock::iterator &MBBI,
                              unsigned int NOOPsToAdd) {
  DebugLoc DL = MBBI->getDebugLoc();

  while (NOOPsToAdd-- > 0) {
    BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
    BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
  }
}
1 change: 1 addition & 0 deletions lib/Target/X86/X86Subtarget.cpp
@@ -350,6 +350,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
, UseLeaForSP(false)
, HasSlowDivide(false)
, PostRAScheduler(false)
, PadShortFunctions(false)
, stackAlignment(4)
// FIXME: this is a known good value for Yonah. How about others?
, MaxInlineSizeThreshold(128)
5 changes: 5 additions & 0 deletions lib/Target/X86/X86Subtarget.h
@@ -146,6 +146,10 @@ class X86Subtarget : public X86GenSubtargetInfo {
/// PostRAScheduler - True if using post-register-allocation scheduler.
bool PostRAScheduler;

/// PadShortFunctions - True if the short functions should be padded to prevent
/// a stall when returning too early.
bool PadShortFunctions;

/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
unsigned stackAlignment;
@@ -231,6 +235,7 @@ class X86Subtarget : public X86GenSubtargetInfo {
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
bool hasSlowDivide() const { return HasSlowDivide; }
bool padShortFunctions() const { return PadShortFunctions; }

bool isAtom() const { return X86ProcFamily == IntelAtom; }

6 changes: 6 additions & 0 deletions lib/Target/X86/X86TargetMachine.cpp
@@ -202,6 +202,12 @@ bool X86PassConfig::addPreEmitPass() {
    ShouldPrint = true;
  }

  if (getOptLevel() != CodeGenOpt::None &&
      getX86Subtarget().padShortFunctions()) {
    addPass(createX86PadShortFunctions());
    ShouldPrint = true;
  }

  return ShouldPrint;
}

78 changes: 78 additions & 0 deletions test/CodeGen/X86/atom-pad-short-functions.ll
@@ -0,0 +1,78 @@
; RUN: llc < %s -O1 -mcpu=atom -mtriple=i686-linux | FileCheck %s

declare void @external_function(...)

define i32 @test_return_val(i32 %a) nounwind {
; CHECK: test_return_val
; CHECK: movl
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: ret
ret i32 %a
}

define i32 @test_optsize(i32 %a) nounwind optsize {
; CHECK: test_optsize
; CHECK: movl
; CHECK-NEXT: ret
ret i32 %a
}

define i32 @test_add(i32 %a, i32 %b) nounwind {
; CHECK: test_add
; CHECK: addl
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: ret
%result = add i32 %a, %b
ret i32 %result
}

define i32 @test_multiple_ret(i32 %a, i32 %b, i1 %c) nounwind {
; CHECK: @test_multiple_ret
; CHECK: je

; CHECK: nop
; CHECK: nop
; CHECK: ret

; CHECK: nop
; CHECK: nop
; CHECK: ret

br i1 %c, label %bb1, label %bb2

bb1:
ret i32 %a

bb2:
ret i32 %b
}

define void @test_call_others(i32 %x) nounwind
{
; CHECK: test_call_others
; CHECK: je
%tobool = icmp eq i32 %x, 0
br i1 %tobool, label %if.end, label %true.case

; CHECK: jmp external_function
true.case:
tail call void bitcast (void (...)* @external_function to void ()*)() nounwind
br label %if.end

; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: ret
if.end:
ret void

}
4 changes: 2 additions & 2 deletions test/CodeGen/X86/fast-isel-x86-64.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mattr=-avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
-; RUN: llc < %s -mattr=+avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mattr=-avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mattr=+avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0"
2 changes: 1 addition & 1 deletion test/CodeGen/X86/ret-mmx.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -mattr=+mmx,+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -mcpu=core2 -mattr=+mmx,+sse2 | FileCheck %s
; rdar://6602459

@g_v1di = external global <1 x i64>
4 changes: 2 additions & 2 deletions test/CodeGen/X86/select.ll
@@ -282,7 +282,7 @@ define i32 @test13(i32 %a, i32 %b) nounwind {
; ATOM: test13:
; ATOM: cmpl
; ATOM-NEXT: sbbl
-; ATOM-NEXT: ret
+; ATOM: ret
}

define i32 @test14(i32 %a, i32 %b) nounwind {
@@ -299,7 +299,7 @@ define i32 @test14(i32 %a, i32 %b) nounwind {
; ATOM: cmpl
; ATOM-NEXT: sbbl
; ATOM-NEXT: notl
-; ATOM-NEXT: ret
+; ATOM: ret
}

; rdar://10961709
