[RegisterCoalescing] Remove partial redundant copy.
This patch solves the performance problem described in PR27827.
Register coalescing sometimes cannot remove a copy because of interference.
But if we can find a reverse copy in one of the predecessor blocks of the
copy, the copy is partially redundant, and we may remove it partially by
moving it to the predecessor block without the reverse copy.
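
For intuition, here is the same transformation expressed as a C++ source-level
analogy (a minimal hypothetical sketch added by the editor, not taken from the
patch; the pass itself works on machine-level COPY instructions):

// Before: b = a executes on every path, although the path through the
// if-branch has just established a == b via the reverse assignment.
int before(bool p, int a, int b) {
  if (p)
    a = b;  // reverse copy: afterwards a == b
  b = a;    // partially redundant: a no-op whenever p is true
  return b;
}

// After: the copy runs only on the path that actually needs it.
int after(bool p, int a, int b) {
  if (p)
    a = b;
  else
    b = a;  // copy moved to the predecessor without the reverse copy
  return b;
}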

Differential Revision: https://reviews.llvm.org/D28585


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@292292 91177308-0d34-0410-b5e6-96231b3b80d8
wmi-11 committed Jan 17, 2017
1 parent 84b54a1 commit caab16e
Showing 3 changed files with 344 additions and 0 deletions.
171 changes: 171 additions & 0 deletions lib/CodeGen/RegisterCoalescer.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/LiveRangeEdit.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
@@ -189,6 +190,9 @@ namespace {
    /// This returns true if an interval was modified.
    bool removeCopyByCommutingDef(const CoalescerPair &CP, MachineInstr *CopyMI);

    /// We found a copy which can be moved to its less frequent predecessor.
    bool removePartialRedundancy(const CoalescerPair &CP, MachineInstr &CopyMI);

    /// If the source of a copy is defined by a
    /// trivial computation, replace the copy by rematerializing the definition.
    bool reMaterializeTrivialDef(const CoalescerPair &CP, MachineInstr *CopyMI,
@@ -861,6 +865,167 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
  return true;
}

/// For copy B = A in BB2, if A is defined by A = B in BB0 which is a
/// predecessor of BB2, and if B is not redefined on the way from A = B
/// in BB0 to B = A in BB2, then B = A in BB2 is partially redundant if the
/// execution goes through the path from BB0 to BB2. We may move B = A
/// to the predecessor without such a reversed copy.
/// So we will transform the program from:
///   BB0:
///      A = B;    BB1:
///       ...         ...
///     /     \      /
///             BB2:
///               ...
///               B = A;
///
/// to:
///
///   BB0:          BB1:
///      A = B;        ...
///       ...          B = A;
///     /     \      /
///             BB2:
///               ...
///
/// A special case is when BB0 and BB2 are the same BB which is the only
/// BB in a loop:
///   BB1:
///        ...
///   BB0/BB2:  ----
///        B = A;   |
///        ...      |
///        A = B;   |
///         |-------
///         |
/// We may hoist B = A from BB0/BB2 to BB1.
///
/// The major preconditions for correctly removing such partial
/// redundancy are:
/// 1. A in B = A in BB2 is defined by a PHI in BB2, and one operand of
///    the PHI is defined by the reversed copy A = B in BB0.
/// 2. B is not referenced from the start of BB2 to B = A.
/// 3. B is not redefined from A = B to the end of BB0.
/// 4. BB1 has only one successor.
///
/// 2 and 4 implicitly ensure that B is not live at the end of BB1.
/// 4 also guarantees BB2 is hotter than BB1, so we always move the copy to
/// a colder place, which not only prevents an endless loop but also makes
/// sure the movement of the copy is beneficial.
bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP,
                                                MachineInstr &CopyMI) {
  assert(!CP.isPhys());
  if (!CopyMI.isFullCopy())
    return false;

  MachineBasicBlock &MBB = *CopyMI.getParent();
  if (MBB.isEHPad())
    return false;

  if (MBB.pred_size() != 2)
    return false;

  LiveInterval &IntA =
      LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
  LiveInterval &IntB =
      LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());

  // A is defined by a PHI at the entry of MBB.
  SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot(true);
  VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx);
  assert(AValNo && !AValNo->isUnused() && "COPY source not live");
  if (!AValNo->isPHIDef())
    return false;

  // B is not referenced before CopyMI in MBB.
  if (IntB.overlaps(LIS->getMBBStartIdx(&MBB), CopyIdx))
    return false;

  // MBB has two predecessors: one contains A = B, so no copy will be inserted
  // for it. The other one will have a copy moved from MBB.
  bool FoundReverseCopy = false;
  MachineBasicBlock *CopyLeftBB = nullptr;
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    VNInfo *PVal = IntA.getVNInfoBefore(LIS->getMBBEndIdx(Pred));
    MachineInstr *DefMI = LIS->getInstructionFromIndex(PVal->def);
    if (!DefMI || !DefMI->isFullCopy()) {
      CopyLeftBB = Pred;
      continue;
    }
    // Check that DefMI is a reverse copy and that it is in BB Pred.
    if (DefMI->getOperand(0).getReg() != IntA.reg ||
        DefMI->getOperand(1).getReg() != IntB.reg ||
        DefMI->getParent() != Pred) {
      CopyLeftBB = Pred;
      continue;
    }
    // If there is any other def of B after DefMI and before the end of Pred,
    // we need to keep the copy of B = A at the end of Pred if we remove
    // B = A from MBB.
    bool ValB_Changed = false;
    for (auto VNI : IntB.valnos) {
      if (VNI->isUnused())
        continue;
      if (PVal->def < VNI->def && VNI->def < LIS->getMBBEndIdx(Pred)) {
        ValB_Changed = true;
        break;
      }
    }
    if (ValB_Changed) {
      CopyLeftBB = Pred;
      continue;
    }
    FoundReverseCopy = true;
  }

  // If no reverse copy is found in the predecessors, there is nothing to do.
  if (!FoundReverseCopy)
    return false;

  // If CopyLeftBB is nullptr, it means every predecessor of MBB contains a
  // reverse copy, so CopyMI can be removed trivially once IntA/IntB are
  // updated. If CopyLeftBB is not nullptr, move CopyMI from MBB to CopyLeftBB
  // and update IntA/IntB.
  //
  // If CopyLeftBB is not nullptr, ensure CopyLeftBB has a single successor so
  // that MBB is hotter than CopyLeftBB.
  if (CopyLeftBB && CopyLeftBB->succ_size() > 1)
    return false;

  // Now it is OK to move the copy.
  if (CopyLeftBB) {
    DEBUG(dbgs() << "\tremovePartialRedundancy: Move the copy to BB#"
                 << CopyLeftBB->getNumber() << '\t' << CopyMI);

    // Insert the new copy into CopyLeftBB.
    auto InsPos = CopyLeftBB->getFirstTerminator();
    MachineInstr *NewCopyMI = BuildMI(*CopyLeftBB, InsPos, CopyMI.getDebugLoc(),
                                      TII->get(TargetOpcode::COPY), IntB.reg)
                                  .addReg(IntA.reg);
    SlotIndex NewCopyIdx =
        LIS->InsertMachineInstrInMaps(*NewCopyMI).getRegSlot();
    VNInfo *VNI = IntB.getNextValue(NewCopyIdx, LIS->getVNInfoAllocator());
    IntB.createDeadDef(VNI);
  } else {
    DEBUG(dbgs() << "\tremovePartialRedundancy: Remove the copy from BB#"
                 << MBB.getNumber() << '\t' << CopyMI);
  }

  // Remove CopyMI.
  SlotIndex EndPoint = IntB.Query(CopyIdx.getRegSlot()).endPoint();
  LIS->removeVRegDefAt(IntB, CopyIdx.getRegSlot());
  LIS->RemoveMachineInstrFromMaps(CopyMI);
  CopyMI.eraseFromParent();

  // Extend IntB to the end point of its original live interval.
  SmallVector<SlotIndex, 8> EndPoints;
  EndPoints.push_back(EndPoint);
  LIS->extendToIndices(IntB, EndPoints);

  shrinkToUses(&IntA);
  return true;
}
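
To connect the preconditions above with the scan over predecessors, here is a
minimal, self-contained C++ sketch of the same decision logic on a toy CFG
(an editor's illustration under simplifying assumptions: the names Block, Copy
and findCopyLeftBB are hypothetical, and the liveness and profitability checks
the real pass performs through LiveIntervals are omitted):

#include <iostream>
#include <string>
#include <vector>

struct Copy { std::string Dst, Src; };

struct Block {
  std::string Name;
  std::vector<Copy> Copies;     // copies in program order
  std::vector<Block *> Preds;   // predecessor blocks
};

// For copy C (B = A) in MBB with exactly two predecessors, return the
// predecessor the copy should be moved to, or nullptr if no predecessor
// ends with the reverse copy A = B.
static Block *findCopyLeftBB(const Block &MBB, const Copy &C) {
  if (MBB.Preds.size() != 2)
    return nullptr;
  Block *CopyLeftBB = nullptr;
  bool FoundReverse = false;
  for (Block *Pred : MBB.Preds) {
    bool HasReverse = false;
    for (const Copy &PC : Pred->Copies)
      if (PC.Dst == C.Src && PC.Src == C.Dst)  // reverse copy A = B
        HasReverse = true;
    if (HasReverse)
      FoundReverse = true;
    else
      CopyLeftBB = Pred;  // candidate block to receive the moved copy
  }
  return FoundReverse ? CopyLeftBB : nullptr;
}

int main() {
  Block BB0{"BB0", {{"A", "B"}}, {}};  // ends with the reverse copy A = B
  Block BB1{"BB1", {}, {}};            // no reverse copy
  Block BB2{"BB2", {{"B", "A"}}, {&BB0, &BB1}};
  if (Block *Dest = findCopyLeftBB(BB2, BB2.Copies[0]))
    std::cout << "move B = A from BB2 to " << Dest->Name << "\n";  // BB1
}

In the real pass, a null CopyLeftBB together with FoundReverseCopy means every
predecessor already ends with the reverse copy and CopyMI is deleted outright;
the sketch conflates that case with failure for brevity.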

/// Returns true if @p MI defines the full vreg @p Reg, as opposed to just
/// defining a subregister.
static bool definesFullReg(const MachineInstr &MI, unsigned Reg) {
@@ -1486,6 +1651,12 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
    }
  }

  // Try and see if we can partially eliminate the copy by moving the copy to
  // its predecessor.
  if (!CP.isPartial() && !CP.isPhys())
    if (removePartialRedundancy(CP, *CopyMI))
      return true;

  // Otherwise, we are unable to join the intervals.
  DEBUG(dbgs() << "\tInterference!\n");
  Again = true;  // May be possible to coalesce later.
51 changes: 51 additions & 0 deletions test/CodeGen/X86/pre-coalesce.ll
@@ -0,0 +1,51 @@
; RUN: llc -regalloc=greedy -mtriple=x86_64-unknown-linux-gnu < %s -o - | FileCheck %s
;
; The test checks that no redundant mov like the following is generated in the
; %while.body loop:
; .LBB0_2:
;   movsbl %cl, %ecx
;   movl %edx, %eax ==> This movl can be promoted outside of the loop.
;   shll $5, %eax
;   ...
;   movl %eax, %edx
;   jne .LBB0_2
;
; CHECK-LABEL: foo:
; CHECK: [[L0:.LBB0_[0-9]+]]: # %while.body
; CHECK: movl %[[REGA:.*]], %[[REGB:.*]]
; CHECK-NOT: movl %[[REGB]], %[[REGA]]
; CHECK: jne [[L0]]
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

@b = common local_unnamed_addr global i8* null, align 8
@a = common local_unnamed_addr global i32 0, align 4

define i32 @foo() local_unnamed_addr {
entry:
  %t0 = load i8*, i8** @b, align 8
  %t1 = load i8, i8* %t0, align 1
  %cmp4 = icmp eq i8 %t1, 0
  %t2 = load i32, i32* @a, align 4
  br i1 %cmp4, label %while.end, label %while.body.preheader

while.body.preheader:                             ; preds = %entry
  br label %while.body

while.body:                                       ; preds = %while.body.preheader, %while.body
  %t3 = phi i32 [ %add3, %while.body ], [ %t2, %while.body.preheader ]
  %t4 = phi i8 [ %t5, %while.body ], [ %t1, %while.body.preheader ]
  %conv = sext i8 %t4 to i32
  %add = mul i32 %t3, 33
  %add3 = add nsw i32 %add, %conv
  store i32 %add3, i32* @a, align 4
  %t5 = load i8, i8* %t0, align 1
  %cmp = icmp eq i8 %t5, 0
  br i1 %cmp, label %while.end.loopexit, label %while.body

while.end.loopexit:                               ; preds = %while.body
  br label %while.end

while.end:                                        ; preds = %while.end.loopexit, %entry
  %.lcssa = phi i32 [ %t2, %entry ], [ %add3, %while.end.loopexit ]
  ret i32 %.lcssa
}
122 changes: 122 additions & 0 deletions test/CodeGen/X86/pre-coalesce.mir
@@ -0,0 +1,122 @@
# RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass simple-register-coalescing -o - %s | FileCheck %s
# Check there is no partially redundant copy left in the loop after register coalescing.
--- |
  ; ModuleID = '<stdin>'
  source_filename = "<stdin>"
  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-unknown-linux-gnu"

  @b = common local_unnamed_addr global i8* null, align 8
  @a = common local_unnamed_addr global i32 0, align 4

  define i32 @foo() local_unnamed_addr {
  entry:
    %t0 = load i8*, i8** @b, align 8
    %t1 = load i8, i8* %t0, align 1
    %cmp4 = icmp eq i8 %t1, 0
    %t2 = load i32, i32* @a, align 4
    br i1 %cmp4, label %while.end, label %while.body.preheader

  while.body.preheader:                           ; preds = %entry
    br label %while.body

  while.body:                                     ; preds = %while.body, %while.body.preheader
    %t3 = phi i32 [ %add3, %while.body ], [ %t2, %while.body.preheader ]
    %t4 = phi i8 [ %t5, %while.body ], [ %t1, %while.body.preheader ]
    %conv = sext i8 %t4 to i32
    %add = mul i32 %t3, 33
    %add3 = add nsw i32 %add, %conv
    store i32 %add3, i32* @a, align 4
    %t5 = load i8, i8* %t0, align 1
    %cmp = icmp eq i8 %t5, 0
    br i1 %cmp, label %while.end, label %while.body

  while.end:                                      ; preds = %while.body, %entry
    %.lcssa = phi i32 [ %t2, %entry ], [ %add3, %while.body ]
    ret i32 %.lcssa
  }

...
---
# Check A = B and B = A copies will not exist in the loop at the same time.
# CHECK: name: foo
# CHECK: [[L1:bb.3.while.body]]:
# CHECK: %[[REGA:.*]] = COPY %[[REGB:.*]]
# CHECK-NOT: %[[REGB]] = COPY %[[REGA]]
# CHECK: JNE_1 %[[L1]]

name:            foo
alignment:       4
exposesReturnsTwice: false
legalized:       false
regBankSelected: false
selected:        false
tracksRegLiveness: true
registers:
  - { id: 0, class: gr64 }
  - { id: 1, class: gr8 }
  - { id: 2, class: gr32 }
  - { id: 3, class: gr32 }
  - { id: 4, class: gr8 }
  - { id: 5, class: gr32 }
  - { id: 6, class: gr8 }
  - { id: 7, class: gr32 }
  - { id: 8, class: gr32 }
  - { id: 9, class: gr32 }
  - { id: 10, class: gr32 }
  - { id: 11, class: gr32 }
  - { id: 12, class: gr8 }
  - { id: 13, class: gr32 }
frameInfo:
  isFrameAddressTaken: false
  isReturnAddressTaken: false
  hasStackMap:     false
  hasPatchPoint:   false
  stackSize:       0
  offsetAdjustment: 0
  maxAlignment:    0
  adjustsStack:    false
  hasCalls:        false
  maxCallFrameSize: 0
  hasOpaqueSPAdjustment: false
  hasVAStart:      false
  hasMustTailInVarArgFunc: false
body:            |
  bb.0.entry:
    successors: %bb.4(0x30000000), %bb.1.while.body.preheader(0x50000000)
    %0 = MOV64rm %rip, 1, _, @b, _ :: (dereferenceable load 8 from @b)
    %12 = MOV8rm %0, 1, _, 0, _ :: (load 1 from %ir.t0)
    TEST8rr %12, %12, implicit-def %eflags
    %11 = MOV32rm %rip, 1, _, @a, _ :: (dereferenceable load 4 from @a)
    JNE_1 %bb.1.while.body.preheader, implicit killed %eflags

  bb.4:
    successors: %bb.3.while.end(0x80000000)
    %10 = COPY %11
    JMP_1 %bb.3.while.end

  bb.1.while.body.preheader:
    successors: %bb.2.while.body(0x80000000)

  bb.2.while.body:
    successors: %bb.3.while.end(0x04000000), %bb.2.while.body(0x7c000000)
    %8 = MOVSX32rr8 %12
    %10 = COPY %11
    %10 = SHL32ri %10, 5, implicit-def dead %eflags
    %10 = ADD32rr %10, %11, implicit-def dead %eflags
    %10 = ADD32rr %10, %8, implicit-def dead %eflags
    MOV32mr %rip, 1, _, @a, _, %10 :: (store 4 into @a)
    %12 = MOV8rm %0, 1, _, 0, _ :: (load 1 from %ir.t0)
    TEST8rr %12, %12, implicit-def %eflags
    %11 = COPY %10
    JNE_1 %bb.2.while.body, implicit killed %eflags
    JMP_1 %bb.3.while.end

  bb.3.while.end:
    %eax = COPY %10
    RET 0, killed %eax
...
