Skip to content

Commit

Permalink
[ARM] and, or, xor and add with shl combine
Browse files Browse the repository at this point in the history
The generic dag combiner will fold:

(shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
(shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)

This can create constants which are too large to use as an immediate.
Many ALU operations are also capable of performing the shl, so we can
unfold the transformation to prevent a mov imm instruction from being
generated.

Other patterns, such as b + ((a << 1) | 510), can also be simplified
in the same manner.

Differential Revision: https://reviews.llvm.org/D38084


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317197 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
sparker-arm committed Nov 2, 2017
1 parent 4746ebd commit b7c0518
Show file tree
Hide file tree
Showing 2 changed files with 293 additions and 7 deletions.
127 changes: 120 additions & 7 deletions lib/Target/ARM/ARMISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9955,6 +9955,102 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
return SDValue();
}

/// Try to reverse the generic DAG combine that folds a shl into the
/// constant operand of an ALU operation, i.e. turn
///   (op (shl x, c2), c1 << c2)  back into  (shl (op x, c1), c2)
/// for op in {ADD, OR, XOR, AND}, when the folded constant c1 << c2 would
/// require a separate mov-immediate but both c1 and c2 are themselves
/// encodable, and every user of N can fold the shl into a shifted-operand
/// form. Returns SDValue(N, 0) after replacing all uses on success, or an
/// empty SDValue if no change was made.
static SDValue PerformSHLSimplify(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *ST) {
// Allow the generic combiner to identify potential bswaps.
if (DCI.isBeforeLegalize())
return SDValue();

// DAG combiner will fold:
// (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
// (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
// Other code patterns that can also be modified have the following form:
// b + ((a << 1) | 510)
// b + ((a << 1) & 510)
// b + ((a << 1) ^ 510)
// b + ((a << 1) + 510)

// Many instructions can perform the shift for free, but it requires both
// the operands to be registers. If c1 << c2 is too large, a mov immediate
// instruction will be needed. So, unfold back to the original pattern if:
// - c1 and c2 are small enough that they don't require mov imms.
// - the user(s) of the node can perform an shl

// No shifted operands for 16-bit (Thumb1) instructions.
if (ST->isThumb() && ST->isThumb1Only())
return SDValue();

// Check that all the users could perform the shl themselves.
for (auto U : N->uses()) {
switch(U->getOpcode()) {
default:
return SDValue();
case ISD::SUB:
case ISD::ADD:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
case ISD::SETCC:
case ARMISD::CMP:
// Check that it's not already using a shl: only one operand of these
// instructions can use a shifted register.
if (U->getOperand(0).getOpcode() == ISD::SHL ||
U->getOperand(1).getOpcode() == ISD::SHL)
return SDValue();
break;
}
}

// Only handle the commutative ops whose first operand is the shl and whose
// second operand is the folded constant.
if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
return SDValue();

if (N->getOperand(0).getOpcode() != ISD::SHL)
return SDValue();

SDValue SHL = N->getOperand(0);

// C1ShlC2 is the already-folded constant (c1 << c2); C2 is the shift amount.
auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
if (!C1ShlC2 || !C2)
return SDValue();

DEBUG(dbgs() << "Trying to simplify shl: "; N->dump());

APInt C2Int = C2->getAPIntValue();
APInt C1Int = C1ShlC2->getAPIntValue();

// Check that performing a lshr will not lose any information, i.e. the
// low c2 bits of the folded constant are all zero.
APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
C2Int.getBitWidth() - C2->getZExtValue());
if ((C1Int & Mask) != C1Int)
return SDValue();

// Shift the first constant: recover c1 = (c1 << c2) >> c2.
C1Int.lshrInPlace(C2Int);

// The immediates are encoded as an 8-bit value that can be rotated. The
// span from the highest to the lowest set bit (width minus leading plus
// trailing zeros) must therefore fit into 8 bits.
unsigned Zeros = C1Int.countLeadingZeros() + C1Int.countTrailingZeros();
if (C1Int.getBitWidth() - Zeros > 8)
return SDValue();

Zeros = C2Int.countLeadingZeros() + C2Int.countTrailingZeros();
if (C2Int.getBitWidth() - Zeros > 8)
return SDValue();

SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
SDValue X = SHL.getOperand(0);
// Rebuild the unfolded form: (shl (op x, c1), c2).
SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
DAG.getConstant(C1Int, dl, MVT::i32));
// Shift left to compensate for the lshr of C1Int.
SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));

// Replace all uses and return the (now dead) original node to tell the
// combiner that a change was made.
DAG.ReplaceAllUsesWith(SDValue(N, 0), Res);
return SDValue(N, 0);
}


/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
Expand All @@ -9963,6 +10059,10 @@ static SDValue PerformADDCombine(SDNode *N,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);

// Only works one way, because it needs an immediate operand.
if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
return Result;

// First try with the default operand order.
if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
return Result;
Expand Down Expand Up @@ -10151,6 +10251,9 @@ static SDValue PerformANDCombine(SDNode *N,
// fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
return Result;

if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
return Result;
}

return SDValue();
Expand Down Expand Up @@ -10384,17 +10487,19 @@ static SDValue PerformORCombine(SDNode *N,
return Result;
}

// The code below optimizes (or (and X, Y), Z).
// The AND operand needs to have a single user to make these optimizations
// profitable.
SDValue N0 = N->getOperand(0);
if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
return SDValue();
SDValue N1 = N->getOperand(1);

// (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
DAG.getTargetLoweringInfo().isTypeLegal(VT)) {

// The code below optimizes (or (and X, Y), Z).
// The AND operand needs to have a single user to make these optimizations
// profitable.
if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
return SDValue();

APInt SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
Expand Down Expand Up @@ -10427,8 +10532,13 @@ static SDValue PerformORCombine(SDNode *N,

// Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
// reasonable.
if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
return Res;
if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
return Res;
}

if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
return Result;

return SDValue();
}
Expand All @@ -10446,6 +10556,9 @@ static SDValue PerformXORCombine(SDNode *N,
// fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
return Result;

if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
return Result;
}

return SDValue();
Expand Down
173 changes: 173 additions & 0 deletions test/CodeGen/ARM/unfold-shifts.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
; RUN: llc -mtriple armv6t2 %s -o - | FileCheck %s
; RUN: llc -mtriple thumbv6t2 %s -o - | FileCheck %s --check-prefix=CHECK-T2
; RUN: llc -mtriple armv7 %s -o - | FileCheck %s
; RUN: llc -mtriple thumbv7 %s -o - | FileCheck %s --check-prefix=CHECK-T2
; RUN: llc -mtriple thumbv7m %s -o - | FileCheck %s --check-prefix=CHECK-T2
; RUN: llc -mtriple thumbv8m.main %s -o - | FileCheck %s --check-prefix=CHECK-T2

; CHECK-LABEL: unfold1
; CHECK-NOT: mov
; CHECK: orr r0, r0, #255
; CHECK: add r0, r1, r0, lsl #1
; CHECK-T2-NOT: mov
; CHECK-T2: orr r0, r0, #255
; CHECK-T2: add.w r0, r1, r0, lsl #1
; 510 = 255 << 1, so (a << 1) | 510 can unfold to (a | 255) << 1.
define arm_aapcscc i32 @unfold1(i32 %a, i32 %b) {
entry:
%or = shl i32 %a, 1
%shl = or i32 %or, 510
%add = add nsw i32 %shl, %b
ret i32 %add
}

; CHECK-LABEL: unfold2
; CHECK-NOT: mov
; CHECK: orr r0, r0, #4080
; CHECK: sub r0, r1, r0, lsl #2
; CHECK-T2-NOT: mov
; CHECK-T2: orr r0, r0, #4080
; CHECK-T2: sub.w r0, r1, r0, lsl #2
; 16320 = 4080 << 2, so (a << 2) | 16320 can unfold to (a | 4080) << 2.
define arm_aapcscc i32 @unfold2(i32 %a, i32 %b) {
entry:
%or = shl i32 %a, 2
%shl = or i32 %or, 16320
%sub = sub nsw i32 %b, %shl
ret i32 %sub
}

; CHECK-LABEL: unfold3
; CHECK-NOT: mov
; CHECK: orr r0, r0, #65280
; CHECK: and r0, r1, r0, lsl #4
; CHECK-T2-NOT: mov
; CHECK-T2: orr r0, r0, #65280
; CHECK-T2: and.w r0, r1, r0, lsl #4
; 1044480 = 65280 << 4, so (a << 4) | 1044480 can unfold to (a | 65280) << 4.
define arm_aapcscc i32 @unfold3(i32 %a, i32 %b) {
entry:
%or = shl i32 %a, 4
%shl = or i32 %or, 1044480
%and = and i32 %shl, %b
ret i32 %and
}

; CHECK-LABEL: unfold4
; CHECK-NOT: mov
; CHECK: orr r0, r0, #1044480
; CHECK: eor r0, r1, r0, lsl #5
; CHECK-T2-NOT: mov
; CHECK-T2: orr r0, r0, #1044480
; CHECK-T2: eor.w r0, r1, r0, lsl #5
; 33423360 = 1044480 << 5, so (a << 5) | 33423360 can unfold to
; (a | 1044480) << 5.
define arm_aapcscc i32 @unfold4(i32 %a, i32 %b) {
entry:
%or = shl i32 %a, 5
%shl = or i32 %or, 33423360
%xor = xor i32 %shl, %b
ret i32 %xor
}

; CHECK-LABEL: unfold5
; CHECK-NOT: mov
; CHECK: add r0, r0, #496
; CHECK: orr r0, r1, r0, lsl #6
; CHECK-T2: add.w r0, r0, #496
; CHECK-T2: orr.w r0, r1, r0, lsl #6
; 31744 = 496 << 6, so (a << 6) + 31744 can unfold to (a + 496) << 6.
define arm_aapcscc i32 @unfold5(i32 %a, i32 %b) {
entry:
%add = shl i32 %a, 6
%shl = add i32 %add, 31744
%or = or i32 %shl, %b
ret i32 %or
}

; CHECK-LABEL: unfold6
; CHECK-NOT: mov
; CHECK: add r0, r0, #7936
; CHECK: and r0, r1, r0, lsl #8
; CHECK-T2-NOT: mov
; CHECK-T2: add.w r0, r0, #7936
; CHECK-T2: and.w r0, r1, r0, lsl #8
; 2031616 = 7936 << 8, so (a << 8) + 2031616 can unfold to (a + 7936) << 8.
define arm_aapcscc i32 @unfold6(i32 %a, i32 %b) {
entry:
%add = shl i32 %a, 8
%shl = add i32 %add, 2031616
%and = and i32 %shl, %b
ret i32 %and
}

; CHECK-LABEL: unfold7
; CHECK-NOT: mov
; CHECK: and r0, r0, #256
; CHECK: add r0, r1, r0, lsl #1
; CHECK-T2-NOT: mov
; CHECK-T2: and r0, r0, #256
; CHECK-T2: add.w r0, r1, r0, lsl #1
; 512 = 256 << 1, so (a << 1) & 512 can unfold to (a & 256) << 1.
define arm_aapcscc i32 @unfold7(i32 %a, i32 %b) {
entry:
%shl = shl i32 %a, 1
%and = and i32 %shl, 512
%add = add nsw i32 %and, %b
ret i32 %add
}

; CHECK-LABEL: unfold8
; CHECK-NOT: mov
; CHECK: add r0, r0, #126976
; CHECK: eor r0, r1, r0, lsl #9
; CHECK-T2-NOT: mov
; CHECK-T2: add.w r0, r0, #126976
; CHECK-T2: eor.w r0, r1, r0, lsl #9
; 65011712 = 126976 << 9, so (a << 9) + 65011712 can unfold to
; (a + 126976) << 9.
define arm_aapcscc i32 @unfold8(i32 %a, i32 %b) {
entry:
%add = shl i32 %a, 9
%shl = add i32 %add, 65011712
%xor = xor i32 %shl, %b
ret i32 %xor
}

; CHECK-LABEL: unfold9
; CHECK-NOT: mov
; CHECK: eor r0, r0, #255
; CHECK: add r0, r1, r0, lsl #1
; CHECK-T2-NOT: mov
; CHECK-T2: eor r0, r0, #255
; CHECK-T2: add.w r0, r1, r0, lsl #1
; 510 = 255 << 1, so (a << 1) ^ 510 can unfold to (a ^ 255) << 1.
define arm_aapcscc i32 @unfold9(i32 %a, i32 %b) {
entry:
%shl = shl i32 %a, 1
%xor = xor i32 %shl, 510
%add = add nsw i32 %xor, %b
ret i32 %add
}

; CHECK-LABEL: unfold10
; CHECK-NOT: mov r2
; CHECK: orr r2, r0, #4080
; CHECK: cmp r1, r2, lsl #10
; CHECK-T2-NOT: mov.w r2
; CHECK-T2: orr r2, r0, #4080
; CHECK-T2: cmp.w r1, r2, lsl #10
; 4177920 = 4080 << 10; the unfolded shl feeds a compare (ARMISD::CMP user).
define arm_aapcscc i32 @unfold10(i32 %a, i32 %b) {
entry:
%or = shl i32 %a, 10
%shl = or i32 %or, 4177920
%cmp = icmp sgt i32 %shl, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
}

; CHECK-LABEL: unfold11
; CHECK-NOT: mov r2
; CHECK: add r2, r0, #7936
; CHECK: cmp r1, r2, lsl #11
; CHECK-T2-NOT: mov.w r2
; CHECK-T2: add.w r2, r0, #7936
; CHECK-T2: cmp.w r1, r2, lsl #11
; 16252928 = 7936 << 11; the unfolded shl feeds a compare (ARMISD::CMP user).
define arm_aapcscc i32 @unfold11(i32 %a, i32 %b) {
entry:
%add = shl i32 %a, 11
%shl = add i32 %add, 16252928
%cmp = icmp sgt i32 %shl, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
}

0 comments on commit b7c0518

Please sign in to comment.