Skip to content

Commit

Permalink
[ARM] and, or, xor and add with shl combine
Browse files Browse the repository at this point in the history
The generic dag combiner will fold:

(shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
(shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)

This can create constants which are too large to use as an immediate.
Many ALU operations are also capable of performing the shl, so we can
unfold the transformation to prevent a mov imm instruction from being
generated.

Other patterns, such as b + ((a << 1) | 510), can also be simplified
in the same manner.

Differential Revision: https://reviews.llvm.org/D38084


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317197 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
sparker-arm committed Nov 2, 2017
1 parent 4746ebd commit b7c0518
Show file tree
Hide file tree
Showing 2 changed files with 293 additions and 7 deletions.
127 changes: 120 additions & 7 deletions lib/Target/ARM/ARMISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9955,6 +9955,102 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
return SDValue();
}

/// Try to reverse the generic DAG combine that folds a shl into the
/// constant operand of an ALU operation, i.e. turn
///   (op (shl x, c2), c1 << c2)  back into  (shl (op x, c1), c2)
/// for op in {ADD, OR, XOR, AND}, when the folded constant c1 << c2 would
/// require a separate mov-immediate but both c1 and c2 are themselves
/// encodable, and every user of N can fold the shl into a shifted-operand
/// form. Returns SDValue(N, 0) after replacing all uses on success, or an
/// empty SDValue if no change was made.
static SDValue PerformSHLSimplify(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *ST) {
// Allow the generic combiner to identify potential bswaps.
if (DCI.isBeforeLegalize())
return SDValue();

// DAG combiner will fold:
// (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
// (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
// Other code patterns that can also be modified have the following form:
// b + ((a << 1) | 510)
// b + ((a << 1) & 510)
// b + ((a << 1) ^ 510)
// b + ((a << 1) + 510)

// Many instructions can perform the shift for free, but it requires both
// the operands to be registers. If c1 << c2 is too large, a mov immediate
// instruction will be needed. So, unfold back to the original pattern if:
// - c1 and c2 are small enough that they don't require mov imms.
// - the user(s) of the node can perform an shl

// No shifted operands for 16-bit (Thumb1) instructions.
if (ST->isThumb() && ST->isThumb1Only())
return SDValue();

// Check that all the users could perform the shl themselves.
for (auto U : N->uses()) {
switch(U->getOpcode()) {
default:
return SDValue();
case ISD::SUB:
case ISD::ADD:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
case ISD::SETCC:
case ARMISD::CMP:
// Check that it's not already using a shl: only one operand of these
// instructions can use a shifted register.
if (U->getOperand(0).getOpcode() == ISD::SHL ||
U->getOperand(1).getOpcode() == ISD::SHL)
return SDValue();
break;
}
}

// Only handle the commutative ops whose first operand is the shl and whose
// second operand is the folded constant.
if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
return SDValue();

if (N->getOperand(0).getOpcode() != ISD::SHL)
return SDValue();

SDValue SHL = N->getOperand(0);

// C1ShlC2 is the already-folded constant (c1 << c2); C2 is the shift amount.
auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
if (!C1ShlC2 || !C2)
return SDValue();

DEBUG(dbgs() << "Trying to simplify shl: "; N->dump());

APInt C2Int = C2->getAPIntValue();
APInt C1Int = C1ShlC2->getAPIntValue();

// Check that performing a lshr will not lose any information, i.e. the
// low c2 bits of the folded constant are all zero.
APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
C2Int.getBitWidth() - C2->getZExtValue());
if ((C1Int & Mask) != C1Int)
return SDValue();

// Shift the first constant: recover c1 = (c1 << c2) >> c2.
C1Int.lshrInPlace(C2Int);

// The immediates are encoded as an 8-bit value that can be rotated. The
// span from the highest to the lowest set bit (width minus leading plus
// trailing zeros) must therefore fit into 8 bits.
unsigned Zeros = C1Int.countLeadingZeros() + C1Int.countTrailingZeros();
if (C1Int.getBitWidth() - Zeros > 8)
return SDValue();

Zeros = C2Int.countLeadingZeros() + C2Int.countTrailingZeros();
if (C2Int.getBitWidth() - Zeros > 8)
return SDValue();

SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
SDValue X = SHL.getOperand(0);
// Rebuild the unfolded form: (shl (op x, c1), c2).
SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
DAG.getConstant(C1Int, dl, MVT::i32));
// Shift left to compensate for the lshr of C1Int.
SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));

// Replace all uses and return the (now dead) original node to tell the
// combiner that a change was made.
DAG.ReplaceAllUsesWith(SDValue(N, 0), Res);
return SDValue(N, 0);
}


/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
Expand All @@ -9963,6 +10059,10 @@ static SDValue PerformADDCombine(SDNode *N,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);

// Only works one way, because it needs an immediate operand.
if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
return Result;

// First try with the default operand order.
if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
return Result;
Expand Down Expand Up @@ -10151,6 +10251,9 @@ static SDValue PerformANDCombine(SDNode *N,
// fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
return Result;

if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
return Result;
}

return SDValue();
Expand Down Expand Up @@ -10384,17 +10487,19 @@ static SDValue PerformORCombine(SDNode *N,
return Result;
}

// The code below optimizes (or (and X, Y), Z).
// The AND operand needs to have a single user to make these optimizations
// profitable.
SDValue N0 = N->getOperand(0);
if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
return SDValue();
SDValue N1 = N->getOperand(1);

// (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
DAG.getTargetLoweringInfo().isTypeLegal(VT)) {

// The code below optimizes (or (and X, Y), Z).
// The AND operand needs to have a single user to make these optimizations
// profitable.
if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
return SDValue();

APInt SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
Expand Down Expand Up @@ -10427,8 +10532,13 @@ static SDValue PerformORCombine(SDNode *N,

// Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
// reasonable.
if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
return Res;
if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
return Res;
}

if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
return Result;

return SDValue();
}
Expand All @@ -10446,6 +10556,9 @@ static SDValue PerformXORCombine(SDNode *N,
// fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
return Result;

if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
return Result;
}

return SDValue();
Expand Down
173 changes: 173 additions & 0 deletions test/CodeGen/ARM/unfold-shifts.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
; RUN: llc -mtriple armv6t2 %s -o - | FileCheck %s
; RUN: llc -mtriple thumbv6t2 %s -o - | FileCheck %s --check-prefix=CHECK-T2
; RUN: llc -mtriple armv7 %s -o - | FileCheck %s
; RUN: llc -mtriple thumbv7 %s -o - | FileCheck %s --check-prefix=CHECK-T2
; RUN: llc -mtriple thumbv7m %s -o - | FileCheck %s --check-prefix=CHECK-T2
; RUN: llc -mtriple thumbv8m.main %s -o - | FileCheck %s --check-prefix=CHECK-T2

; CHECK-LABEL: unfold1
; CHECK-NOT: mov
; CHECK: orr r0, r0, #255
; CHECK: add r0, r1, r0, lsl #1
; CHECK-T2-NOT: mov
; CHECK-T2: orr r0, r0, #255
; CHECK-T2: add.w r0, r1, r0, lsl #1
; 510 = 255 << 1, so (a << 1) | 510 can unfold to (a | 255) << 1.
define arm_aapcscc i32 @unfold1(i32 %a, i32 %b) {
entry:
%or = shl i32 %a, 1
%shl = or i32 %or, 510
%add = add nsw i32 %shl, %b
ret i32 %add
}

; CHECK-LABEL: unfold2
; CHECK-NOT: mov
; CHECK: orr r0, r0, #4080
; CHECK: sub r0, r1, r0, lsl #2
; CHECK-T2-NOT: mov
; CHECK-T2: orr r0, r0, #4080
; CHECK-T2: sub.w r0, r1, r0, lsl #2
; 16320 = 4080 << 2, so (a << 2) | 16320 can unfold to (a | 4080) << 2.
define arm_aapcscc i32 @unfold2(i32 %a, i32 %b) {
entry:
%or = shl i32 %a, 2
%shl = or i32 %or, 16320
%sub = sub nsw i32 %b, %shl
ret i32 %sub
}

; CHECK-LABEL: unfold3
; CHECK-NOT: mov
; CHECK: orr r0, r0, #65280
; CHECK: and r0, r1, r0, lsl #4
; CHECK-T2-NOT: mov
; CHECK-T2: orr r0, r0, #65280
; CHECK-T2: and.w r0, r1, r0, lsl #4
; 1044480 = 65280 << 4, so (a << 4) | 1044480 can unfold to (a | 65280) << 4.
define arm_aapcscc i32 @unfold3(i32 %a, i32 %b) {
entry:
%or = shl i32 %a, 4
%shl = or i32 %or, 1044480
%and = and i32 %shl, %b
ret i32 %and
}

; CHECK-LABEL: unfold4
; CHECK-NOT: mov
; CHECK: orr r0, r0, #1044480
; CHECK: eor r0, r1, r0, lsl #5
; CHECK-T2-NOT: mov
; CHECK-T2: orr r0, r0, #1044480
; CHECK-T2: eor.w r0, r1, r0, lsl #5
; 33423360 = 1044480 << 5, so (a << 5) | 33423360 can unfold to
; (a | 1044480) << 5.
define arm_aapcscc i32 @unfold4(i32 %a, i32 %b) {
entry:
%or = shl i32 %a, 5
%shl = or i32 %or, 33423360
%xor = xor i32 %shl, %b
ret i32 %xor
}

; CHECK-LABEL: unfold5
; CHECK-NOT: mov
; CHECK: add r0, r0, #496
; CHECK: orr r0, r1, r0, lsl #6
; CHECK-T2: add.w r0, r0, #496
; CHECK-T2: orr.w r0, r1, r0, lsl #6
; 31744 = 496 << 6, so (a << 6) + 31744 can unfold to (a + 496) << 6.
define arm_aapcscc i32 @unfold5(i32 %a, i32 %b) {
entry:
%add = shl i32 %a, 6
%shl = add i32 %add, 31744
%or = or i32 %shl, %b
ret i32 %or
}

; CHECK-LABEL: unfold6
; CHECK-NOT: mov
; CHECK: add r0, r0, #7936
; CHECK: and r0, r1, r0, lsl #8
; CHECK-T2-NOT: mov
; CHECK-T2: add.w r0, r0, #7936
; CHECK-T2: and.w r0, r1, r0, lsl #8
; 2031616 = 7936 << 8, so (a << 8) + 2031616 can unfold to (a + 7936) << 8.
define arm_aapcscc i32 @unfold6(i32 %a, i32 %b) {
entry:
%add = shl i32 %a, 8
%shl = add i32 %add, 2031616
%and = and i32 %shl, %b
ret i32 %and
}

; CHECK-LABEL: unfold7
; CHECK-NOT: mov
; CHECK: and r0, r0, #256
; CHECK: add r0, r1, r0, lsl #1
; CHECK-T2-NOT: mov
; CHECK-T2: and r0, r0, #256
; CHECK-T2: add.w r0, r1, r0, lsl #1
; 512 = 256 << 1, so (a << 1) & 512 can unfold to (a & 256) << 1.
define arm_aapcscc i32 @unfold7(i32 %a, i32 %b) {
entry:
%shl = shl i32 %a, 1
%and = and i32 %shl, 512
%add = add nsw i32 %and, %b
ret i32 %add
}

; CHECK-LABEL: unfold8
; CHECK-NOT: mov
; CHECK: add r0, r0, #126976
; CHECK: eor r0, r1, r0, lsl #9
; CHECK-T2-NOT: mov
; CHECK-T2: add.w r0, r0, #126976
; CHECK-T2: eor.w r0, r1, r0, lsl #9
; 65011712 = 126976 << 9, so (a << 9) + 65011712 can unfold to
; (a + 126976) << 9.
define arm_aapcscc i32 @unfold8(i32 %a, i32 %b) {
entry:
%add = shl i32 %a, 9
%shl = add i32 %add, 65011712
%xor = xor i32 %shl, %b
ret i32 %xor
}

; CHECK-LABEL: unfold9
; CHECK-NOT: mov
; CHECK: eor r0, r0, #255
; CHECK: add r0, r1, r0, lsl #1
; CHECK-T2-NOT: mov
; CHECK-T2: eor r0, r0, #255
; CHECK-T2: add.w r0, r1, r0, lsl #1
; 510 = 255 << 1, so (a << 1) ^ 510 can unfold to (a ^ 255) << 1.
define arm_aapcscc i32 @unfold9(i32 %a, i32 %b) {
entry:
%shl = shl i32 %a, 1
%xor = xor i32 %shl, 510
%add = add nsw i32 %xor, %b
ret i32 %add
}

; CHECK-LABEL: unfold10
; CHECK-NOT: mov r2
; CHECK: orr r2, r0, #4080
; CHECK: cmp r1, r2, lsl #10
; CHECK-T2-NOT: mov.w r2
; CHECK-T2: orr r2, r0, #4080
; CHECK-T2: cmp.w r1, r2, lsl #10
; 4177920 = 4080 << 10; the unfolded shl feeds a compare (ARMISD::CMP user).
define arm_aapcscc i32 @unfold10(i32 %a, i32 %b) {
entry:
%or = shl i32 %a, 10
%shl = or i32 %or, 4177920
%cmp = icmp sgt i32 %shl, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
}

; CHECK-LABEL: unfold11
; CHECK-NOT: mov r2
; CHECK: add r2, r0, #7936
; CHECK: cmp r1, r2, lsl #11
; CHECK-T2-NOT: mov.w r2
; CHECK-T2: add.w r2, r0, #7936
; CHECK-T2: cmp.w r1, r2, lsl #11
; 16252928 = 7936 << 11; the unfolded shl feeds a compare (ARMISD::CMP user).
define arm_aapcscc i32 @unfold11(i32 %a, i32 %b) {
entry:
%add = shl i32 %a, 11
%shl = add i32 %add, 16252928
%cmp = icmp sgt i32 %shl, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
}

0 comments on commit b7c0518

Please sign in to comment.