[SystemZ] Handle sub-128 vectors

The ABI allows sub-128 vectors to be passed and returned in registers, with the vector occupying the upper part of a register. We therefore want to legalize those types by widening the vector rather than promoting the elements. The patch includes some simple tests for sub-128 vectors and also tests that we can recognize various pack sequences, some of which use sub-128 vectors as temporary results. One of these forms is based on the pack sequences generated by llvmpipe when no intrinsics are used. Signed unpacks are recognized as BUILD_VECTORs whose elements are individually sign-extended. Unsigned unpacks can have the equivalent form with zero extension, but they also occur as shuffles in which some elements are zero. Based on a patch by Richard Sandiford. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@236525 91177308-0d34-0410-b5e6-96231b3b80d8
ROCm · May 5, 2015 · 538287d · 538287d
1 parent cf0fa9b
commit 538287d
Show file tree

Hide file tree

Showing 20 changed files with 1,175 additions and 29 deletions.
diff --git a/lib/Target/SystemZ/SystemZCallingConv.h b/lib/Target/SystemZ/SystemZCallingConv.h
@@ -28,6 +28,14 @@ class SystemZCCState : public CCState {
   /// See ISD::OutputArg::IsFixed.
   SmallVector<bool, 4> ArgIsFixed;
 
+  /// Records whether the value was widened from a short vector type.
+  SmallVector<bool, 4> ArgIsShortVector;
+
+  /// Return true if ArgVT is a vector type whose getStoreSize() is at most 8.
+  bool IsShortVectorType(EVT ArgVT) {
+    return ArgVT.isVector() && ArgVT.getStoreSize() <= 8;
+  }
+
 public:
   SystemZCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
                  SmallVectorImpl<CCValAssign> &locs, LLVMContext &C)
@@ -39,6 +47,10 @@ class SystemZCCState : public CCState {
     ArgIsFixed.clear();
     for (unsigned i = 0; i < Ins.size(); ++i)
       ArgIsFixed.push_back(true);
+    // Record whether each formal argument was a short vector.
+    ArgIsShortVector.clear();
+    for (unsigned i = 0; i < Ins.size(); ++i)
+      ArgIsShortVector.push_back(IsShortVectorType(Ins[i].ArgVT));
 
     CCState::AnalyzeFormalArguments(Ins, Fn);
   }
@@ -49,6 +61,10 @@ class SystemZCCState : public CCState {
     ArgIsFixed.clear();
     for (unsigned i = 0; i < Outs.size(); ++i)
       ArgIsFixed.push_back(Outs[i].IsFixed);
+    // Record whether the call operand was a short vector.
+    ArgIsShortVector.clear();
+    for (unsigned i = 0; i < Outs.size(); ++i)
+      ArgIsShortVector.push_back(IsShortVectorType(Outs[i].ArgVT));
 
     CCState::AnalyzeCallOperands(Outs, Fn);
   }
@@ -60,6 +76,7 @@ class SystemZCCState : public CCState {
                            CCAssignFn Fn) = delete;
 
   bool IsFixed(unsigned ValNo) { return ArgIsFixed[ValNo]; }
+  bool IsShortVector(unsigned ValNo) { return ArgIsShortVector[ValNo]; }
 };
 
 } // end namespace llvm

diff --git a/lib/Target/SystemZ/SystemZCallingConv.td b/lib/Target/SystemZ/SystemZCallingConv.td
@@ -21,6 +21,11 @@ class CCIfSubtarget<string F, CCAction A>
 class CCIfFixed<CCAction A>
     : CCIf<"static_cast<SystemZCCState *>(&State)->IsFixed(ValNo)", A>;
 
+// Match if this specific argument was widened from a short vector type.
+class CCIfShortVector<CCAction A>
+    : CCIf<"static_cast<SystemZCCState *>(&State)->IsShortVector(ValNo)", A>;
+
+
 //===----------------------------------------------------------------------===//
 // z/Linux return value calling convention
 //===----------------------------------------------------------------------===//
@@ -43,6 +48,8 @@ def RetCC_SystemZ : CallingConv<[
   CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
 
   // Similarly for vectors, with V24 being the ABI-compliant choice.
+  // Sub-128 vectors are returned in the same way, but they're widened
+  // to one of these types during type legalization.
   CCIfSubtarget<"hasVector()",
     CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
              CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>>
@@ -74,12 +81,20 @@ def CC_SystemZ : CallingConv<[
   CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
   CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
 
-  // The first 8 named vector arguments are passed in V24-V31.
+  // The first 8 named vector arguments are passed in V24-V31.  Sub-128 vectors
+  // are passed in the same way, but they're widened to one of these types
+  // during type legalization.
   CCIfSubtarget<"hasVector()",
     CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
              CCIfFixed<CCAssignToReg<[V24, V26, V28, V30,
                                       V25, V27, V29, V31]>>>>,
 
+  // However, sub-128 vectors which need to go on the stack occupy just a
+  // single 8-byte-aligned 8-byte stack slot.  Pass as i64.
+  CCIfSubtarget<"hasVector()",
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+             CCIfShortVector<CCBitConvertToType<i64>>>>,
+
   // Other vector arguments are passed in 8-byte-aligned 16-byte stack slots.
   CCIfSubtarget<"hasVector()",
     CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],

diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -318,6 +318,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
       // Convert a GPR scalar to a vector by inserting it into element 0.
       setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
 
+      // Use a series of unpacks for extensions.
+      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
+      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
+
       // Detect shifts by a scalar amount and convert them into
       // V*_BY_SCALAR.
       setOperationAction(ISD::SHL, VT, Custom);
@@ -793,7 +797,15 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDLoc DL,
   else if (VA.getLocInfo() == CCValAssign::Indirect)
     Value = DAG.getLoad(VA.getValVT(), DL, Chain, Value,
                         MachinePointerInfo(), false, false, false, 0);
-  else
+  else if (VA.getLocInfo() == CCValAssign::BCvt) {
+    // If this is a short vector argument loaded from the stack,
+    // extend from i64 to full vector size and then bitcast.
+    assert(VA.getLocVT() == MVT::i64);
+    assert(VA.getValVT().isVector());
+    Value = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i64,
+                        Value, DAG.getUNDEF(MVT::i64));
+    Value = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Value);
+  } else
     assert(VA.getLocInfo() == CCValAssign::Full && "Unsupported getLocInfo");
   return Value;
 }
@@ -810,6 +822,14 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDLoc DL,
     return DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Value);
   case CCValAssign::AExt:
     return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value);
+  case CCValAssign::BCvt:
+    // If this is a short vector argument to be stored to the stack,
+    // bitcast to v2i64 and then extract first element.
+    assert(VA.getLocVT() == MVT::i64);
+    assert(VA.getValVT().isVector());
+    Value = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Value);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), Value,
+                       DAG.getConstant(0, DL, MVT::i32));
   case CCValAssign::Full:
     return Value;
   default:
@@ -3910,6 +3930,23 @@ SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
   return DAG.getNode(ISD::BITCAST, DL, VT, Res);
 }
 
+/// Lower an *_EXTEND_VECTOR_INREG node \p Op by applying the unary node
+/// \p UnpackHigh (SystemZISD::UNPACK_HIGH for sign extension,
+/// SystemZISD::UNPACKL_HIGH for zero extension) repeatedly.  Each unpack
+/// doubles the element width, so the loop runs until the element width of
+/// the result type of \p Op is reached.  Returns the final unpack node.
+SDValue
+SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
+                                              unsigned UnpackHigh) const {
+  SDValue PackedOp = Op.getOperand(0);
+  EVT OutVT = Op.getValueType();
+  EVT InVT = PackedOp.getValueType();
+  unsigned ToBits = OutVT.getVectorElementType().getSizeInBits();
+  unsigned FromBits = InVT.getVectorElementType().getSizeInBits();
+  // The do/while below only terminates correctly for a genuine widening.
+  assert(FromBits < ToBits && "Expected an extension to wider elements");
+  do {
+    FromBits *= 2;
+    // Integer vector of SystemZ::VectorBits total bits with FromBits-wide
+    // elements.  Named distinctly so it does not shadow OutVT above.
+    EVT StepVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
+                                  SystemZ::VectorBits / FromBits);
+    PackedOp = DAG.getNode(UnpackHigh, SDLoc(PackedOp), StepVT, PackedOp);
+  } while (FromBits != ToBits);
+  return PackedOp;
+}
+
 SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
                                           unsigned ByScalar) const {
   // Look for cases where a vector shift can use the *_BY_SCALAR form.
@@ -4058,6 +4095,10 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
     return lowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT:
     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::SIGN_EXTEND_VECTOR_INREG:
+    return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH);
+  case ISD::ZERO_EXTEND_VECTOR_INREG:
+    return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH);
   case ISD::SHL:
     return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR);
   case ISD::SRL:
@@ -4122,6 +4163,10 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
     OPCODE(PERMUTE_DWORDS);
     OPCODE(PERMUTE);
     OPCODE(PACK);
+    OPCODE(UNPACK_HIGH);
+    OPCODE(UNPACKL_HIGH);
+    OPCODE(UNPACK_LOW);
+    OPCODE(UNPACKL_LOW);
     OPCODE(VSHL_BY_SCALAR);
     OPCODE(VSRL_BY_SCALAR);
     OPCODE(VSRA_BY_SCALAR);
@@ -4334,17 +4379,35 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
       }
     }
   }
-  // (z_merge_high 0, 0) -> 0.  This is mostly useful for using VLLEZF
-  // for v4f32.
-  if (Opcode == SystemZISD::MERGE_HIGH) {
+  if (Opcode == SystemZISD::MERGE_HIGH ||
+      Opcode == SystemZISD::MERGE_LOW) {
     SDValue Op0 = N->getOperand(0);
     SDValue Op1 = N->getOperand(1);
-    if (Op0 == Op1) {
-      if (Op0.getOpcode() == ISD::BITCAST)
-        Op0 = Op0.getOperand(0);
-      if (Op0.getOpcode() == SystemZISD::BYTE_MASK &&
-          cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0)
+    if (Op0.getOpcode() == ISD::BITCAST)
+      Op0 = Op0.getOperand(0);
+    if (Op0.getOpcode() == SystemZISD::BYTE_MASK &&
+        cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0) {
+      // (z_merge_* 0, 0) -> 0.  This is mostly useful for using VLLEZF
+      // for v4f32.
+      if (Op1 == N->getOperand(0))
         return Op1;
+      // (z_merge_? 0, X) -> (bitcast (z_unpackl_? X)).
+      EVT VT = Op1.getValueType();
+      unsigned ElemBytes = VT.getVectorElementType().getStoreSize();
+      if (ElemBytes <= 4) {
+        Opcode = (Opcode == SystemZISD::MERGE_HIGH ?
+                  SystemZISD::UNPACKL_HIGH : SystemZISD::UNPACKL_LOW);
+        EVT InVT = VT.changeVectorElementTypeToInteger();
+        EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ElemBytes * 16),
+                                     SystemZ::VectorBytes / ElemBytes / 2);
+        if (VT != InVT) {
+          Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), InVT, Op1);
+          DCI.AddToWorklist(Op1.getNode());
+        }
+        SDValue Op = DAG.getNode(Opcode, SDLoc(N), OutVT, Op1);
+        DCI.AddToWorklist(Op.getNode());
+        return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
+      }
     }
   }
   // If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better

diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -201,6 +201,15 @@ enum {
   // Pack vector operands 0 and 1 into a single vector with half-sized elements.
   PACK,
 
+  // Unpack the first half of vector operand 0 into double-sized elements.
+  // UNPACK_HIGH sign-extends and UNPACKL_HIGH zero-extends.
+  UNPACK_HIGH,
+  UNPACKL_HIGH,
+
+  // Likewise for the second half.
+  UNPACK_LOW,
+  UNPACKL_LOW,
+
   // Shift each element of vector operand 0 by the number of bits specified
   // by scalar operand 1.
   VSHL_BY_SCALAR,
@@ -306,6 +315,23 @@ class SystemZTargetLowering : public TargetLowering {
     // want to clobber the upper 32 bits of a GPR unnecessarily.
     return MVT::i32;
   }
+  TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT)
+    const override {
+    // Widen vectors whose element size is a multiple of 8 bits to the full
+    // width rather than promoting integer elements.  This is better because:
+    //
+    // (a) it means that we can handle the ABI for passing and returning
+    //     sub-128 vectors without having to handle them as legal types.
+    //
+    // (b) we don't have instructions to extend on load and truncate on store,
+    //     so promoting the integers is less efficient.
+    //
+    // (c) there are no multiplication instructions for the widest integer
+    //     type (v2i64).
+    if (VT.getVectorElementType().getSizeInBits() % 8 == 0)
+      return TypeWidenVector;
+    return TargetLoweringBase::getPreferredVectorAction(VT);
+  }
   EVT getSetCCResultType(LLVMContext &, EVT) const override;
   bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
   bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
@@ -417,6 +443,8 @@ class SystemZTargetLowering : public TargetLowering {
   SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
+                                 unsigned UnpackHigh) const;
   SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
 
   SDValue combineExtract(SDLoc DL, EVT ElemVT, EVT VecVT, SDValue OrigOp,

diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td
@@ -290,24 +290,24 @@ let Predicates = [FeatureVector] in {
   def : Pat<(z_vsei32_by_parts (v4i32 VR128:$src)), (VSEGF VR128:$src)>;
 
   // Unpack high.
-  def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, null_frag, v128h, v128b, 0>;
-  def VUPHH : UnaryVRRa<"vuphh", 0xE7D7, null_frag, v128f, v128h, 1>;
-  def VUPHF : UnaryVRRa<"vuphf", 0xE7D7, null_frag, v128g, v128f, 2>;
+  def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, z_unpack_high, v128h, v128b, 0>;
+  def VUPHH : UnaryVRRa<"vuphh", 0xE7D7, z_unpack_high, v128f, v128h, 1>;
+  def VUPHF : UnaryVRRa<"vuphf", 0xE7D7, z_unpack_high, v128g, v128f, 2>;
 
   // Unpack logical high.
-  def VUPLHB : UnaryVRRa<"vuplhb", 0xE7D5, null_frag, v128h, v128b, 0>;
-  def VUPLHH : UnaryVRRa<"vuplhh", 0xE7D5, null_frag, v128f, v128h, 1>;
-  def VUPLHF : UnaryVRRa<"vuplhf", 0xE7D5, null_frag, v128g, v128f, 2>;
+  def VUPLHB : UnaryVRRa<"vuplhb", 0xE7D5, z_unpackl_high, v128h, v128b, 0>;
+  def VUPLHH : UnaryVRRa<"vuplhh", 0xE7D5, z_unpackl_high, v128f, v128h, 1>;
+  def VUPLHF : UnaryVRRa<"vuplhf", 0xE7D5, z_unpackl_high, v128g, v128f, 2>;
 
   // Unpack low.
-  def VUPLB  : UnaryVRRa<"vuplb",  0xE7D6, null_frag, v128h, v128b, 0>;
-  def VUPLHW : UnaryVRRa<"vuplhw", 0xE7D6, null_frag, v128f, v128h, 1>;
-  def VUPLF  : UnaryVRRa<"vuplf",  0xE7D6, null_frag, v128g, v128f, 2>;
+  def VUPLB  : UnaryVRRa<"vuplb",  0xE7D6, z_unpack_low, v128h, v128b, 0>;
+  def VUPLHW : UnaryVRRa<"vuplhw", 0xE7D6, z_unpack_low, v128f, v128h, 1>;
+  def VUPLF  : UnaryVRRa<"vuplf",  0xE7D6, z_unpack_low, v128g, v128f, 2>;
 
   // Unpack logical low.
-  def VUPLLB : UnaryVRRa<"vupllb", 0xE7D4, null_frag, v128h, v128b, 0>;
-  def VUPLLH : UnaryVRRa<"vupllh", 0xE7D4, null_frag, v128f, v128h, 1>;
-  def VUPLLF : UnaryVRRa<"vupllf", 0xE7D4, null_frag, v128g, v128f, 2>;
+  def VUPLLB : UnaryVRRa<"vupllb", 0xE7D4, z_unpackl_low, v128h, v128b, 0>;
+  def VUPLLH : UnaryVRRa<"vupllh", 0xE7D4, z_unpackl_low, v128f, v128h, 1>;
+  def VUPLLF : UnaryVRRa<"vupllf", 0xE7D4, z_unpackl_low, v128g, v128f, 2>;
 }
 
 //===----------------------------------------------------------------------===//

diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
@@ -193,6 +193,10 @@ def z_permute_dwords    : SDNode<"SystemZISD::PERMUTE_DWORDS",
                                  SDT_ZVecTernaryInt>;
 def z_permute           : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>;
 def z_pack              : SDNode<"SystemZISD::PACK", SDT_ZVecBinaryConv>;
+def z_unpack_high       : SDNode<"SystemZISD::UNPACK_HIGH", SDT_ZVecUnaryConv>;
+def z_unpackl_high      : SDNode<"SystemZISD::UNPACKL_HIGH", SDT_ZVecUnaryConv>;
+def z_unpack_low        : SDNode<"SystemZISD::UNPACK_LOW", SDT_ZVecUnaryConv>;
+def z_unpackl_low       : SDNode<"SystemZISD::UNPACKL_LOW", SDT_ZVecUnaryConv>;
 def z_vshl_by_scalar    : SDNode<"SystemZISD::VSHL_BY_SCALAR",
                                  SDT_ZVecBinaryInt>;
 def z_vsrl_by_scalar    : SDNode<"SystemZISD::VSRL_BY_SCALAR",
@@ -544,11 +548,12 @@ def z_vllezi64 : PatFrag<(ops node:$addr),
 def z_vllezf32 : PatFrag<(ops node:$addr),
                          (bitconvert
                           (z_merge_high
-                           (v2i64 (bitconvert
-                                   (z_merge_high
-                                    (v4f32 (z_vzero)),
-                                    (v4f32 (scalar_to_vector
-                                            (f32 (load node:$addr))))))),
+                           (v2i64
+                            (z_unpackl_high
+                             (v4i32
+                              (bitconvert
+                               (v4f32 (scalar_to_vector
+                                       (f32 (load node:$addr)))))))),
                            (v2i64 (z_vzero))))>;
 def z_vllezf64 : PatFrag<(ops node:$addr),
                          (z_merge_high

diff --git a/test/CodeGen/SystemZ/vec-args-03.ll b/test/CodeGen/SystemZ/vec-args-03.ll
@@ -14,3 +14,17 @@ define <4 x i32> @foo(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4
   %y = sub <4 x i32> %v2, %v10
   ret <4 x i32> %y
 }
+
+; This routine has 10 vector arguments, which fill up %v24-%v31 and
+; the two single-wide stack slots at 160 and 168.
+define <4 x i8> @bar(<4 x i8> %v1, <4 x i8> %v2, <4 x i8> %v3, <4 x i8> %v4,
+                     <4 x i8> %v5, <4 x i8> %v6, <4 x i8> %v7, <4 x i8> %v8,
+                     <4 x i8> %v9, <4 x i8> %v10) {
+; CHECK-LABEL: bar:
+; CHECK: vlrepg [[REG1:%v[0-9]+]], 168(%r15)
+; CHECK: vsb %v24, %v26, [[REG1]]
+; CHECK: br %r14
+  %y = sub <4 x i8> %v2, %v10
+  ret <4 x i8> %y
+}
+