Skip to content

Commit

Permalink
[SystemZ] Handle sub-128 vectors
Browse files Browse the repository at this point in the history
The ABI allows sub-128 vectors to be passed and returned in registers,
with the vector occupying the upper part of a register.  We therefore
want to legalize those types by widening the vector rather than promoting
the elements.

The patch includes some simple tests for sub-128 vectors and also tests
that we can recognize various pack sequences, some of which use sub-128
vectors as temporary results.  One of these forms is based on the pack
sequences generated by llvmpipe when no intrinsics are used.

Signed unpacks are recognized as BUILD_VECTORs whose elements are
individually sign-extended.  Unsigned unpacks can have the equivalent
form with zero extension, but they also occur as shuffles in which some
elements are zero.

Based on a patch by Richard Sandiford.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@236525 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
uweigand committed May 5, 2015
1 parent cf0fa9b commit 538287d
Show file tree
Hide file tree
Showing 20 changed files with 1,175 additions and 29 deletions.
17 changes: 17 additions & 0 deletions lib/Target/SystemZ/SystemZCallingConv.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@ class SystemZCCState : public CCState {
/// See ISD::OutputArg::IsFixed.
SmallVector<bool, 4> ArgIsFixed;

/// Records whether the value was widened from a short vector type.
SmallVector<bool, 4> ArgIsShortVector;

// Check whether ArgVT is a short vector type.
bool IsShortVectorType(EVT ArgVT) {
return ArgVT.isVector() && ArgVT.getStoreSize() <= 8;
}

public:
SystemZCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
SmallVectorImpl<CCValAssign> &locs, LLVMContext &C)
Expand All @@ -39,6 +47,10 @@ class SystemZCCState : public CCState {
ArgIsFixed.clear();
for (unsigned i = 0; i < Ins.size(); ++i)
ArgIsFixed.push_back(true);
// Record whether the call operand was a short vector.
ArgIsShortVector.clear();
for (unsigned i = 0; i < Ins.size(); ++i)
ArgIsShortVector.push_back(IsShortVectorType(Ins[i].ArgVT));

CCState::AnalyzeFormalArguments(Ins, Fn);
}
Expand All @@ -49,6 +61,10 @@ class SystemZCCState : public CCState {
ArgIsFixed.clear();
for (unsigned i = 0; i < Outs.size(); ++i)
ArgIsFixed.push_back(Outs[i].IsFixed);
// Record whether the call operand was a short vector.
ArgIsShortVector.clear();
for (unsigned i = 0; i < Outs.size(); ++i)
ArgIsShortVector.push_back(IsShortVectorType(Outs[i].ArgVT));

CCState::AnalyzeCallOperands(Outs, Fn);
}
Expand All @@ -60,6 +76,7 @@ class SystemZCCState : public CCState {
CCAssignFn Fn) = delete;

bool IsFixed(unsigned ValNo) { return ArgIsFixed[ValNo]; }
bool IsShortVector(unsigned ValNo) { return ArgIsShortVector[ValNo]; }
};

} // end namespace llvm
Expand Down
17 changes: 16 additions & 1 deletion lib/Target/SystemZ/SystemZCallingConv.td
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ class CCIfSubtarget<string F, CCAction A>
class CCIfFixed<CCAction A>
: CCIf<"static_cast<SystemZCCState *>(&State)->IsFixed(ValNo)", A>;

// Match if this specific argument was widened from a short vector type.
class CCIfShortVector<CCAction A>
: CCIf<"static_cast<SystemZCCState *>(&State)->IsShortVector(ValNo)", A>;


//===----------------------------------------------------------------------===//
// z/Linux return value calling convention
//===----------------------------------------------------------------------===//
Expand All @@ -43,6 +48,8 @@ def RetCC_SystemZ : CallingConv<[
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,

// Similarly for vectors, with V24 being the ABI-compliant choice.
// Sub-128 vectors are returned in the same way, but they're widened
// to one of these types during type legalization.
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>>
Expand Down Expand Up @@ -74,12 +81,20 @@ def CC_SystemZ : CallingConv<[
CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,

// The first 8 named vector arguments are passed in V24-V31.
// The first 8 named vector arguments are passed in V24-V31. Sub-128 vectors
// are passed in the same way, but they're widened to one of these types
// during type legalization.
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
CCIfFixed<CCAssignToReg<[V24, V26, V28, V30,
V25, V27, V29, V31]>>>>,

// However, sub-128 vectors which need to go on the stack occupy just a
// single 8-byte-aligned 8-byte stack slot. Pass as i64.
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
CCIfShortVector<CCBitConvertToType<i64>>>>,

// Other vector arguments are passed in 8-byte-aligned 16-byte stack slots.
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
Expand Down
81 changes: 72 additions & 9 deletions lib/Target/SystemZ/SystemZISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
// Convert a GPR scalar to a vector by inserting it into element 0.
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);

// Use a series of unpacks for extensions.
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);

// Detect shifts by a scalar amount and convert them into
// V*_BY_SCALAR.
setOperationAction(ISD::SHL, VT, Custom);
Expand Down Expand Up @@ -793,7 +797,15 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDLoc DL,
else if (VA.getLocInfo() == CCValAssign::Indirect)
Value = DAG.getLoad(VA.getValVT(), DL, Chain, Value,
MachinePointerInfo(), false, false, false, 0);
else
else if (VA.getLocInfo() == CCValAssign::BCvt) {
// If this is a short vector argument loaded from the stack,
// extend from i64 to full vector size and then bitcast.
assert(VA.getLocVT() == MVT::i64);
assert(VA.getValVT().isVector());
Value = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i64,
Value, DAG.getUNDEF(MVT::i64));
Value = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Value);
} else
assert(VA.getLocInfo() == CCValAssign::Full && "Unsupported getLocInfo");
return Value;
}
Expand All @@ -810,6 +822,14 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDLoc DL,
return DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Value);
case CCValAssign::AExt:
return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value);
case CCValAssign::BCvt:
// If this is a short vector argument to be stored to the stack,
// bitcast to v2i64 and then extract first element.
assert(VA.getLocVT() == MVT::i64);
assert(VA.getValVT().isVector());
Value = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Value);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), Value,
DAG.getConstant(0, DL, MVT::i32));
case CCValAssign::Full:
return Value;
default:
Expand Down Expand Up @@ -3910,6 +3930,23 @@ SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
return DAG.getNode(ISD::BITCAST, DL, VT, Res);
}

SDValue
SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
unsigned UnpackHigh) const {
SDValue PackedOp = Op.getOperand(0);
EVT OutVT = Op.getValueType();
EVT InVT = PackedOp.getValueType();
unsigned ToBits = OutVT.getVectorElementType().getSizeInBits();
unsigned FromBits = InVT.getVectorElementType().getSizeInBits();
do {
FromBits *= 2;
EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
SystemZ::VectorBits / FromBits);
PackedOp = DAG.getNode(UnpackHigh, SDLoc(PackedOp), OutVT, PackedOp);
} while (FromBits != ToBits);
return PackedOp;
}

SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
unsigned ByScalar) const {
// Look for cases where a vector shift can use the *_BY_SCALAR form.
Expand Down Expand Up @@ -4058,6 +4095,10 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT:
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::SIGN_EXTEND_VECTOR_INREG:
return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH);
case ISD::ZERO_EXTEND_VECTOR_INREG:
return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH);
case ISD::SHL:
return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR);
case ISD::SRL:
Expand Down Expand Up @@ -4122,6 +4163,10 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(PERMUTE_DWORDS);
OPCODE(PERMUTE);
OPCODE(PACK);
OPCODE(UNPACK_HIGH);
OPCODE(UNPACKL_HIGH);
OPCODE(UNPACK_LOW);
OPCODE(UNPACKL_LOW);
OPCODE(VSHL_BY_SCALAR);
OPCODE(VSRL_BY_SCALAR);
OPCODE(VSRA_BY_SCALAR);
Expand Down Expand Up @@ -4334,17 +4379,35 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
}
}
}
// (z_merge_high 0, 0) -> 0. This is mostly useful for using VLLEZF
// for v4f32.
if (Opcode == SystemZISD::MERGE_HIGH) {
if (Opcode == SystemZISD::MERGE_HIGH ||
Opcode == SystemZISD::MERGE_LOW) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
if (Op0 == Op1) {
if (Op0.getOpcode() == ISD::BITCAST)
Op0 = Op0.getOperand(0);
if (Op0.getOpcode() == SystemZISD::BYTE_MASK &&
cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0)
if (Op0.getOpcode() == ISD::BITCAST)
Op0 = Op0.getOperand(0);
if (Op0.getOpcode() == SystemZISD::BYTE_MASK &&
cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0) {
// (z_merge_* 0, 0) -> 0. This is mostly useful for using VLLEZF
// for v4f32.
if (Op1 == N->getOperand(0))
return Op1;
// (z_merge_? 0, X) -> (z_unpackl_? 0, X).
EVT VT = Op1.getValueType();
unsigned ElemBytes = VT.getVectorElementType().getStoreSize();
if (ElemBytes <= 4) {
Opcode = (Opcode == SystemZISD::MERGE_HIGH ?
SystemZISD::UNPACKL_HIGH : SystemZISD::UNPACKL_LOW);
EVT InVT = VT.changeVectorElementTypeToInteger();
EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ElemBytes * 16),
SystemZ::VectorBytes / ElemBytes / 2);
if (VT != InVT) {
Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), InVT, Op1);
DCI.AddToWorklist(Op1.getNode());
}
SDValue Op = DAG.getNode(Opcode, SDLoc(N), OutVT, Op1);
DCI.AddToWorklist(Op.getNode());
return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
}
}
}
// If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better
Expand Down
28 changes: 28 additions & 0 deletions lib/Target/SystemZ/SystemZISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,15 @@ enum {
// Pack vector operands 0 and 1 into a single vector with half-sized elements.
PACK,

// Unpack the first half of vector operand 0 into double-sized elements.
// UNPACK_HIGH sign-extends and UNPACKL_HIGH zero-extends.
UNPACK_HIGH,
UNPACKL_HIGH,

// Likewise for the second half.
UNPACK_LOW,
UNPACKL_LOW,

// Shift each element of vector operand 0 by the number of bits specified
// by scalar operand 1.
VSHL_BY_SCALAR,
Expand Down Expand Up @@ -306,6 +315,23 @@ class SystemZTargetLowering : public TargetLowering {
// want to clobber the upper 32 bits of a GPR unnecessarily.
return MVT::i32;
}
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT)
const override {
// Widen subvectors to the full width rather than promoting integer
// elements. This is better because:
//
// (a) it means that we can handle the ABI for passing and returning
// sub-128 vectors without having to handle them as legal types.
//
// (b) we don't have instructions to extend on load and truncate on store,
// so promoting the integers is less efficient.
//
// (c) there are no multiplication instructions for the widest integer
// type (v2i64).
if (VT.getVectorElementType().getSizeInBits() % 8 == 0)
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
EVT getSetCCResultType(LLVMContext &, EVT) const override;
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
Expand Down Expand Up @@ -417,6 +443,8 @@ class SystemZTargetLowering : public TargetLowering {
SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
unsigned UnpackHigh) const;
SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;

SDValue combineExtract(SDLoc DL, EVT ElemVT, EVT VecVT, SDValue OrigOp,
Expand Down
24 changes: 12 additions & 12 deletions lib/Target/SystemZ/SystemZInstrVector.td
Original file line number Diff line number Diff line change
Expand Up @@ -290,24 +290,24 @@ let Predicates = [FeatureVector] in {
def : Pat<(z_vsei32_by_parts (v4i32 VR128:$src)), (VSEGF VR128:$src)>;

// Unpack high.
def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, null_frag, v128h, v128b, 0>;
def VUPHH : UnaryVRRa<"vuphh", 0xE7D7, null_frag, v128f, v128h, 1>;
def VUPHF : UnaryVRRa<"vuphf", 0xE7D7, null_frag, v128g, v128f, 2>;
def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, z_unpack_high, v128h, v128b, 0>;
def VUPHH : UnaryVRRa<"vuphh", 0xE7D7, z_unpack_high, v128f, v128h, 1>;
def VUPHF : UnaryVRRa<"vuphf", 0xE7D7, z_unpack_high, v128g, v128f, 2>;

// Unpack logical high.
def VUPLHB : UnaryVRRa<"vuplhb", 0xE7D5, null_frag, v128h, v128b, 0>;
def VUPLHH : UnaryVRRa<"vuplhh", 0xE7D5, null_frag, v128f, v128h, 1>;
def VUPLHF : UnaryVRRa<"vuplhf", 0xE7D5, null_frag, v128g, v128f, 2>;
def VUPLHB : UnaryVRRa<"vuplhb", 0xE7D5, z_unpackl_high, v128h, v128b, 0>;
def VUPLHH : UnaryVRRa<"vuplhh", 0xE7D5, z_unpackl_high, v128f, v128h, 1>;
def VUPLHF : UnaryVRRa<"vuplhf", 0xE7D5, z_unpackl_high, v128g, v128f, 2>;

// Unpack low.
def VUPLB : UnaryVRRa<"vuplb", 0xE7D6, null_frag, v128h, v128b, 0>;
def VUPLHW : UnaryVRRa<"vuplhw", 0xE7D6, null_frag, v128f, v128h, 1>;
def VUPLF : UnaryVRRa<"vuplf", 0xE7D6, null_frag, v128g, v128f, 2>;
def VUPLB : UnaryVRRa<"vuplb", 0xE7D6, z_unpack_low, v128h, v128b, 0>;
def VUPLHW : UnaryVRRa<"vuplhw", 0xE7D6, z_unpack_low, v128f, v128h, 1>;
def VUPLF : UnaryVRRa<"vuplf", 0xE7D6, z_unpack_low, v128g, v128f, 2>;

// Unpack logical low.
def VUPLLB : UnaryVRRa<"vupllb", 0xE7D4, null_frag, v128h, v128b, 0>;
def VUPLLH : UnaryVRRa<"vupllh", 0xE7D4, null_frag, v128f, v128h, 1>;
def VUPLLF : UnaryVRRa<"vupllf", 0xE7D4, null_frag, v128g, v128f, 2>;
def VUPLLB : UnaryVRRa<"vupllb", 0xE7D4, z_unpackl_low, v128h, v128b, 0>;
def VUPLLH : UnaryVRRa<"vupllh", 0xE7D4, z_unpackl_low, v128f, v128h, 1>;
def VUPLLF : UnaryVRRa<"vupllf", 0xE7D4, z_unpackl_low, v128g, v128f, 2>;
}

//===----------------------------------------------------------------------===//
Expand Down
15 changes: 10 additions & 5 deletions lib/Target/SystemZ/SystemZOperators.td
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,10 @@ def z_permute_dwords : SDNode<"SystemZISD::PERMUTE_DWORDS",
SDT_ZVecTernaryInt>;
def z_permute : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>;
def z_pack : SDNode<"SystemZISD::PACK", SDT_ZVecBinaryConv>;
def z_unpack_high : SDNode<"SystemZISD::UNPACK_HIGH", SDT_ZVecUnaryConv>;
def z_unpackl_high : SDNode<"SystemZISD::UNPACKL_HIGH", SDT_ZVecUnaryConv>;
def z_unpack_low : SDNode<"SystemZISD::UNPACK_LOW", SDT_ZVecUnaryConv>;
def z_unpackl_low : SDNode<"SystemZISD::UNPACKL_LOW", SDT_ZVecUnaryConv>;
def z_vshl_by_scalar : SDNode<"SystemZISD::VSHL_BY_SCALAR",
SDT_ZVecBinaryInt>;
def z_vsrl_by_scalar : SDNode<"SystemZISD::VSRL_BY_SCALAR",
Expand Down Expand Up @@ -544,11 +548,12 @@ def z_vllezi64 : PatFrag<(ops node:$addr),
def z_vllezf32 : PatFrag<(ops node:$addr),
(bitconvert
(z_merge_high
(v2i64 (bitconvert
(z_merge_high
(v4f32 (z_vzero)),
(v4f32 (scalar_to_vector
(f32 (load node:$addr))))))),
(v2i64
(z_unpackl_high
(v4i32
(bitconvert
(v4f32 (scalar_to_vector
(f32 (load node:$addr)))))))),
(v2i64 (z_vzero))))>;
def z_vllezf64 : PatFrag<(ops node:$addr),
(z_merge_high
Expand Down
14 changes: 14 additions & 0 deletions test/CodeGen/SystemZ/vec-args-03.ll
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,17 @@ define <4 x i32> @foo(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4
%y = sub <4 x i32> %v2, %v10
ret <4 x i32> %y
}

; This routine has 10 vector arguments, which fill up %v24-%v31 and
; the two single-wide stack slots at 160 and 168.
define <4 x i8> @bar(<4 x i8> %v1, <4 x i8> %v2, <4 x i8> %v3, <4 x i8> %v4,
<4 x i8> %v5, <4 x i8> %v6, <4 x i8> %v7, <4 x i8> %v8,
<4 x i8> %v9, <4 x i8> %v10) {
; CHECK-LABEL: bar:
; CHECK: vlrepg [[REG1:%v[0-9]+]], 168(%r15)
; CHECK: vsb %v24, %v26, [[REG1]]
; CHECK: br %r14
%y = sub <4 x i8> %v2, %v10
ret <4 x i8> %y
}

Loading

0 comments on commit 538287d

Please sign in to comment.