Skip to content

Commit

Permalink
Reverting VLD1/VST1 base-updating/post-incrementing combining
Browse files Browse the repository at this point in the history
This reverts patches 223862, 224198, 224203, and 224754, which were all
related to the vector load/store combining and were reverted/reaplied
a few times due to the same alignment problems we're seeing now.

Further tests, mainly self-hosting Clang, will be needed to reapply this
patch in the future.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@228129 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
rengolin committed Feb 4, 2015
1 parent d16b9cd commit ff01f89
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 628 deletions.
116 changes: 14 additions & 102 deletions lib/Target/ARM/ARMISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FP_TO_SINT);
setTargetDAGCombine(ISD::FP_TO_UINT);
setTargetDAGCombine(ISD::FDIV);
setTargetDAGCombine(ISD::LOAD);

// It is legal to extload from v4i8 to v4i16 or v4i32.
MVT Tys[6] = {MVT::v8i8, MVT::v4i8, MVT::v2i8,
Expand Down Expand Up @@ -8868,18 +8867,17 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
DAG.getUNDEF(VT), NewMask.data());
}

/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
/// NEON load/store intrinsics, and generic vector load/stores, to merge
/// base address updates.
/// For generic load/stores, the memory type is assumed to be a vector.
/// The caller is assumed to have checked legality.
/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and
/// NEON load/store intrinsics to merge base address updates.
static SDValue CombineBaseUpdate(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();

SelectionDAG &DAG = DCI.DAG;
bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
bool isStore = N->getOpcode() == ISD::STORE;
unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
SDValue Addr = N->getOperand(AddrOpIdx);

// Search for a use of the address operand that is an increment.
Expand Down Expand Up @@ -8940,22 +8938,15 @@ static SDValue CombineBaseUpdate(SDNode *N,
case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD;
NumVecs = 1; isLaneOp = false; break;
case ISD::STORE: NewOpc = ARMISD::VST1_UPD;
NumVecs = 1; isLoad = false; isLaneOp = false; break;
}
}

// Find the size of memory referenced by the load/store.
EVT VecTy;
if (isLoad)
VecTy = N->getValueType(0);
else if (isIntrinsic)
VecTy = N->getOperand(AddrOpIdx+1).getValueType();
else
VecTy = N->getOperand(1).getValueType();

VecTy = N->getOperand(AddrOpIdx+1).getValueType();
unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
if (isLaneOp)
NumBytes /= VecTy.getVectorNumElements();
Expand All @@ -8972,85 +8963,32 @@ static SDValue CombineBaseUpdate(SDNode *N,
continue;
}

EVT AlignedVecTy = VecTy;

// If this is a less-than-standard-aligned load/store, change the type to
// match the standard alignment.
// The alignment is overlooked when selecting _UPD variants; and it's
// easier to introduce bitcasts here than fix that.
// There are 3 ways to get to this base-update combine:
// - intrinsics: they are assumed to be properly aligned (to the standard
// alignment of the memory type), so we don't need to do anything.
// - ARMISD::VLDx nodes: they are only generated from the aforementioned
// intrinsics, so, likewise, there's nothing to do.
// - generic load/store instructions: the alignment is specified as an
// explicit operand, rather than implicitly as the standard alignment
// of the memory type (like the intrisics). We need to change the
// memory type to match the explicit alignment. That way, we don't
// generate non-standard-aligned ARMISD::VLDx nodes.
if (LSBaseSDNode *LSN = dyn_cast<LSBaseSDNode>(N)) {
unsigned Alignment = LSN->getAlignment();
if (Alignment == 0)
Alignment = 1;
if (Alignment < VecTy.getScalarSizeInBits() / 8) {
MVT EltTy = MVT::getIntegerVT(Alignment * 8);
assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
assert(!isLaneOp && "Unexpected generic load/store lane.");
unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
}
}

// Create the new updating load/store node.
// First, create an SDVTList for the new updating node's results.
EVT Tys[6];
unsigned NumResultVecs = (isLoad ? NumVecs : 0);
unsigned n;
for (n = 0; n < NumResultVecs; ++n)
Tys[n] = AlignedVecTy;
Tys[n] = VecTy;
Tys[n++] = MVT::i32;
Tys[n] = MVT::Other;
SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));

// Then, gather the new node's operands.
SmallVector<SDValue, 8> Ops;
Ops.push_back(N->getOperand(0)); // incoming chain
Ops.push_back(N->getOperand(AddrOpIdx));
Ops.push_back(Inc);
if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
// Try to match the intrinsic's signature
Ops.push_back(StN->getValue());
Ops.push_back(DAG.getConstant(StN->getAlignment(), MVT::i32));
} else {
for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i)
Ops.push_back(N->getOperand(i));
}

// If this is a non-standard-aligned store, the penultimate operand is the
// stored value. Bitcast it to the aligned type.
if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
SDValue &StVal = Ops[Ops.size()-2];
StVal = DAG.getNode(ISD::BITCAST, SDLoc(N), AlignedVecTy, StVal);
for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
Ops.push_back(N->getOperand(i));
}

MemSDNode *MemInt = cast<MemSDNode>(N);
MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys,
Ops, AlignedVecTy,
Ops, MemInt->getMemoryVT(),
MemInt->getMemOperand());

// Update the uses.
std::vector<SDValue> NewResults;
for (unsigned i = 0; i < NumResultVecs; ++i) {
NewResults.push_back(SDValue(UpdN.getNode(), i));
}

// If this is an non-standard-aligned load, the first result is the loaded
// value. Bitcast it to the expected result type.
if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
SDValue &LdVal = NewResults[0];
LdVal = DAG.getNode(ISD::BITCAST, SDLoc(N), VecTy, LdVal);
}

NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
DCI.CombineTo(N, NewResults);
DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
Expand All @@ -9060,14 +8998,6 @@ static SDValue CombineBaseUpdate(SDNode *N,
return SDValue();
}

static SDValue PerformVLDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();

return CombineBaseUpdate(N, DCI);
}

/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
Expand Down Expand Up @@ -9181,18 +9111,6 @@ static SDValue PerformVDUPLANECombine(SDNode *N,
return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
}

static SDValue PerformLOADCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);

// If this is a legal vector load, try to combine it into a VLD1_UPD.
if (ISD::isNormalLoad(N) && VT.isVector() &&
DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
return CombineBaseUpdate(N, DCI);

return SDValue();
}

/// PerformSTORECombine - Target-specific dag combine xforms for
/// ISD::STORE.
static SDValue PerformSTORECombine(SDNode *N,
Expand Down Expand Up @@ -9331,11 +9249,6 @@ static SDValue PerformSTORECombine(SDNode *N,
St->getAAInfo());
}

// If this is a legal vector store, try to combine it into a VST1_UPD.
if (ISD::isNormalStore(N) && VT.isVector() &&
DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
return CombineBaseUpdate(N, DCI);

return SDValue();
}

Expand Down Expand Up @@ -9929,11 +9842,10 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
case ISD::SELECT_CC: return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget);
case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
case ISD::LOAD: return PerformLOADCombine(N, DCI);
case ARMISD::VLD2DUP:
case ARMISD::VLD3DUP:
case ARMISD::VLD4DUP:
return PerformVLDCombine(N, DCI);
return CombineBaseUpdate(N, DCI);
case ARMISD::BUILD_VECTOR:
return PerformARMBUILD_VECTORCombine(N, DCI);
case ISD::INTRINSIC_VOID:
Expand All @@ -9953,7 +9865,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case Intrinsic::arm_neon_vst2lane:
case Intrinsic::arm_neon_vst3lane:
case Intrinsic::arm_neon_vst4lane:
return PerformVLDCombine(N, DCI);
return CombineBaseUpdate(N, DCI);
default: break;
}
break;
Expand Down
17 changes: 10 additions & 7 deletions test/CodeGen/ARM/alloc-no-stack-realign.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
define void @test1(<16 x float>* noalias sret %agg.result) nounwind ssp "no-realign-stack" {
entry:
; NO-REALIGN-LABEL: test1
; NO-REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]]
; NO-REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1:[0-9]+]]:128]
; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #16
; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
Expand All @@ -21,14 +21,16 @@ entry:
; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]!
; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #16
; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]

; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0:0]], #48
; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0]], #32
; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]!
; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #16
; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]
%retval = alloca <16 x float>, align 16
%0 = load <16 x float>* @T3_retval, align 16
Expand All @@ -42,8 +44,8 @@ define void @test2(<16 x float>* noalias sret %agg.result) nounwind ssp {
entry:
; REALIGN-LABEL: test2
; REALIGN: bfc sp, #0, #6
; REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]]
; REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1:[0-9]+]]:128]
; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #16
; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
Expand All @@ -63,7 +65,8 @@ entry:
; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
; REALIGN: add r[[R1:[0-9]+]], r[[R0]], #32
; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
; REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]!
; REALIGN: add r[[R1:[0-9]+]], r[[R0]], #16
; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]
%retval = alloca <16 x float>, align 16
%0 = load <16 x float>* @T3_retval, align 16
Expand Down
15 changes: 9 additions & 6 deletions test/CodeGen/ARM/memcpy-inline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,10 @@ entry:
; CHECK: movw [[REG2:r[0-9]+]], #16716
; CHECK: movt [[REG2:r[0-9]+]], #72
; CHECK: str [[REG2]], [r0, #32]
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]!
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
; CHECK: adds r0, #16
; CHECK: adds r1, #16
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8]* @.str2, i64 0, i64 0), i64 36, i32 1, i1 false)
Expand All @@ -57,8 +59,10 @@ entry:
define void @t3(i8* nocapture %C) nounwind {
entry:
; CHECK-LABEL: t3:
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]!
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
; CHECK: adds r0, #16
; CHECK: adds r1, #16
; CHECK: vld1.8 {d{{[0-9]+}}}, [r1]
; CHECK: vst1.8 {d{{[0-9]+}}}, [r0]
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8]* @.str3, i64 0, i64 0), i64 24, i32 1, i1 false)
Expand All @@ -69,8 +73,7 @@ define void @t4(i8* nocapture %C) nounwind {
entry:
; CHECK-LABEL: t4:
; CHECK: vld1.8 {[[REG3:d[0-9]+]], [[REG4:d[0-9]+]]}, [r1]
; CHECK: vst1.8 {[[REG3]], [[REG4]]}, [r0]!
; CHECK: strh [[REG5:r[0-9]+]], [r0]
; CHECK: vst1.8 {[[REG3]], [[REG4]]}, [r0]
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([18 x i8]* @.str4, i64 0, i64 0), i64 18, i32 1, i1 false)
ret void
}
Expand Down
Loading

0 comments on commit ff01f89

Please sign in to comment.