Skip to content

Commit

Permalink
[LoopVectorize] Teach Loop Vectorizor about interleaved memory accesses.
Browse files Browse the repository at this point in the history
Interleaved memory accesses are grouped and vectorized into vector load/store and shufflevector.
E.g. for (i = 0; i < N; i+=2) {
       a = A[i];         // load of even element
       b = A[i+1];       // load of odd element
       ...               // operations on a, b, c, d
       A[i] = c;         // store of even element
       A[i+1] = d;       // store of odd element
     }

  The loads of even and odd elements are identified as an interleave load group, which will be transfered into vectorized IRs like:
     %wide.vec = load <8 x i32>, <8 x i32>* %ptr
     %vec.even = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
     %vec.odd = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>

  The stores of even and odd elements are identified as an interleave store group, which will be transfered into vectorized IRs like:
     %interleaved.vec = shufflevector <4 x i32> %vec.even, %vec.odd, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> 
     store <8 x i32> %interleaved.vec, <8 x i32>* %ptr

This optimization is currently disabled by defaut. To try it by adding '-enable-interleaved-mem-accesses=true'. 



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239291 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
Hao Liu authored and Hao Liu committed Jun 8, 2015
1 parent f57b360 commit 43be1d5
Show file tree
Hide file tree
Showing 9 changed files with 1,299 additions and 29 deletions.
5 changes: 5 additions & 0 deletions include/llvm/Analysis/LoopAccessAnalysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,11 @@ const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE,
const ValueToValueMap &PtrToStride,
Value *Ptr, Value *OrigPtr = nullptr);

/// \brief Check the stride of the pointer and ensure that it does not wrap in
/// the address space.
int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp,
const ValueToValueMap &StridesMap);

/// \brief This analysis provides dependence information for the memory accesses
/// of a loop.
///
Expand Down
27 changes: 27 additions & 0 deletions include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,20 @@ class TargetTransformInfo {
unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace) const;

/// \return The cost of the interleaved memory operation.
/// \p Opcode is the memory operation code
/// \p VecTy is the vector type of the interleaved access.
/// \p Factor is the interleave factor
/// \p Indices is the indices for interleaved load members (as interleaved
/// load allows gaps)
/// \p Alignment is the alignment of the memory operation
/// \p AddressSpace is address space of the pointer.
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace) const;

/// \brief Calculate the cost of performing a vector reduction.
///
/// This is the cost of reducing the vector value of type \p Ty to a scalar
Expand Down Expand Up @@ -587,6 +601,11 @@ class TargetTransformInfo::Concept {
virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
unsigned Alignment,
unsigned AddressSpace) = 0;
virtual unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace) = 0;
virtual unsigned getReductionCost(unsigned Opcode, Type *Ty,
bool IsPairwiseForm) = 0;
virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
Expand Down Expand Up @@ -748,6 +767,14 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
unsigned AddressSpace) override {
return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
}
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace) override {
return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace);
}
unsigned getReductionCost(unsigned Opcode, Type *Ty,
bool IsPairwiseForm) override {
return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm);
Expand Down
8 changes: 8 additions & 0 deletions include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,14 @@ class TargetTransformInfoImplBase {
return 1;
}

unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace) {
return 1;
}

unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Type *> Tys) {
return 1;
Expand Down
67 changes: 67 additions & 0 deletions include/llvm/CodeGen/BasicTTIImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,73 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return Cost;
}

unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace) {
VectorType *VT = dyn_cast<VectorType>(VecTy);
assert(VT && "Expect a vector type for interleaved memory op");

unsigned NumElts = VT->getNumElements();
assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");

unsigned NumSubElts = NumElts / Factor;
VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);

// Firstly, the cost of load/store operation.
unsigned Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);

// Then plus the cost of interleave operation.
if (Opcode == Instruction::Load) {
// The interleave cost is similar to extract sub vectors' elements
// from the wide vector, and insert them into sub vectors.
//
// E.g. An interleaved load of factor 2 (with one member of index 0):
// %vec = load <8 x i32>, <8 x i32>* %ptr
// %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
// The cost is estimated as extract elements at 0, 2, 4, 6 from the
// <8 x i32> vector and insert them into a <4 x i32> vector.

assert(Indices.size() <= Factor &&
"Interleaved memory op has too many members");
for (unsigned Index : Indices) {
assert(Index < Factor && "Invalid index for interleaved memory op");

// Extract elements from loaded vector for each sub vector.
for (unsigned i = 0; i < NumSubElts; i++)
Cost += getVectorInstrCost(Instruction::ExtractElement, VT,
Index + i * Factor);
}

unsigned InsSubCost = 0;
for (unsigned i = 0; i < NumSubElts; i++)
InsSubCost += getVectorInstrCost(Instruction::InsertElement, SubVT, i);

Cost += Indices.size() * InsSubCost;
} else {
// The interleave cost is extract all elements from sub vectors, and
// insert them into the wide vector.
//
// E.g. An interleaved store of factor 2:
// %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
// store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
// The cost is estimated as extract all elements from both <4 x i32>
// vectors and insert into the <8 x i32> vector.

unsigned ExtSubCost = 0;
for (unsigned i = 0; i < NumSubElts; i++)
ExtSubCost += getVectorInstrCost(Instruction::ExtractElement, SubVT, i);

Cost += Factor * ExtSubCost;

for (unsigned i = 0; i < NumElts; i++)
Cost += getVectorInstrCost(Instruction::InsertElement, VT, i);
}

return Cost;
}

unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> Tys) {
unsigned ISD = 0;
Expand Down
9 changes: 2 additions & 7 deletions lib/Analysis/LoopAccessAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -289,11 +289,6 @@ static bool hasComputableBounds(ScalarEvolution *SE,
return AR->isAffine();
}

/// \brief Check the stride of the pointer and ensure that it does not wrap in
/// the address space.
static int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp,
const ValueToValueMap &StridesMap);

bool AccessAnalysis::canCheckPtrAtRT(
LoopAccessInfo::RuntimePointerCheck &RtCheck, unsigned &NumComparisons,
ScalarEvolution *SE, Loop *TheLoop, const ValueToValueMap &StridesMap,
Expand Down Expand Up @@ -510,8 +505,8 @@ static bool isInBoundsGep(Value *Ptr) {
}

/// \brief Check whether the access through \p Ptr has a constant stride.
static int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp,
const ValueToValueMap &StridesMap) {
int llvm::isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp,
const ValueToValueMap &StridesMap) {
const Type *Ty = Ptr->getType();
assert(Ty->isPointerTy() && "Unexpected non-ptr");

Expand Down
7 changes: 7 additions & 0 deletions lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,13 @@ TargetTransformInfo::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
return TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
}

unsigned TargetTransformInfo::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace) const {
return TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace);
}

unsigned
TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Type *> Tys) const {
Expand Down
Loading

0 comments on commit 43be1d5

Please sign in to comment.