Optimize away unnecessary address casts.
Removes unnecessary casts from non-generic address spaces to the generic address space for certain code patterns. Patch by Jingyue Wu.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@205571 91177308-0d34-0410-b5e6-96231b3b80d8
Showing 6 changed files with 299 additions and 2 deletions.
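The new pass can also be run standalone with opt; a minimal invocation, mirroring the RUN line of the test added below (the input file name here is a hypothetical placeholder):

    opt < favor-non-generic.ll -S -nvptx-favor-non-generic -dce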
@@ -0,0 +1,195 @@
//===-- NVPTXFavorNonGenericAddrSpace.cpp - ---------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// When a load/store accesses the generic address space, checks whether the
// address is cast from a non-generic address space. If so, remove this
// addrspacecast because accessing non-generic address spaces is typically
// faster. Besides seeking addrspacecasts, this optimization also traces into
// the base pointer of a GEP.
//
// For instance, the code below loads a float from an array allocated in
// addrspace(3).
//
//   %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
//   %1 = gep [10 x float]* %0, i64 0, i64 %i
//   %2 = load float* %1 ; emits ld.f32
//
// First, function hoistAddrSpaceCastFromGEP reorders the addrspacecast
// and the GEP to expose more optimization opportunities to function
// optimizeMemoryInstruction. The intermediate code looks like:
//
//   %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
//   %1 = addrspacecast float addrspace(3)* %0 to float*
//   %2 = load float* %1 ; still emits ld.f32, but will be optimized shortly
//
// Then, function optimizeMemoryInstruction detects a load from addrspacecast'ed
// generic pointers, and folds the load and the addrspacecast into a load from
// the original address space. The final code looks like:
//
//   %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
//   %2 = load float addrspace(3)* %0 ; emits ld.shared.f32
//
// This pass may remove an addrspacecast in a different BB. Therefore, we
// implement it as a FunctionPass.
//
//===----------------------------------------------------------------------===//

#include "NVPTX.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

// An option to disable this optimization. It is enabled by default.
static cl::opt<bool> DisableFavorNonGeneric(
    "disable-nvptx-favor-non-generic",
    cl::init(false),
    cl::desc("Do not convert generic address space usage "
             "to non-generic address space usage"),
    cl::Hidden);

namespace {
/// \brief Removes unnecessary addrspacecasts from non-generic address spaces
/// to the generic address space, so that loads and stores can access the
/// original address space directly.
class NVPTXFavorNonGenericAddrSpaces : public FunctionPass {
public:
  static char ID;
  NVPTXFavorNonGenericAddrSpaces() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  /// Optimizes load/store instructions. Idx is the index of the pointer
  /// operand (0 for load, and 1 for store). Returns true if it changes
  /// anything.
  bool optimizeMemoryInstruction(Instruction *I, unsigned Idx);
  /// Transforms "gep (addrspacecast X), indices" into "addrspacecast (gep X,
  /// indices)". This reordering exposes more load/store optimization
  /// opportunities to optimizeMemoryInstruction. Returns true if it changes
  /// the program.
  bool hoistAddrSpaceCastFromGEP(GEPOperator *GEP);
};
}

char NVPTXFavorNonGenericAddrSpaces::ID = 0;

namespace llvm {
void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
}
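// INITIALIZE_PASS below registers the pass with the PassRegistry under the
// command-line name "nvptx-favor-non-generic", which is how the added test
// runs it via "opt -nvptx-favor-non-generic".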
INITIALIZE_PASS(NVPTXFavorNonGenericAddrSpaces, "nvptx-favor-non-generic",
                "Remove unnecessary non-generic-to-generic addrspacecasts",
                false, false)

// Decides whether removing Cast is valid and beneficial. Cast can be an
// instruction or a constant expression.
static bool IsEliminableAddrSpaceCast(Operator *Cast) {
  // Returns false if not even an addrspacecast.
  if (Cast->getOpcode() != Instruction::AddrSpaceCast)
    return false;

  Value *Src = Cast->getOperand(0);
  PointerType *SrcTy = cast<PointerType>(Src->getType());
  PointerType *DestTy = cast<PointerType>(Cast->getType());
  // TODO: For now, we only handle the case where the addrspacecast only
  // changes the address space but not the type. If the type also changes, we
  // could still get rid of the addrspacecast by adding an extra bitcast, but
  // we rarely see such scenarios.
  if (SrcTy->getElementType() != DestTy->getElementType())
    return false;

  // Checks whether the addrspacecast is from a non-generic address space to
  // the generic address space.
  return (SrcTy->getAddressSpace() != AddressSpace::ADDRESS_SPACE_GENERIC &&
          DestTy->getAddressSpace() == AddressSpace::ADDRESS_SPACE_GENERIC);
}
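
// Illustrative examples for IsEliminableAddrSpaceCast (addrspace(3) is the
// NVPTX shared space; the %p values are hypothetical):
//   addrspacecast float addrspace(3)* %p to float*  ; eliminable
//   addrspacecast float* %p to float addrspace(3)*  ; not eliminable: the
//                                                   ; destination is non-generic
//   addrspacecast float addrspace(3)* %p to i32*    ; not eliminable: the
//                                                   ; element type changes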

bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(
    GEPOperator *GEP) {
  Operator *Cast = dyn_cast<Operator>(GEP->getPointerOperand());
  if (Cast == nullptr)
    return false;

  if (!IsEliminableAddrSpaceCast(Cast))
    return false;

  SmallVector<Value *, 8> Indices(GEP->idx_begin(), GEP->idx_end());
  if (Instruction *GEPI = dyn_cast<Instruction>(GEP)) {
    // %1 = gep (addrspacecast X), indices
    // =>
    // %0 = gep X, indices
    // %1 = addrspacecast %0
    GetElementPtrInst *NewGEPI = GetElementPtrInst::Create(Cast->getOperand(0),
                                                           Indices,
                                                           GEP->getName(),
                                                           GEPI);
    NewGEPI->setIsInBounds(GEP->isInBounds());
    GEP->replaceAllUsesWith(
        new AddrSpaceCastInst(NewGEPI, GEP->getType(), "", GEPI));
  } else {
    // GEP is a constant expression.
    Constant *NewGEPCE = ConstantExpr::getGetElementPtr(
        cast<Constant>(Cast->getOperand(0)),
        Indices,
        GEP->isInBounds());
    GEP->replaceAllUsesWith(
        ConstantExpr::getAddrSpaceCast(NewGEPCE, GEP->getType()));
  }

  return true;
}

bool NVPTXFavorNonGenericAddrSpaces::optimizeMemoryInstruction(Instruction *MI,
                                                               unsigned Idx) {
  // If the pointer operand is a GEP, hoist the addrspacecast if any from the
  // GEP to expose more optimization opportunities.
  if (GEPOperator *GEP = dyn_cast<GEPOperator>(MI->getOperand(Idx))) {
    hoistAddrSpaceCastFromGEP(GEP);
  }

  // load/store (addrspacecast X) => load/store X if shortcutting the
  // addrspacecast is valid and can improve performance.
  //
  // e.g.,
  //   %1 = addrspacecast float addrspace(3)* %0 to float*
  //   %2 = load float* %1
  // ->
  //   %2 = load float addrspace(3)* %0
  //
  // Note: the addrspacecast can also be a constant expression.
  if (Operator *Cast = dyn_cast<Operator>(MI->getOperand(Idx))) {
    if (IsEliminableAddrSpaceCast(Cast)) {
      MI->setOperand(Idx, Cast->getOperand(0));
      return true;
    }
  }

  return false;
}

bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) {
  if (DisableFavorNonGeneric)
    return false;

  bool Changed = false;
  for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
    for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ++I) {
      if (isa<LoadInst>(I)) {
        // V = load P
        Changed |= optimizeMemoryInstruction(I, 0);
      } else if (isa<StoreInst>(I)) {
        // store V, P
        Changed |= optimizeMemoryInstruction(I, 1);
      }
    }
  }
  return Changed;
}

FunctionPass *llvm::createNVPTXFavorNonGenericAddrSpacesPass() {
  return new NVPTXFavorNonGenericAddrSpaces();
}
@@ -0,0 +1,91 @@
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix PTX
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix PTX
; RUN: opt < %s -S -nvptx-favor-non-generic -dce | FileCheck %s --check-prefix IR

@array = internal addrspace(3) global [10 x float] zeroinitializer, align 4
@scalar = internal addrspace(3) global float 0.000000e+00, align 4

; Verifies nvptx-favor-non-generic correctly optimizes generic address space
; usage to non-generic address space usage for the patterns we claim to handle:
; 1. load cast
; 2. store cast
; 3. load gep cast
; 4. store gep cast
; gep and cast can each be an instruction or a constant expression. This
; function tries all possible combinations.
define float @ld_st_shared_f32(i32 %i, float %v) {
; IR-LABEL: @ld_st_shared_f32
; IR-NOT: addrspacecast
; PTX-LABEL: ld_st_shared_f32(
  ; load cast
  %1 = load float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
  ; store cast
  store float %v, float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
; PTX: st.shared.f32 [scalar], %f{{[0-9]+}};
  ; use syncthreads to disable optimizations across components
  call void @llvm.cuda.syncthreads()
; PTX: bar.sync 0;

  ; cast; load
  %2 = addrspacecast float addrspace(3)* @scalar to float*
  %3 = load float* %2, align 4
; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
  ; cast; store
  store float %v, float* %2, align 4
; PTX: st.shared.f32 [scalar], %f{{[0-9]+}};
  call void @llvm.cuda.syncthreads()
; PTX: bar.sync 0;

  ; load gep cast
  %4 = load float* getelementptr inbounds ([10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
  ; store gep cast
  store float %v, float* getelementptr inbounds ([10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
; PTX: st.shared.f32 [array+20], %f{{[0-9]+}};
  call void @llvm.cuda.syncthreads()
; PTX: bar.sync 0;

  ; gep cast; load
  %5 = getelementptr inbounds [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5
  %6 = load float* %5, align 4
; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
  ; gep cast; store
  store float %v, float* %5, align 4
; PTX: st.shared.f32 [array+20], %f{{[0-9]+}};
  call void @llvm.cuda.syncthreads()
; PTX: bar.sync 0;

  ; cast; gep; load
  %7 = addrspacecast [10 x float] addrspace(3)* @array to [10 x float]*
  %8 = getelementptr inbounds [10 x float]* %7, i32 0, i32 %i
  %9 = load float* %8, align 4
; PTX: ld.shared.f32 %f{{[0-9]+}}, [%{{(r|rl|rd)[0-9]+}}];
  ; cast; gep; store
  store float %v, float* %8, align 4
; PTX: st.shared.f32 [%{{(r|rl|rd)[0-9]+}}], %f{{[0-9]+}};
  call void @llvm.cuda.syncthreads()
; PTX: bar.sync 0;

  %sum2 = fadd float %1, %3
  %sum3 = fadd float %sum2, %4
  %sum4 = fadd float %sum3, %6
  %sum5 = fadd float %sum4, %9
  ret float %sum5
}

; Verifies nvptx-favor-non-generic keeps addrspacecasts between pointers of
; different element types.
define i32 @ld_int_from_float() {
; IR-LABEL: @ld_int_from_float
; IR: addrspacecast
; PTX-LABEL: ld_int_from_float(
; PTX: cvta.shared.u{{(32|64)}}
  %1 = load i32* addrspacecast(float addrspace(3)* @scalar to i32*), align 4
  ret i32 %1
}

declare void @llvm.cuda.syncthreads() #3

attributes #3 = { noduplicate nounwind }