Skip to content

Commit

Permalink
Emit num_threads added. Nested Parallel regions compile.
Browse files Browse the repository at this point in the history
  • Loading branch information
daniel-schuermann committed Dec 5, 2017
1 parent e8e9d01 commit 7167f04
Show file tree
Hide file tree
Showing 2 changed files with 144 additions and 35 deletions.
156 changes: 121 additions & 35 deletions lib/CodeGen/CGOpenMPRuntimeSPIR.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,12 @@ using namespace CodeGen;
/// Constructs the SPIR OpenMP runtime code generator for \p CGM.
///
/// Resets the continuation-block pointers and the nesting flag, prints two
/// diagnostic lines to stdout, and aborts via llvm_unreachable when the
/// module is not being compiled as OpenMP device code.
/// NOTE(review): isTargetParallel is not assigned in the lines shown here --
/// confirm it is set before it is first read.
CGOpenMPRuntimeSPIR::CGOpenMPRuntimeSPIR(CodeGenModule &CGM)
: CGOpenMPRuntime(CGM) {
// No master region and no num_threads guard are open yet.
MasterContBlock = nullptr;
NumThreadsContBlock = nullptr;
// Not inside a parallel region at construction time.
inParallel = false;
// Debug trace on stdout.
std::cout << "using SPIR\n";
// This runtime only supports device-side compilation.
if (!CGM.getLangOpts().OpenMPIsDevice)
llvm_unreachable("OpenMP SPIR can only handle device code.");
// Debug trace: the module's data layout string.
std::cout << std::string(CGM.getDataLayout().getStringRepresentation()) << "\n";

}

llvm::Constant * CGOpenMPRuntimeSPIR::createRuntimeFunction(OpenMPRTLFunctionSPIR Function) {
Expand Down Expand Up @@ -57,11 +58,20 @@ llvm::Constant * CGOpenMPRuntimeSPIR::createRuntimeFunction(OpenMPRTLFunctionSPI

// Get the same Type in a given Address Space.
//
// As far as the visible lines show: for an lvalue-reference or pointer type
// the pointee is converted recursively and the reference/pointer type is
// rebuilt around it; a builtin type is qualified with AddrSpace directly; any
// other type is returned unchanged.
//
// NOTE(review): this span is taken from a diff rendering without +/- markers,
// so lines of the previous implementation (the combined pointer/reference
// test and the `} else {`) appear interleaved with the new ones and the
// braces do not balance as shown -- confirm against the real file.
QualType CGOpenMPRuntimeSPIR::getAddrSpaceType(QualType T, LangAS::ID AddrSpace) {
if(T.getTypePtr()->isAnyPointerType() || T.getTypePtr()->isLValueReferenceType()) {
// Reference: rebuild the reference around the converted pointee.
if(T.getTypePtr()->isLValueReferenceType())
return CGM.getContext().getLValueReferenceType(getAddrSpaceType(T.getTypePtr()->getPointeeType(), AddrSpace), true);
// Pointer: rebuild the pointer around the converted pointee.
if(T.getTypePtr()->isAnyPointerType())
return CGM.getContext().getPointerType(getAddrSpaceType(T.getTypePtr()->getPointeeType(), AddrSpace));
} else {
// Builtin type: attach the address space qualifier.
if(T.getTypePtr()->isBuiltinType())
return CGM.getContext().getAddrSpaceQualType(T, AddrSpace);
// Anything else is left untouched.
return T;
}

/// Reports whether \p info is one of the identifiers recorded in `captures`.
///
/// The comparison is by pointer identity; the list is scanned front to back
/// and the search stops at the first match.
bool CGOpenMPRuntimeSPIR::isGlobal(IdentifierInfo * info) {
  auto Pos = captures.begin();
  const auto End = captures.end();
  // Advance until we either run out of entries or hit the requested one.
  while (Pos != End && *Pos != info)
    ++Pos;
  return Pos != End;
}

void CGOpenMPRuntimeSPIR::emitMasterHeader(CodeGenFunction &CGF) {
Expand All @@ -88,6 +98,30 @@ void CGOpenMPRuntimeSPIR::emitMasterFooter(CodeGenFunction &CGF) {
return;
}

/// Opens a guarded region of the form `if (local_id < NumThreads) { ...`.
///
/// Calls the get_local_id runtime function with argument 0, narrows the
/// result to i32 and compares it (signed less-than) against \p NumThreads.
/// Code emitted after this call lands in the "omp_if.then" block; the
/// "omp_if.end" block is remembered in NumThreadsContBlock so that
/// emitNumThreadsFooter() can close the region later.
///
/// \param CGF        Function being emitted into.
/// \param NumThreads Upper bound the local id is compared against.
void CGOpenMPRuntimeSPIR::emitNumThreadsHeader(CodeGenFunction &CGF, llvm::Value *NumThreads) {
  // Only one num_threads region may be open at a time.
  assert(NumThreadsContBlock == nullptr);

  // local_id = (i32) get_local_id(0)
  llvm::Value *DimArg[] = {CGF.Builder.getInt32(0)};
  llvm::Constant *GetLocalIdFn = createRuntimeFunction(get_local_id);
  llvm::CallInst *LocalId = CGF.EmitRuntimeCall(GetLocalIdFn, DimArg);
  llvm::Value *LocalId32 =
      CGF.Builder.CreateTruncOrBitCast(LocalId, CGF.Int32Ty);

  // local_id < NumThreads
  llvm::Value *IsActive = CGF.Builder.CreateICmpSLT(LocalId32, NumThreads);

  llvm::BasicBlock *BodyBlock = CGF.createBasicBlock("omp_if.then");
  NumThreadsContBlock = CGF.createBasicBlock("omp_if.end");

  // Branch into the guarded body, or skip straight to the continuation.
  CGF.Builder.CreateCondBr(IsActive, BodyBlock, NumThreadsContBlock);
  CGF.EmitBlock(BodyBlock);
}

/// Closes the region opened by emitNumThreadsHeader(), if one is open.
///
/// Branches to the stored continuation block, emits that block, and clears
/// NumThreadsContBlock. Does nothing when no num_threads region is open.
void CGOpenMPRuntimeSPIR::emitNumThreadsFooter(CodeGenFunction &CGF) {
  if (llvm::BasicBlock *ContBlock = NumThreadsContBlock) {
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
    // Mark the region as closed so a new header may be emitted.
    NumThreadsContBlock = nullptr;
  }
}

void CGOpenMPRuntimeSPIR::setTargetParallel(OpenMPDirectiveKind kind) {
switch(kind) {
case OpenMPDirectiveKind::OMPD_target_parallel:
Expand Down Expand Up @@ -287,6 +321,7 @@ void CGOpenMPRuntimeSPIR::emitTargetOutlinedFunction(
// TODO: are we sure to always have a Pointer here?
QualType ArgType = FD->getType();
FD->setType(getAddrSpaceType(ArgType, LangAS::opencl_global));
captures.push_back(I->getCapturedVar()->getIdentifier());
++I;
}

Expand All @@ -313,7 +348,6 @@ void CGOpenMPRuntimeSPIR::emitTargetOutlinedFunction(
OutlinedFn->setCallingConv(llvm::CallingConv::SPIR_KERNEL);
OutlinedFn->addFnAttr(llvm::Attribute::NoUnwind);
OutlinedFn->removeFnAttr(llvm::Attribute::OptimizeNone);
OutlinedFn->dump();

CodeGenFunction CGF(CGM);
GenOpenCLArgMetadata(CS.getCapturedRecordDecl(), OutlinedFn,
Expand All @@ -327,51 +361,87 @@ llvm::Value *CGOpenMPRuntimeSPIR::emitParallelOutlinedFunction(
const CapturedStmt *CS = D.getCapturedStmt(OMPD_parallel);

const RecordDecl *RD = CS->getCapturedRecordDecl();
for (auto *FD : RD->fields()) {
FD->setType(getAddrSpaceType(FD->getType(), LangAS::opencl_global));
auto I = CS->captures().begin();
for (FieldDecl *FD : RD->fields()) {
if(isGlobal(I->getCapturedVar()->getIdentifier())) {
FD->setType(getAddrSpaceType(FD->getType(), LangAS::opencl_global));
}
++I;
}
std::cout << "emit parallel outlined function\n";
bool wasAlreadyParallel = inParallel;
inParallel = true;
llvm::Value *OutlinedFn = CGOpenMPRuntime::emitParallelOutlinedFunction(D, ThreadIDVar, InnermostKind, CodeGen);
OutlinedFn->dump();
inParallel = wasAlreadyParallel;
if (auto Fn = dyn_cast<llvm::Function>(OutlinedFn)) {
Fn->removeFnAttr(llvm::Attribute::NoInline);
Fn->removeFnAttr(llvm::Attribute::OptimizeNone);
Fn->addFnAttr(llvm::Attribute::AlwaysInline);
}
return OutlinedFn;
}

/// Emits the call to the outlined function of a parallel region.
///
/// Returns immediately when \p CGF has no insertion point. As far as the
/// visible lines show, two paths exist:
///  - inParallel set (nested region): the outlined function is called with
///    null i32 pointers for its first two parameters followed by
///    \p CapturedVars.
///  - otherwise: the results of get_global_id and get_local_id (called with
///    argument 0) are narrowed to i32, stored into temporaries ".gtid" and
///    ".btid", and passed by pointer ahead of \p CapturedVars.
/// On both paths an open num_threads region is closed after the call.
///
/// \param OutlinedFn   Outlined parallel function to call.
/// \param CapturedVars Values forwarded as trailing call arguments.
/// \param IfCond       Not referenced in the lines shown.
/// \param Loc          Not referenced in the lines shown.
///
/// NOTE(review): this span comes from a diff rendering without +/- markers.
/// The statements between the insertion-point check and the `if(inParallel)`
/// look like the previous implementation interleaved with the new one (the
/// same sequence reappears inside the else-branch lambda) -- confirm against
/// the real file before relying on statement order.
void CGOpenMPRuntimeSPIR::emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
llvm::Value *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars,
const Expr *IfCond) {

// Nothing to emit without an insertion point.
if (!CGF.HaveInsertPoint())
return;

// Swap NoInline/OptimizeNone for AlwaysInline (plus OptimizeForSize) when the
// callee is a plain llvm::Function.
if (auto Fn = dyn_cast<llvm::Function>(OutlinedFn)) {
Fn->removeFnAttr(llvm::Attribute::NoInline);
Fn->removeFnAttr(llvm::Attribute::OptimizeNone);
Fn->addFnAttr(llvm::Attribute::AlwaysInline);
Fn->addFnAttr(llvm::Attribute::OptimizeForSize); //TODO try?
}
std::cout << "emit parallel master footer\n";
emitMasterFooter(CGF);
// TODO: Better remove these unnecessary arguments?
llvm::Value * arg[] = { CGF.Builder.getInt32(0) };
llvm::CallInst * gtid = CGF.EmitRuntimeCall(createRuntimeFunction(get_global_id), arg);
Address global_tid = CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4), ".gtid");
llvm::Value * gtid_casted = CGF.Builder.CreateTruncOrBitCast(gtid, CGF.Int32Ty);
// NOTE(review): the lvalue is typed with getIntPtrType() although the
// temporary is allocated as Int32Ty -- confirm this is intended.
CGF.EmitStoreOfScalar(gtid_casted, CGF.MakeAddrLValue(global_tid, CGF.getContext().getIntPtrType()), true);

llvm::CallInst * ltid = CGF.EmitRuntimeCall(createRuntimeFunction(get_local_id), arg);
Address local_tid = CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4), ".btid");
llvm::Value * ltid_casted = CGF.Builder.CreateTruncOrBitCast(ltid, CGF.Int32Ty);
CGF.EmitStoreOfScalar(ltid_casted, CGF.MakeAddrLValue(local_tid, CGF.getContext().getIntPtrType()), true);

llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
OutlinedFnArgs.push_back(global_tid.getPointer()); // global_tid
OutlinedFnArgs.push_back(local_tid.getPointer()); // bound_tid
OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
// TODO: clean up this mess :)
// Nested parallel region: no thread-id temporaries are created.
if(inParallel) {

std::cout << "call inner parallel function\n";

auto &&ThenGen = [OutlinedFn, CapturedVars, this](CodeGenFunction &CGF,
PrePostActionTy &) {
llvm::SmallVector<llvm::Value *, 16> RealArgs;
// Null pointers stand in for the global and bound thread-id arguments.
RealArgs.push_back(
llvm::ConstantPointerNull::get(CGF.CGM.Int32Ty->getPointerTo()));
RealArgs.push_back(
llvm::ConstantPointerNull::get(CGF.CGM.Int32Ty->getPointerTo()));
RealArgs.append(CapturedVars.begin(), CapturedVars.end());

CGF.EmitCallOrInvoke(OutlinedFn, RealArgs);
// Close the num_threads guard if one is open.
if(this->NumThreadsContBlock)
this->emitNumThreadsFooter(CGF);
};
RegionCodeGenTy ThenRCG(ThenGen);
ThenRCG(CGF);

}else {
// Outermost parallel region.
auto &&ThenGen = [OutlinedFn, CapturedVars, this](CodeGenFunction &CGF,
PrePostActionTy &) {
// Presumably ends the master-only section so that all work-items reach the
// call -- emitMasterFooter's body is not fully visible here; confirm.
emitMasterFooter(CGF);
// TODO: Better remove these unnecessary arguments?
llvm::Value *arg[] = {CGF.Builder.getInt32(0)};
llvm::CallInst *gtid = CGF.EmitRuntimeCall(this->createRuntimeFunction(get_global_id), arg);
Address global_tid = CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4), ".gtid");
llvm::Value *gtid_casted = CGF.Builder.CreateTruncOrBitCast(gtid, CGF.Int32Ty);
CGF.EmitStoreOfScalar(gtid_casted, CGF.MakeAddrLValue(global_tid, CGF.getContext().getIntPtrType()), true);

llvm::CallInst *ltid = CGF.EmitRuntimeCall(this->createRuntimeFunction(get_local_id), arg);
Address local_tid = CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4), ".btid");
llvm::Value *ltid_casted = CGF.Builder.CreateTruncOrBitCast(ltid, CGF.Int32Ty);
CGF.EmitStoreOfScalar(ltid_casted, CGF.MakeAddrLValue(local_tid, CGF.getContext().getIntPtrType()), true);

llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;

OutlinedFnArgs.push_back(global_tid.getPointer()); // global_tid
OutlinedFnArgs.push_back(local_tid.getPointer()); // bound_tid
OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());

for(auto arg : OutlinedFnArgs) arg->dump();
CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
std::cout << "emit parallel master header\n";
if(!isTargetParallel) // TODO: we could leave that in and let the optimizer do this for us
emitMasterHeader(CGF);

if(this->NumThreadsContBlock)
this->emitNumThreadsFooter(CGF); // close num_threads clause, if there is one
// Re-open the master-only section unless the directive was flagged as a
// target-parallel one or we are inside another parallel region.
if(!this->isTargetParallel && !this->inParallel) // TODO: we could leave that in and let the optimizer do this for us
this->emitMasterHeader(CGF);
};
RegionCodeGenTy ThenRCG(ThenGen);
ThenRCG(CGF);
}
}


Expand Down Expand Up @@ -570,6 +640,22 @@ void CGOpenMPRuntimeSPIR::emitDistributeStaticInit(
/// Intentionally emits nothing: the body is empty, so no code is generated
/// for the end of a statically scheduled loop in this runtime.
void CGOpenMPRuntimeSPIR::emitForStaticFinish(CodeGenFunction &CGF,
SourceLocation Loc) {}

/// Generates code for a 'num_threads' clause.
///
/// The clause is realised as a guard of the form
/// `if (thread_id < NumThreads) { ... }`. It is honoured only for the
/// outermost parallel construct; inside an enclosing parallel region the
/// clause is ignored. Only the opening half of the guard is emitted here;
/// the closing half must be emitted at the end of the parallel region.
///
/// \param NumThreads Requested number of threads.
/// \param Loc        Unused.
void CGOpenMPRuntimeSPIR::emitNumThreadsClause(CodeGenFunction &CGF,
                                               llvm::Value *NumThreads,
                                               SourceLocation Loc) {
  if (!inParallel)
    emitNumThreadsHeader(CGF, NumThreads);
}

/// Intentionally emits nothing: the body is empty, so \p NumTeams and
/// \p ThreadLimit are currently ignored by this runtime.
void CGOpenMPRuntimeSPIR::emitNumTeamsClause(CodeGenFunction &CGF,
const Expr *NumTeams,
const Expr *ThreadLimit,
SourceLocation Loc) {}

void CGOpenMPRuntimeSPIR::emitForDispatchInit(
CodeGenFunction &CGF, SourceLocation Loc,
const OpenMPScheduleTy &ScheduleKind, unsigned IVSize, bool IVSigned,
Expand Down
23 changes: 23 additions & 0 deletions lib/CodeGen/CGOpenMPRuntimeSPIR.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include "CGOpenMPRuntime.h"
#include "CodeGenFunction.h"
#include "clang/Basic/IdentifierTable.h"
#include "clang/AST/StmtOpenMP.h"
#include "llvm/IR/CallSite.h"

Expand All @@ -36,13 +37,19 @@ class CGOpenMPRuntimeSPIR : public CGOpenMPRuntime {
};
// TODO: document GenOpenCLArgMetadata and createRuntimeFunction in detail.

/// Returns \p T rebuilt for \p AddrSpace: pointer/reference types are rebuilt
/// around the converted pointee, builtin types get the address space
/// qualifier.
QualType getAddrSpaceType(QualType T, LangAS::ID AddrSpace);
/// Returns true if \p info is contained in `captures`.
bool isGlobal(IdentifierInfo * info);
/// Identifiers of captured variables; appended to while emitting the target
/// outlined function and queried through isGlobal().
SmallVector<IdentifierInfo *, 16> captures;
/// Initialised to nullptr in the constructor; presumably the continuation
/// block of an open master region -- confirm in emitMasterHeader/Footer.
llvm::BasicBlock * MasterContBlock;
/// Continuation block of the currently open num_threads guard, or nullptr
/// when none is open.
llvm::BasicBlock * NumThreadsContBlock;
/// Presumably open/close a master-only region -- confirm in the .cpp.
void emitMasterHeader(CodeGenFunction &CGF);
void emitMasterFooter(CodeGenFunction &CGF);
/// Opens an `if (local_id < NumThreads)` guard and records its continuation
/// block in NumThreadsContBlock.
void emitNumThreadsHeader(CodeGenFunction &CGF, llvm::Value *NumThreads);
/// Closes the guard opened by emitNumThreadsHeader(); no-op if none is open.
void emitNumThreadsFooter(CodeGenFunction &CGF);
/// Presumably attaches OpenCL kernel-argument metadata to \p Fn -- confirm.
void GenOpenCLArgMetadata(const RecordDecl *FD, llvm::Function *Fn, llvm::LLVMContext &Context, CGBuilderTy &Builder);
/// Returns the callee for the given runtime function (used for
/// get_local_id and get_global_id).
llvm::Constant *createRuntimeFunction(OpenMPRTLFunctionSPIR Function);
/// Presumably updates isTargetParallel from the directive kind -- confirm.
void setTargetParallel(OpenMPDirectiveKind kind);
/// When set, the master region is not re-opened after a parallel call.
bool isTargetParallel;
/// True while the body of a parallel outlined function is being emitted;
/// used to detect nested parallel regions.
bool inParallel;

public:
explicit CGOpenMPRuntimeSPIR(CodeGenModule &CGM);
Expand Down Expand Up @@ -205,6 +212,22 @@ class CGOpenMPRuntimeSPIR : public CGOpenMPRuntime {
unsigned IVSize, bool IVSigned, bool Ordered,
const DispatchRTInput &DispatchValues);

/// \brief Generates code for the 'num_threads' clause.
/// Unlike the base runtime, no __kmpc_push_num_threads call is emitted; the
/// SPIR implementation opens a guard of the form
/// `if (local_id < NumThreads) { ... }` that is closed at the end of the
/// parallel region. The clause is ignored inside a nested parallel region.
/// \param NumThreads An integer value of threads.
virtual void emitNumThreadsClause(CodeGenFunction &CGF,
llvm::Value *NumThreads,
SourceLocation Loc);

/// \brief Hook for the 'num_teams' / 'thread_limit' clauses.
/// Unlike the base runtime, no __kmpc_push_num_teams call is emitted; the
/// SPIR implementation is currently empty and generates no code.
/// \param NumTeams An integer expression of teams.
/// \param ThreadLimit An integer expression of threads.
virtual void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams,
const Expr *ThreadLimit, SourceLocation Loc);

/// \brief Check if the specified \a ScheduleKind is static non-chunked.
/// This kind of worksharing directive is emitted without outer loop.
/// \param ScheduleKind Schedule kind specified in the 'schedule' clause.
Expand Down

0 comments on commit 7167f04

Please sign in to comment.