Skip to content

Commit

Permalink
[llvm] Disable f16 atomic hack for llvm 15. (taichi-dev#5756)
Browse files Browse the repository at this point in the history
* [llvm] Disable f16 atomic hack for llvm 15.

Limit TaskCodeGenCUDA::atomic_op_using_cas to the LLVM 10 build.

* Add comment for llvm15.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
python3kgae and pre-commit-ci[bot] authored Aug 16, 2022
1 parent d6bc375 commit 7e4a4ac
Showing 1 changed file with 13 additions and 37 deletions.
50 changes: 13 additions & 37 deletions taichi/codegen/cuda/codegen_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,11 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
{llvm_val[stmt->dest], llvm_val[stmt->val]});
}

// LLVM 15 already supports f16 atomics; see
// https://github.com/llvm/llvm-project/commit/0cb08e448af7167ada767e0526aa44980e72ad08
// The review is at https://reviews.llvm.org/D52416.
// Limit the f16 hack to the LLVM 10 build.
#ifndef TI_LLVM_15
// A huge hack for supporting f16 atomic add/max/min! Borrowed from
// https://github.com/tensorflow/tensorflow/blob/470d58a83470f8ede3beaa584e6992bc71b7baa6/tensorflow/compiler/xla/service/gpu/ir_emitter.cc#L378-L490
// The reason is that LLVM10 does not support generating atomicCAS for f16 on
Expand Down Expand Up @@ -263,8 +268,6 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
// *cas_new_output_address);
// } while (!success);
//
// TODO(sjwsl): Try to rewrite this after upgrading LLVM or supporting raw
// NVPTX

llvm::Value *atomic_op_using_cas(
llvm::Value *output_address,
Expand All @@ -275,19 +278,8 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
TI_ASSERT(output_address_type != nullptr);

// element_type is the data type for the binary operation.
#ifdef TI_LLVM_15
llvm::Type *element_address_type = nullptr;
if (output_address_type->isOpaquePointerTy()) {
element_address_type =
llvm::PointerType::get(output_address_type->getContext(), 0);
} else {
llvm::Type *element_type = output_address_type->getPointerElementType();
element_address_type = element_type->getPointerTo();
}
#else
llvm::Type *element_type = output_address_type->getPointerElementType();
llvm::Type *element_address_type = element_type->getPointerTo();
#endif

int atomic_size = 32;
llvm::Type *atomic_type = builder->getIntNTy(atomic_size);
Expand Down Expand Up @@ -326,11 +318,8 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {

// Use the value from the memory that atomicCAS operates on to initialize
// cas_old_output.
llvm::Value *cas_old_output = builder->CreateLoad(
#ifdef TI_LLVM_15
atomic_type,
#endif
atomic_memory_address, "cas_old_output");
llvm::Value *cas_old_output =
builder->CreateLoad(atomic_memory_address, "cas_old_output");
builder->CreateStore(cas_old_output, cas_old_output_address);

llvm::BasicBlock *loop_body_bb =
Expand All @@ -343,35 +332,21 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
// loop body for one atomicCAS
{
// Use cas_old_output to initialize cas_new_output.
cas_old_output = builder->CreateLoad(
#ifdef TI_LLVM_15
atomic_type,
#endif
cas_old_output_address, "cas_old_output");
cas_old_output =
builder->CreateLoad(cas_old_output_address, "cas_old_output");
builder->CreateStore(cas_old_output, cas_new_output_address);

auto binop_output = op(builder->CreateLoad(
#ifdef TI_LLVM_15
atomic_type,
#endif
binop_output_address),
val);
auto binop_output = op(builder->CreateLoad(binop_output_address), val);
builder->CreateStore(binop_output, binop_output_address);

llvm::Value *cas_new_output = builder->CreateLoad(
#ifdef TI_LLVM_15
atomic_type,
#endif
cas_new_output_address, "cas_new_output");
llvm::Value *cas_new_output =
builder->CreateLoad(cas_new_output_address, "cas_new_output");

// Emit code to perform the atomicCAS operation
// (cas_old_output, success) = atomicCAS(memory_address, cas_old_output,
// cas_new_output);
llvm::Value *ret_value = builder->CreateAtomicCmpXchg(
atomic_memory_address, cas_old_output, cas_new_output,
#ifdef TI_LLVM_15
llvm::MaybeAlign(0),
#endif
llvm::AtomicOrdering::SequentiallyConsistent,
llvm::AtomicOrdering::SequentiallyConsistent);

Expand All @@ -391,6 +366,7 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {

return output_address;
}
#endif

void visit(RangeForStmt *for_stmt) override {
create_naive_range_for(for_stmt);
Expand Down

0 comments on commit 7e4a4ac

Please sign in to comment.