Skip to content

Commit

Permalink
[llvm] Disable f16 atomic hack for llvm 15. (taichi-dev#5756)
Browse files Browse the repository at this point in the history
* [llvm] Disable f16 atomic hack for llvm 15.

Limit TaskCodeGenCUDA::atomic_op_using_cas to the LLVM 10 build.

* Add comment for llvm15.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
python3kgae and pre-commit-ci[bot] authored Aug 16, 2022
1 parent d6bc375 commit 7e4a4ac
Showing 1 changed file with 13 additions and 37 deletions.
50 changes: 13 additions & 37 deletions taichi/codegen/cuda/codegen_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,11 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
{llvm_val[stmt->dest], llvm_val[stmt->val]});
}

// LLVM 15 already supports f16 atomics; see
// https://github.com/llvm/llvm-project/commit/0cb08e448af7167ada767e0526aa44980e72ad08
// The review is at https://reviews.llvm.org/D52416.
// Limit the f16 hack to the LLVM 10 build.
#ifndef TI_LLVM_15
// A huge hack for supporting f16 atomic add/max/min! Borrowed from
// https://github.com/tensorflow/tensorflow/blob/470d58a83470f8ede3beaa584e6992bc71b7baa6/tensorflow/compiler/xla/service/gpu/ir_emitter.cc#L378-L490
// The reason is that LLVM10 does not support generating atomicCAS for f16 on
Expand Down Expand Up @@ -263,8 +268,6 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
// *cas_new_output_address);
// } while (!success);
//
// TODO(sjwsl): Try to rewrite this after upgrading LLVM or supporting raw
// NVPTX

llvm::Value *atomic_op_using_cas(
llvm::Value *output_address,
Expand All @@ -275,19 +278,8 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
TI_ASSERT(output_address_type != nullptr);

// element_type is the data type for the binary operation.
#ifdef TI_LLVM_15
llvm::Type *element_address_type = nullptr;
if (output_address_type->isOpaquePointerTy()) {
element_address_type =
llvm::PointerType::get(output_address_type->getContext(), 0);
} else {
llvm::Type *element_type = output_address_type->getPointerElementType();
element_address_type = element_type->getPointerTo();
}
#else
llvm::Type *element_type = output_address_type->getPointerElementType();
llvm::Type *element_address_type = element_type->getPointerTo();
#endif

int atomic_size = 32;
llvm::Type *atomic_type = builder->getIntNTy(atomic_size);
Expand Down Expand Up @@ -326,11 +318,8 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {

// Use the value from the memory that atomicCAS operates on to initialize
// cas_old_output.
llvm::Value *cas_old_output = builder->CreateLoad(
#ifdef TI_LLVM_15
atomic_type,
#endif
atomic_memory_address, "cas_old_output");
llvm::Value *cas_old_output =
builder->CreateLoad(atomic_memory_address, "cas_old_output");
builder->CreateStore(cas_old_output, cas_old_output_address);

llvm::BasicBlock *loop_body_bb =
Expand All @@ -343,35 +332,21 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
// loop body for one atomicCAS
{
// Use cas_old_output to initialize cas_new_output.
cas_old_output = builder->CreateLoad(
#ifdef TI_LLVM_15
atomic_type,
#endif
cas_old_output_address, "cas_old_output");
cas_old_output =
builder->CreateLoad(cas_old_output_address, "cas_old_output");
builder->CreateStore(cas_old_output, cas_new_output_address);

auto binop_output = op(builder->CreateLoad(
#ifdef TI_LLVM_15
atomic_type,
#endif
binop_output_address),
val);
auto binop_output = op(builder->CreateLoad(binop_output_address), val);
builder->CreateStore(binop_output, binop_output_address);

llvm::Value *cas_new_output = builder->CreateLoad(
#ifdef TI_LLVM_15
atomic_type,
#endif
cas_new_output_address, "cas_new_output");
llvm::Value *cas_new_output =
builder->CreateLoad(cas_new_output_address, "cas_new_output");

// Emit code to perform the atomicCAS operation
// (cas_old_output, success) = atomicCAS(memory_address, cas_old_output,
// cas_new_output);
llvm::Value *ret_value = builder->CreateAtomicCmpXchg(
atomic_memory_address, cas_old_output, cas_new_output,
#ifdef TI_LLVM_15
llvm::MaybeAlign(0),
#endif
llvm::AtomicOrdering::SequentiallyConsistent,
llvm::AtomicOrdering::SequentiallyConsistent);

Expand All @@ -391,6 +366,7 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {

return output_address;
}
#endif

void visit(RangeForStmt *for_stmt) override {
create_naive_range_for(for_stmt);
Expand Down

0 comments on commit 7e4a4ac

Please sign in to comment.