[Perf] Support thread local storage for reduction in struct-fors (taichi-dev#1941)

Co-authored-by: Ye Kuang <[email protected]>
jhxu-org · Oct 12, 2020 · 2ccf0f5 · 2ccf0f5
1 parent 183e947
commit 2ccf0f5
Show file tree

Hide file tree

Showing 12 changed files with 183 additions and 72 deletions.
diff --git a/misc/async_mgpcg.py b/misc/async_mgpcg.py
@@ -15,7 +15,7 @@
         )
 
 # grid parameters
-N = 128
+N = 256
 
 n_mg_levels = 5
 pre_and_post_smoothing = 2
@@ -40,12 +40,15 @@
 new_zTr = ti.field(dtype=real, shape=())
 pAp = ti.field(dtype=real, shape=())
 
-grid = ti.root.pointer(ti.ijk, [N_tot // 4]).dense(ti.ijk, 4).place(x, p, Ap)
+leaf_size = 8
+
+grid = ti.root.pointer(ti.ijk,
+                       [N_tot // leaf_size]).dense(ti.ijk,
+                                                   leaf_size).place(x, p, Ap)
 
 for l in range(n_mg_levels):
-    grid = ti.root.pointer(ti.ijk,
-                           [N_tot // (4 * 2**l)]).dense(ti.ijk,
-                                                        4).place(r[l], z[l])
+    grid = ti.root.pointer(ti.ijk, [N_tot // (leaf_size * 2**l)]).dense(
+        ti.ijk, leaf_size).place(r[l], z[l])
 
 ti.root.place(alpha, beta, sum)
 

diff --git a/python/taichi/testing.py b/python/taichi/testing.py
@@ -15,7 +15,7 @@ def get_rel_eps():
 
 
 def approx(expected, **kwargs):
-    '''Tweaked pytest.approx for OpenGL low percisions'''
+    '''Tweaked pytest.approx for OpenGL low precisions'''
     import pytest
 
     class boolean_integer:

diff --git a/taichi/backends/cpu/codegen_cpu.cpp b/taichi/backends/cpu/codegen_cpu.cpp
@@ -1,4 +1,4 @@
-#include "codegen_cpu.h"
+#include "taichi/backends/cpu/codegen_cpu.h"
 
 #include "taichi/codegen/codegen_llvm.h"
 #include "taichi/common/core.h"
@@ -30,6 +30,7 @@ class CodeGenLLVMCPU : public CodeGenLLVM {
 
     auto *tls_prologue = create_xlogue(stmt->tls_prologue);
 
+    // The loop body
     llvm::Function *body;
     {
       auto guard = get_function_creation_guard(

diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp
@@ -1,4 +1,4 @@
-#include "codegen_llvm.h"
+#include "taichi/codegen/codegen_llvm.h"
 
 #include "taichi/struct/struct_llvm.h"
 #include "taichi/util/file_sequence_writer.h"
@@ -44,8 +44,8 @@ FunctionCreationGuard::FunctionCreationGuard(
       llvm::Type::getVoidTy(*mb->llvm_context), arguments, false);
 
   body = llvm::Function::Create(body_function_type,
-                                llvm::Function::InternalLinkage, "loop_body",
-                                mb->module.get());
+                                llvm::Function::InternalLinkage,
+                                "function_body", mb->module.get());
   old_func = mb->func;
   // emit into loop body function
   mb->func = body;
@@ -59,7 +59,8 @@ FunctionCreationGuard::FunctionCreationGuard(
   ip = mb->builder->saveIP();
   mb->builder->SetInsertPoint(entry);
 
-  auto body_bb = BasicBlock::Create(*mb->llvm_context, "loop_body", mb->func);
+  auto body_bb =
+      BasicBlock::Create(*mb->llvm_context, "function_body", mb->func);
   mb->builder->CreateBr(body_bb);
   mb->builder->SetInsertPoint(body_bb);
 }
@@ -1313,37 +1314,82 @@ std::tuple<llvm::Value *, llvm::Value *> CodeGenLLVM::get_range_for_bounds(
 }
 
 void CodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt, bool spmd) {
-  llvm::Function *body;
+  // TODO: instead of constructing tons of LLVM IR, writing the logic in
+  // runtime.cpp may be a cleaner solution. See
+  // CodeGenLLVMCPU::create_offload_range_for as an example.
+
+  llvm::Function *body = nullptr;
   auto leaf_block = stmt->snode;
   {
     // Create the loop body function
     auto guard = get_function_creation_guard({
         llvm::PointerType::get(get_runtime_type("Context"), 0),
+        get_tls_buffer_type(),
         llvm::PointerType::get(get_runtime_type("Element"), 0),
         tlctx->get_data_type<int>(),
         tlctx->get_data_type<int>(),
     });
 
     body = guard.body;
 
-    // per-leaf-block for loop
+    /* Function structure:
+     *
+     * function_body (entry):
+     *   loop_index = lower_bound;
+     *   tls_prologue()
+     *   bls_prologue()
+     *   goto loop_test
+     *
+     * loop_test:
+     *   if (loop_index < upper_bound)
+     *     goto loop_body
+     *   else
+     *     goto func_exit
+     *
+     * loop_body:
+     *   initialize_coordinates()
+     *   if (bitmasked voxel is active)
+     *     goto struct_for_body
+     *   else
+     *     goto loop_body_tail
+     *
+     * struct_for_body:
+     *   ... (Run codegen on the StructForStmt::body Taichi Block)
+     *   goto loop_body_tail
+     *
+     * loop_body_tail:
+     *   loop_index += block_dim
+     *   goto loop_test
+     *
+     * func_exit:
+     *   bls_epilogue()
+     *   tls_epilogue()
+     *   return
+     */
+
     auto loop_index =
         create_entry_block_alloca(llvm::Type::getInt32Ty(*llvm_context));
 
-    llvm::Value *thread_idx = nullptr, *block_dim = nullptr;
+    RuntimeObject element("Element", this, builder.get(), get_arg(2));
 
-    RuntimeObject element("Element", this, builder.get(), get_arg(1));
-    auto lower_bound = get_arg(2);
-    auto upper_bound = get_arg(3);
+    // Loop ranges
+    auto lower_bound = get_arg(3);
+    auto upper_bound = get_arg(4);
 
     parent_coordinates = element.get_ptr("pcoord");
 
+    if (stmt->tls_prologue) {
+      stmt->tls_prologue->accept(this);
+    }
+
     if (stmt->bls_prologue) {
       call("block_barrier");  // "__syncthreads()"
       stmt->bls_prologue->accept(this);
       call("block_barrier");  // "__syncthreads()"
     }
 
+    llvm::Value *thread_idx = nullptr, *block_dim = nullptr;
+
     if (spmd) {
       thread_idx =
           builder->CreateIntrinsic(Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {});
@@ -1355,21 +1401,33 @@ void CodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt, bool spmd) {
       builder->CreateStore(lower_bound, loop_index);
     }
 
-    // test bb
-    auto test_bb = BasicBlock::Create(*llvm_context, "test", func);
-    auto body_bb = BasicBlock::Create(*llvm_context, "loop_body", func);
-    auto after_loop = BasicBlock::Create(*llvm_context, "after_loop", func);
+    auto loop_test_bb = BasicBlock::Create(*llvm_context, "loop_test", func);
+    auto loop_body_bb = BasicBlock::Create(*llvm_context, "loop_body", func);
+    auto body_tail_bb =
+        BasicBlock::Create(*llvm_context, "loop_body_tail", func);
+    auto func_exit = BasicBlock::Create(*llvm_context, "func_exit", func);
+    auto struct_for_body_bb =
+        BasicBlock::Create(*llvm_context, "struct_for_body_body", func);
+
+    builder->CreateBr(loop_test_bb);
 
-    builder->CreateBr(test_bb);
     {
-      builder->SetInsertPoint(test_bb);
+      // loop_test:
+      //   if (loop_index < upper_bound)
+      //     goto loop_body;
+      //   else
+      //     goto func_exit
+
+      builder->SetInsertPoint(loop_test_bb);
       auto cond =
           builder->CreateICmp(llvm::CmpInst::Predicate::ICMP_SLT,
                               builder->CreateLoad(loop_index), upper_bound);
-      builder->CreateCondBr(cond, body_bb, after_loop);
+      builder->CreateCondBr(cond, loop_body_bb, func_exit);
     }
 
-    builder->SetInsertPoint(body_bb);
+    // ***********************
+    // Begin loop_body_bb:
+    builder->SetInsertPoint(loop_body_bb);
 
     // initialize the coordinates
     auto refine =
@@ -1404,57 +1462,101 @@ void CodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt, bool spmd) {
 
     if (snode->type == SNodeType::bitmasked ||
         snode->type == SNodeType::pointer) {
-      // test if current voxel is active or not
+      // test whether the current voxel is active or not
       auto is_active = call(snode, element.get("element"), "is_active",
                             {builder->CreateLoad(loop_index)});
       is_active =
           builder->CreateTrunc(is_active, llvm::Type::getInt1Ty(*llvm_context));
       exec_cond = builder->CreateAnd(exec_cond, is_active);
     }
 
-    auto body_bb_tail =
-        BasicBlock::Create(*llvm_context, "loop_body_tail", func);
+    builder->CreateCondBr(exec_cond, struct_for_body_bb, body_tail_bb);
+
     {
-      auto bounded_body_bb =
-          BasicBlock::Create(*llvm_context, "bound_guarded_loop_body", func);
-      builder->CreateCondBr(exec_cond, bounded_body_bb, body_bb_tail);
-      builder->SetInsertPoint(bounded_body_bb);
+      builder->SetInsertPoint(struct_for_body_bb);
 
-      // The real loop body
+      // The real loop body of the StructForStmt
       stmt->body->accept(this);
 
-      builder->CreateBr(body_bb_tail);
+      builder->CreateBr(body_tail_bb);
     }
 
-    // body cfg
+    {
+      // body tail: increment loop_index and jump to loop_test
+      builder->SetInsertPoint(body_tail_bb);
 
-    builder->SetInsertPoint(body_bb_tail);
+      if (spmd) {
+        create_increment(loop_index, block_dim);
+      } else {
+        create_increment(loop_index, tlctx->get_constant(1));
+      }
+      builder->CreateBr(loop_test_bb);
 
-    if (spmd) {
-      create_increment(loop_index, block_dim);
-    } else {
-      create_increment(loop_index, tlctx->get_constant(1));
+      builder->SetInsertPoint(func_exit);
     }
-    builder->CreateBr(test_bb);
-
-    builder->SetInsertPoint(after_loop);
 
     if (stmt->bls_epilogue) {
       call("block_barrier");  // "__syncthreads()"
       stmt->bls_epilogue->accept(this);
       call("block_barrier");  // "__syncthreads()"
     }
+
+    if (stmt->tls_epilogue) {
+      stmt->tls_epilogue->accept(this);
+    }
   }
 
   int list_element_size =
       std::min(leaf_block->max_num_elements(), taichi_listgen_max_element_size);
   int num_splits = std::max(1, list_element_size / stmt->block_dim);
-  // traverse leaf node
+
+  auto struct_for_func = get_runtime_function("parallel_struct_for");
+
+  if (arch_is_gpu(current_arch())) {
+    // Note that on CUDA local array allocation must have a compile-time
+    // constant size. Therefore, instead of passing in the tls_buffer_size
+    // argument, we directly clone the "parallel_struct_for" function and
+    // replace the "alignas(8) char tls_buffer[1]" statement with "alignas(8)
+    // char tls_buffer[tls_buffer_size]" at compile time.
+
+    auto value_map = llvm::ValueToValueMapTy();
+    auto patched_struct_for_func =
+        llvm::CloneFunction(struct_for_func, value_map);
+
+    int replaced_alloca_types = 0;
+
+    // Find the "1" in "char tls_buffer[1]" and replace it with
+    // "tls_buffer_size"
+    for (auto &bb : *patched_struct_for_func) {
+      for (llvm::Instruction &inst : bb) {
+        auto alloca = llvm::dyn_cast<AllocaInst>(&inst);
+        if (!alloca || alloca->getAlignment() != 8)
+          continue;
+        auto alloca_type = alloca->getAllocatedType();
+        auto char_type = llvm::Type::getInt8Ty(*llvm_context);
+        // Allocated type should be array [1 x i8]
+        if (alloca_type->isArrayTy() &&
+            alloca_type->getArrayNumElements() == 1 &&
+            alloca_type->getArrayElementType() == char_type) {
+          auto new_type = llvm::ArrayType::get(char_type, stmt->tls_size);
+          alloca->setAllocatedType(new_type);
+          replaced_alloca_types += 1;
+        }
+      }
+    }
+
+    // There should be **exactly** one replacement.
+    TI_ASSERT(replaced_alloca_types == 1);
+
+    struct_for_func = patched_struct_for_func;
+  }
+  // Loop over nodes in the element list, in parallel
   create_call(
-      "parallel_struct_for",
+      struct_for_func,
       {get_context(), tlctx->get_constant(leaf_block->id),
        tlctx->get_constant(list_element_size), tlctx->get_constant(num_splits),
-       body, tlctx->get_constant(stmt->num_cpu_threads)});
+       body, tlctx->get_constant(stmt->tls_size),
+       tlctx->get_constant(stmt->num_cpu_threads)});
   // TODO: why do we need num_cpu_threads on GPUs?
 }
 

diff --git a/taichi/llvm/llvm_context.cpp b/taichi/llvm/llvm_context.cpp
@@ -702,7 +702,9 @@ void TaichiLLVMContext::eliminate_unused_functions(
   TI_AUTO_PROF
   using namespace llvm;
   TI_ASSERT(module);
-  if (0) {  // temporary fix for now to make LLVM 8 work with CUDA
+  if (false) {
+    // temporary fix for now to make LLVM 8 work with CUDA
+    // TODO: recover this when it's time
     if (llvm::verifyModule(*module, &llvm::errs())) {
       TI_ERROR("Module broken\n");
     }