[refactor] [llvm] Allocate the runtime context for real functions on the stack (taichi-dev#7971)

Issue: #

### Brief Summary

### <samp>🤖 Generated by Copilot at a4f1255</samp>

This pull request removes the `gc_rc` task type and the runtime context
buffer from the codebase. Together they implemented garbage collection for
the memory pool that held the `RuntimeContext` allocated for each call to a
real function. They are no longer needed because the runtime context is now
allocated on the caller's stack instead of the heap. This simplifies the
codegen and runtime logic and reduces the memory and performance overhead of
the LLVM and CUDA backends.
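
In IR terms, the core of the change is a single building step: instead of fetching a pooled `RuntimeContext` from the runtime and recycling it after the call, the generated caller now reserves the context in its own stack frame with an `alloca`. The sketch below shows that step using the plain LLVM C++ API; the struct layout and the surrounding function are simplified stand-ins for illustration, not Taichi's actual definitions.

```cpp
// Minimal sketch: reserve a RuntimeContext-like struct on the stack of the
// generated caller, mirroring builder->CreateAlloca(get_runtime_type("RuntimeContext"))
// in TaskCodeGenLLVM::visit(FuncCallStmt *). The struct fields and function
// name here are illustrative stand-ins, not Taichi's real definitions.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext ctx;
  Module mod("stack_ctx_demo", ctx);
  IRBuilder<> builder(ctx);

  // Stand-in for the RuntimeContext struct.
  auto *rt_ctx_ty = StructType::create(
      ctx, {builder.getInt64Ty(), builder.getInt32Ty()}, "RuntimeContext");

  // A caller that prepares a context for a callee entirely in its own frame.
  auto *fn_ty = FunctionType::get(builder.getVoidTy(), /*isVarArg=*/false);
  auto *caller = Function::Create(fn_ty, Function::ExternalLinkage,
                                  "call_real_func", &mod);
  builder.SetInsertPoint(BasicBlock::Create(ctx, "entry", caller));

  // One stack slot per call site; it is reclaimed automatically when the
  // frame is popped, so no recycle_runtime_context / gc_rc pass is needed.
  AllocaInst *new_ctx = builder.CreateAlloca(rt_ctx_ty, nullptr, "ctx");
  (void)new_ctx;  // the real codegen fills in arguments and passes it on
  builder.CreateRetVoid();

  verifyModule(mod, &errs());
  mod.print(outs(), nullptr);
  return 0;
}
```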

### Walkthrough

### <samp>🤖 Generated by Copilot at a4f1255</samp>

* Remove the `gc_rc` task type and its related code, as it is no longer needed for runtime context management
([link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-331e185d09584e49150b1695bf6ecc1c3b558b805075984750bb94f4de508975L7),
[link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-05e2a2d0a9c9879a4fb5fde9baf5a43738c7601fc53e234a40ab9bc27d1512a5L321-L322),
[link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-917d9436dcaafa0f1e41ae9bad90273a303f036f00da94e417788a7fa1dc5260L1363-R1363),
[link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-77422b8748a46e70519217be594cd28433edadf98ca4960ce116f85da8dbccc3L658-L659),
[link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-d47b571f975c1002b8cb93634ac2a3d5f090f3fa9676ec3e0004c2ec4116ee21L696-L702))
* Remove the codegen functions and branches for the `gc_rc` task type in the CPU, CUDA, and LLVM backends (`codegen_cpu.cpp`, `codegen_cuda.cpp`, `codegen_llvm.cpp`, `codegen_llvm.h`)
([link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-b6e86fbdf536db46b321f67942f66d809c213a4142ceb9f5f81d016684c2d5c8L190-L191),
[link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-50537ad5ea3b900c0d55a088f3cc285986340ad68c9b96fea481187c4dce49eaL559-L588),
[link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-50537ad5ea3b900c0d55a088f3cc285986340ad68c9b96fea481187c4dce49eaL633-L634),
[link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-3c663c78745adcd3f6a7ac81fe99e628decc3040f292ea1e20ecd4b85a7f4313L1134-L1137),
[link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-3c663c78745adcd3f6a7ac81fe99e628decc3040f292ea1e20ecd4b85a7f4313L2776-R2772),
[link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-3c663c78745adcd3f6a7ac81fe99e628decc3040f292ea1e20ecd4b85a7f4313L2795),
[link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-aebc3d71bb555fba77f1d303d5c29ac7e07d392440b0e54cf556ff5a10a81d0aL167))
* Modify the codegen for `FuncCallStmt` (calls to real functions) in the LLVM backend to allocate the runtime context on the stack instead of the heap (`codegen_llvm.cpp`)
([link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-3c663c78745adcd3f6a7ac81fe99e628decc3040f292ea1e20ecd4b85a7f4313L2776-R2772),
[link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-3c663c78745adcd3f6a7ac81fe99e628decc3040f292ea1e20ecd4b85a7f4313L2795))
* Remove the initialization and management of the runtime context buffer in the LLVM runtime executor and runtime module (`llvm_runtime_executor.cpp`, `runtime.cpp`); a simplified sketch of the pooled scheme being removed follows this list
([link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-b9155792159f392bd8bacd44cb1819be5239b022d707499fc364c0f93dd8c5e5L700-L703),
[link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-980b2254ce0f4c654a946673ab6cd7a84f78cc6f0d6560bc1361670ec6e678c4L561),
[link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-980b2254ce0f4c654a946673ab6cd7a84f78cc6f0d6560bc1361670ec6e678c4L869-L877),
[link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-980b2254ce0f4c654a946673ab6cd7a84f78cc6f0d6560bc1361670ec6e678c4L955-L959),
[link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-980b2254ce0f4c654a946673ab6cd7a84f78cc6f0d6560bc1361670ec6e678c4L1670-L1673),
[link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-980b2254ce0f4c654a946673ab6cd7a84f78cc6f0d6560bc1361670ec6e678c4L1705-L1709),
[link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-980b2254ce0f4c654a946673ab6cd7a84f78cc6f0d6560bc1361670ec6e678c4L1727-L1731),
[link](https://github.com/taichi-dev/taichi/pull/7971/files?diff=unified&w=0#diff-980b2254ce0f4c654a946673ab6cd7a84f78cc6f0d6560bc1361670ec6e678c4L1775-R1746))
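
To make the `runtime.cpp` removals easier to follow, the sketch below is a deliberately simplified, hypothetical pool with the same allocate / recycle / gc shape as the `NodeManager` that backed `runtime_context_buffer_allocator`. Recycled chunks only become reusable after a gc pass, which is why the pooled scheme needed the dedicated `gc_rc` task (serial on CPU, three `gc_rc_parallel_*` kernels on CUDA); a stack slot needs none of this. Names and details are illustrative, not Taichi's actual `NodeManager`.

```cpp
#include <cstddef>
#include <vector>

// FixedPool: a hypothetical stand-in for the NodeManager-backed pool behind
// runtime_context_buffer_allocator. Not Taichi's real code.
struct FixedPool {
  explicit FixedPool(std::size_t chunk_size) : chunk_size_(chunk_size) {}

  // Hand out a previously gc'd chunk if one is available, else grow the pool
  // (what allocate_runtime_context relied on).
  void *allocate() {
    if (!free_list_.empty()) {
      void *p = free_list_.back();
      free_list_.pop_back();
      return p;
    }
    storage_.emplace_back(chunk_size_);
    return storage_.back().data();
  }

  // Recycled chunks are parked; they are not reusable until the next gc pass
  // (what recycle_runtime_context did).
  void recycle(void *p) { recycle_list_.push_back(p); }

  // Move everything recycled since the last pass back onto the free list
  // (what the gc_rc offloaded task triggered).
  void gc_serial() {
    free_list_.insert(free_list_.end(), recycle_list_.begin(),
                      recycle_list_.end());
    recycle_list_.clear();
  }

 private:
  std::size_t chunk_size_;
  std::vector<std::vector<unsigned char>> storage_;
  std::vector<void *> free_list_;
  std::vector<void *> recycle_list_;
};

int main() {
  FixedPool pool(/*chunk_size=*/256);
  void *ctx = pool.allocate();  // one context per real-function call
  pool.recycle(ctx);            // returned after the call
  pool.gc_serial();             // periodic gc_rc pass made it reusable again
  return 0;
}
```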
lin-hitonami authored May 10, 2023
1 parent 89db415 commit c3e1d3f
Showing 11 changed files with 2 additions and 93 deletions.

taichi/codegen/cpu/codegen_cpu.cpp (2 changes: 0 additions & 2 deletions)
@@ -187,8 +187,6 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
emit_list_gen(stmt);
} else if (stmt->task_type == Type::gc) {
emit_gc(stmt);
} else if (stmt->task_type == Type::gc_rc) {
emit_gc_rc();
} else {
TI_NOT_IMPLEMENTED
}

taichi/codegen/cuda/codegen_cuda.cpp (32 changes: 0 additions & 32 deletions)
@@ -556,36 +556,6 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
}
}

void emit_cuda_gc_rc(OffloadedStmt *stmt) {
{
init_offloaded_task_function(stmt, "gather_list");
call("gc_rc_parallel_0", get_context());
finalize_offloaded_task_function();
current_task->grid_dim = compile_config.saturating_grid_dim;
current_task->block_dim = 64;
offloaded_tasks.push_back(*current_task);
current_task = nullptr;
}
{
init_offloaded_task_function(stmt, "reinit_lists");
call("gc_rc_parallel_1", get_context());
finalize_offloaded_task_function();
current_task->grid_dim = 1;
current_task->block_dim = 1;
offloaded_tasks.push_back(*current_task);
current_task = nullptr;
}
{
init_offloaded_task_function(stmt, "zero_fill");
call("gc_rc_parallel_2", get_context());
finalize_offloaded_task_function();
current_task->grid_dim = compile_config.saturating_grid_dim;
current_task->block_dim = 64;
offloaded_tasks.push_back(*current_task);
current_task = nullptr;
}
}

bool kernel_argument_by_val() const override {
return true; // on CUDA, pass the argument by value
}
@@ -630,8 +600,6 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
if (stmt->task_type == Type::gc) {
// gc has 3 kernels, so we treat it specially
emit_cuda_gc(stmt);
} else if (stmt->task_type == Type::gc_rc) {
emit_cuda_gc_rc(stmt);
} else {
init_offloaded_task_function(stmt);
if (stmt->task_type == Type::serial) {

taichi/codegen/llvm/codegen_llvm.cpp (7 changes: 1 addition & 6 deletions)
@@ -1131,10 +1131,6 @@ void TaskCodeGenLLVM::emit_gc(OffloadedStmt *stmt) {
call("node_gc", get_runtime(), tlctx->get_constant(snode));
}

void TaskCodeGenLLVM::emit_gc_rc() {
call("runtime_context_gc", get_runtime());
}

void TaskCodeGenLLVM::create_increment(llvm::Value *ptr, llvm::Value *value) {
auto original_value = builder->CreateLoad(value->getType(), ptr);
builder->CreateStore(builder->CreateAdd(original_value, value), ptr);
@@ -2773,7 +2769,7 @@ void TaskCodeGenLLVM::visit(FuncCallStmt *stmt) {
current_callable = old_callable;
}
llvm::Function *llvm_func = func_map[stmt->func];
auto *new_ctx = call("allocate_runtime_context", get_runtime());
auto *new_ctx = builder->CreateAlloca(get_runtime_type("RuntimeContext"));
call("RuntimeContext_set_runtime", new_ctx, get_runtime());
if (!stmt->func->parameter_list.empty()) {
auto *buffer =
@@ -2792,7 +2788,6 @@ void TaskCodeGenLLVM::visit(FuncCallStmt *stmt) {
}
call(llvm_func, new_ctx);
llvm_val[stmt] = result_buffer;
call("recycle_runtime_context", get_runtime(), new_ctx);
}

void TaskCodeGenLLVM::visit(GetElementStmt *stmt) {

taichi/codegen/llvm/codegen_llvm.h (1 change: 0 additions & 1 deletion)
@@ -164,7 +164,6 @@ class TaskCodeGenLLVM : public IRVisitor, public LLVMModuleBuilder {
void emit_list_gen(OffloadedStmt *listgen);

void emit_gc(OffloadedStmt *stmt);
void emit_gc_rc();

llvm::Value *call(SNode *snode,
llvm::Value *node_ptr,

taichi/inc/offloaded_task_type.inc.h (1 change: 0 additions & 1 deletion)
@@ -4,4 +4,3 @@ PER_TASK_TYPE(struct_for)
PER_TASK_TYPE(mesh_for)
PER_TASK_TYPE(listgen)
PER_TASK_TYPE(gc)
PER_TASK_TYPE(gc_rc)

taichi/ir/statements.cpp (2 changes: 0 additions & 2 deletions)
@@ -318,8 +318,6 @@ std::string OffloadedStmt::task_name() const {
} else if (task_type == TaskType::gc) {
TI_ASSERT(snode);
return fmt::format("gc_{}", snode->name);
} else if (task_type == TaskType::gc_rc) {
return fmt::format("gc_rc");
} else {
TI_NOT_IMPLEMENTED
}

taichi/ir/statements.h (3 changes: 1 addition & 2 deletions)
@@ -1360,8 +1360,7 @@ class OffloadedStmt : public Stmt {
static std::string task_type_name(TaskType tt);

bool has_body() const {
return task_type != TaskType::listgen && task_type != TaskType::gc &&
task_type != TaskType::gc_rc;
return task_type != TaskType::listgen && task_type != TaskType::gc;
}

bool is_container_statement() const override {

taichi/runtime/llvm/llvm_runtime_executor.cpp (4 changes: 0 additions & 4 deletions)
@@ -697,10 +697,6 @@ void LlvmRuntimeExecutor::materialize_runtime(KernelProfilerBase *profiler,
"LLVMRuntime_set_profiler_stop", llvm_runtime_,
(void *)&KernelProfilerBase::profiler_stop);
}
if (arch_is_cpu(config_.arch) || config_.arch == Arch::cuda) {
runtime_jit->call<void *>("runtime_initialize_runtime_context_buffer",
llvm_runtime_);
}
}

void LlvmRuntimeExecutor::destroy_snode_tree(SNodeTree *snode_tree) {

taichi/runtime/llvm/runtime_module/runtime.cpp (34 changes: 0 additions & 34 deletions)
@@ -558,7 +558,6 @@ struct LLVMRuntime {
parallel_for_type parallel_for;
ListManager *element_lists[taichi_max_num_snodes];
NodeManager *node_allocators[taichi_max_num_snodes];
NodeManager *runtime_context_buffer_allocator;
Ptr ambient_elements[taichi_max_num_snodes];
Ptr temporaries;
RandState *rand_states;
@@ -866,15 +865,6 @@ Ptr LLVMRuntime::allocate_from_reserved_memory(
return ret;
}

RuntimeContext *allocate_runtime_context(LLVMRuntime *runtime) {
return (RuntimeContext *)
runtime->runtime_context_buffer_allocator->allocate();
}

void recycle_runtime_context(LLVMRuntime *runtime, RuntimeContext *ptr) {
runtime->runtime_context_buffer_allocator->recycle((Ptr)ptr);
}

// External API
// [ON HOST] CPU backend
// [ON DEVICE] CUDA/AMDGPU backend
@@ -952,11 +942,6 @@ void runtime_initialize_memory(LLVMRuntime *runtime,
}
}

void runtime_initialize_runtime_context_buffer(LLVMRuntime *runtime) {
runtime->runtime_context_buffer_allocator =
runtime->create<NodeManager>(runtime, sizeof(RuntimeContext), 4096);
}

void runtime_initialize_rand_states_cuda(LLVMRuntime *runtime,
int starting_rand_state) {
int i = block_dim() * block_idx() + thread_idx();
@@ -1667,10 +1652,6 @@ void node_gc(LLVMRuntime *runtime, int snode_id) {
runtime->node_allocators[snode_id]->gc_serial();
}

void runtime_context_gc(LLVMRuntime *runtime) {
runtime->runtime_context_buffer_allocator->gc_serial();
}

void gc_parallel_impl_0(RuntimeContext *context, NodeManager *allocator) {
auto free_list = allocator->free_list;
auto free_list_size = free_list->size();
@@ -1702,11 +1683,6 @@ void gc_parallel_0(RuntimeContext *context, int snode_id) {
gc_parallel_impl_0(context, runtime->node_allocators[snode_id]);
}

void gc_rc_parallel_0(RuntimeContext *context) {
LLVMRuntime *runtime = context->runtime;
gc_parallel_impl_0(context, runtime->runtime_context_buffer_allocator);
}

void gc_parallel_impl_1(NodeManager *allocator) {
auto free_list = allocator->free_list;

@@ -1724,11 +1700,6 @@ void gc_parallel_1(RuntimeContext *context, int snode_id) {
gc_parallel_impl_1(runtime->node_allocators[snode_id]);
}

void gc_rc_parallel_1(RuntimeContext *context) {
LLVMRuntime *runtime = context->runtime;
gc_parallel_impl_1(runtime->runtime_context_buffer_allocator);
}

void gc_parallel_impl_2(NodeManager *allocator) {
auto elements = allocator->recycle_list_size_backup;
auto free_list = allocator->free_list;
@@ -1772,11 +1743,6 @@ void gc_parallel_2(RuntimeContext *context, int snode_id) {
LLVMRuntime *runtime = context->runtime;
gc_parallel_impl_2(runtime->node_allocators[snode_id]);
}

void gc_rc_parallel_2(RuntimeContext *context) {
LLVMRuntime *runtime = context->runtime;
gc_parallel_impl_2(runtime->runtime_context_buffer_allocator);
}
}

extern "C" {

taichi/transforms/ir_printer.cpp (2 changes: 0 additions & 2 deletions)
@@ -655,8 +655,6 @@ class IRPrinter : public IRVisitor {
} else if (stmt->task_type == OffloadedTaskType::gc) {
print("{} = offloaded garbage collect {}", stmt->name(),
stmt->snode->get_node_type_name_hinted());
} else if (stmt->task_type == OffloadedTaskType::gc_rc) {
print("{} = offloaded garbage collect runtime context", stmt->name());
} else {
print("{} = offloaded {} ", stmt->name(), details);
if (stmt->tls_prologue) {

taichi/transforms/offload.cpp (7 changes: 0 additions & 7 deletions)
@@ -693,13 +693,6 @@ void insert_gc(IRNode *root, const CompileConfig &config) {
}
}
}
if (!irpass::analysis::gather_statements(root, [](Stmt *stmt) {
return stmt->is<FuncCallStmt>();
}).empty()) {
auto gc_task = Stmt::make_typed<OffloadedStmt>(
OffloadedStmt::TaskType::gc_rc, config.arch);
b->insert(std::move(gc_task));
}
}

class AssociateContinueScope : public BasicStmtVisitor {