Skip to content

Commit

Permalink
[Perf] Support thread local storage for reduction in struct-fors (taichi-dev#1941)
Browse files Browse the repository at this point in the history

Co-authored-by: Ye Kuang <[email protected]>
  • Loading branch information
yuanming-hu and k-ye authored Oct 12, 2020
1 parent 183e947 commit 2ccf0f5
Show file tree
Hide file tree
Showing 12 changed files with 183 additions and 72 deletions.
13 changes: 8 additions & 5 deletions misc/async_mgpcg.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
)

# grid parameters
N = 128
N = 256

n_mg_levels = 5
pre_and_post_smoothing = 2
Expand All @@ -40,12 +40,15 @@
new_zTr = ti.field(dtype=real, shape=())
pAp = ti.field(dtype=real, shape=())

grid = ti.root.pointer(ti.ijk, [N_tot // 4]).dense(ti.ijk, 4).place(x, p, Ap)
leaf_size = 8

grid = ti.root.pointer(ti.ijk,
[N_tot // leaf_size]).dense(ti.ijk,
leaf_size).place(x, p, Ap)

for l in range(n_mg_levels):
grid = ti.root.pointer(ti.ijk,
[N_tot // (4 * 2**l)]).dense(ti.ijk,
4).place(r[l], z[l])
grid = ti.root.pointer(ti.ijk, [N_tot // (leaf_size * 2**l)]).dense(
ti.ijk, leaf_size).place(r[l], z[l])

ti.root.place(alpha, beta, sum)

Expand Down
2 changes: 1 addition & 1 deletion python/taichi/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def get_rel_eps():


def approx(expected, **kwargs):
'''Tweaked pytest.approx for OpenGL low percisions'''
'''Tweaked pytest.approx for OpenGL low precisions'''
import pytest

class boolean_integer:
Expand Down
3 changes: 2 additions & 1 deletion taichi/backends/cpu/codegen_cpu.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include "codegen_cpu.h"
#include "taichi/backends/cpu/codegen_cpu.h"

#include "taichi/codegen/codegen_llvm.h"
#include "taichi/common/core.h"
Expand Down Expand Up @@ -30,6 +30,7 @@ class CodeGenLLVMCPU : public CodeGenLLVM {

auto *tls_prologue = create_xlogue(stmt->tls_prologue);

// The loop body
llvm::Function *body;
{
auto guard = get_function_creation_guard(
Expand Down
180 changes: 141 additions & 39 deletions taichi/codegen/codegen_llvm.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include "codegen_llvm.h"
#include "taichi/codegen/codegen_llvm.h"

#include "taichi/struct/struct_llvm.h"
#include "taichi/util/file_sequence_writer.h"
Expand Down Expand Up @@ -44,8 +44,8 @@ FunctionCreationGuard::FunctionCreationGuard(
llvm::Type::getVoidTy(*mb->llvm_context), arguments, false);

body = llvm::Function::Create(body_function_type,
llvm::Function::InternalLinkage, "loop_body",
mb->module.get());
llvm::Function::InternalLinkage,
"function_body", mb->module.get());
old_func = mb->func;
// emit into loop body function
mb->func = body;
Expand All @@ -59,7 +59,8 @@ FunctionCreationGuard::FunctionCreationGuard(
ip = mb->builder->saveIP();
mb->builder->SetInsertPoint(entry);

auto body_bb = BasicBlock::Create(*mb->llvm_context, "loop_body", mb->func);
auto body_bb =
BasicBlock::Create(*mb->llvm_context, "function_body", mb->func);
mb->builder->CreateBr(body_bb);
mb->builder->SetInsertPoint(body_bb);
}
Expand Down Expand Up @@ -1313,37 +1314,82 @@ std::tuple<llvm::Value *, llvm::Value *> CodeGenLLVM::get_range_for_bounds(
}

void CodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt, bool spmd) {
llvm::Function *body;
// TODO: instead of constructing tons of LLVM IR, writing the logic in
// runtime.cpp may be a cleaner solution. See
// CodeGenLLVMCPU::create_offload_range_for as an example.

llvm::Function *body = nullptr;
auto leaf_block = stmt->snode;
{
// Create the loop body function
auto guard = get_function_creation_guard({
llvm::PointerType::get(get_runtime_type("Context"), 0),
get_tls_buffer_type(),
llvm::PointerType::get(get_runtime_type("Element"), 0),
tlctx->get_data_type<int>(),
tlctx->get_data_type<int>(),
});

body = guard.body;

// per-leaf-block for loop
/* Function structure:
*
* function_body (entry):
* loop_index = lower_bound;
* tls_prologue()
* bls_prologue()
* goto loop_test
*
* loop_test:
* if (loop_index < upper_bound)
* goto loop_body
* else
* goto func_exit
*
* loop_body:
* initialize_coordinates()
* if (bitmasked voxel is active)
* goto struct_for_body
* else
* goto loop_body_tail
*
* struct_for_body:
* ... (Run codegen on the StructForStmt::body Taichi Block)
* goto loop_body_tail
*
* loop_body_tail:
* loop_index += block_dim
* goto loop_test
*
* func_exit:
* bls_epilogue()
* tls_epilogue()
* return
*/

auto loop_index =
create_entry_block_alloca(llvm::Type::getInt32Ty(*llvm_context));

llvm::Value *thread_idx = nullptr, *block_dim = nullptr;
RuntimeObject element("Element", this, builder.get(), get_arg(2));

RuntimeObject element("Element", this, builder.get(), get_arg(1));
auto lower_bound = get_arg(2);
auto upper_bound = get_arg(3);
// Loop ranges
auto lower_bound = get_arg(3);
auto upper_bound = get_arg(4);

parent_coordinates = element.get_ptr("pcoord");

if (stmt->tls_prologue) {
stmt->tls_prologue->accept(this);
}

if (stmt->bls_prologue) {
call("block_barrier"); // "__syncthreads()"
stmt->bls_prologue->accept(this);
call("block_barrier"); // "__syncthreads()"
}

llvm::Value *thread_idx = nullptr, *block_dim = nullptr;

if (spmd) {
thread_idx =
builder->CreateIntrinsic(Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {});
Expand All @@ -1355,21 +1401,33 @@ void CodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt, bool spmd) {
builder->CreateStore(lower_bound, loop_index);
}

// test bb
auto test_bb = BasicBlock::Create(*llvm_context, "test", func);
auto body_bb = BasicBlock::Create(*llvm_context, "loop_body", func);
auto after_loop = BasicBlock::Create(*llvm_context, "after_loop", func);
auto loop_test_bb = BasicBlock::Create(*llvm_context, "loop_test", func);
auto loop_body_bb = BasicBlock::Create(*llvm_context, "loop_body", func);
auto body_tail_bb =
BasicBlock::Create(*llvm_context, "loop_body_tail", func);
auto func_exit = BasicBlock::Create(*llvm_context, "func_exit", func);
auto struct_for_body_bb =
BasicBlock::Create(*llvm_context, "struct_for_body_body", func);

builder->CreateBr(loop_test_bb);

builder->CreateBr(test_bb);
{
builder->SetInsertPoint(test_bb);
// loop_test:
// if (loop_index < upper_bound)
// goto loop_body;
// else
// goto func_exit

builder->SetInsertPoint(loop_test_bb);
auto cond =
builder->CreateICmp(llvm::CmpInst::Predicate::ICMP_SLT,
builder->CreateLoad(loop_index), upper_bound);
builder->CreateCondBr(cond, body_bb, after_loop);
builder->CreateCondBr(cond, loop_body_bb, func_exit);
}

builder->SetInsertPoint(body_bb);
// ***********************
// Begin loop_body_bb:
builder->SetInsertPoint(loop_body_bb);

// initialize the coordinates
auto refine =
Expand Down Expand Up @@ -1404,57 +1462,101 @@ void CodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt, bool spmd) {

if (snode->type == SNodeType::bitmasked ||
snode->type == SNodeType::pointer) {
// test if current voxel is active or not
// test whether the current voxel is active or not
auto is_active = call(snode, element.get("element"), "is_active",
{builder->CreateLoad(loop_index)});
is_active =
builder->CreateTrunc(is_active, llvm::Type::getInt1Ty(*llvm_context));
exec_cond = builder->CreateAnd(exec_cond, is_active);
}

auto body_bb_tail =
BasicBlock::Create(*llvm_context, "loop_body_tail", func);
builder->CreateCondBr(exec_cond, struct_for_body_bb, body_tail_bb);

{
auto bounded_body_bb =
BasicBlock::Create(*llvm_context, "bound_guarded_loop_body", func);
builder->CreateCondBr(exec_cond, bounded_body_bb, body_bb_tail);
builder->SetInsertPoint(bounded_body_bb);
builder->SetInsertPoint(struct_for_body_bb);

// The real loop body
// The real loop body of the StructForStmt
stmt->body->accept(this);

builder->CreateBr(body_bb_tail);
builder->CreateBr(body_tail_bb);
}

// body cfg
{
// body tail: increment loop_index and jump to loop_test
builder->SetInsertPoint(body_tail_bb);

builder->SetInsertPoint(body_bb_tail);
if (spmd) {
create_increment(loop_index, block_dim);
} else {
create_increment(loop_index, tlctx->get_constant(1));
}
builder->CreateBr(loop_test_bb);

if (spmd) {
create_increment(loop_index, block_dim);
} else {
create_increment(loop_index, tlctx->get_constant(1));
builder->SetInsertPoint(func_exit);
}
builder->CreateBr(test_bb);

builder->SetInsertPoint(after_loop);

if (stmt->bls_epilogue) {
call("block_barrier"); // "__syncthreads()"
stmt->bls_epilogue->accept(this);
call("block_barrier"); // "__syncthreads()"
}

if (stmt->tls_epilogue) {
stmt->tls_epilogue->accept(this);
}
}

int list_element_size =
std::min(leaf_block->max_num_elements(), taichi_listgen_max_element_size);
int num_splits = std::max(1, list_element_size / stmt->block_dim);
// traverse leaf node

auto struct_for_func = get_runtime_function("parallel_struct_for");

if (arch_is_gpu(current_arch())) {
// Note that on CUDA local array allocation must have a compile-time
// constant size. Therefore, instead of passing in the tls_buffer_size
// argument, we directly clone the "parallel_struct_for" function and
// replace the "alignas(8) char tls_buffer[1]" statement with "alignas(8)
// char tls_buffer[tls_buffer_size]" at compile time.

auto value_map = llvm::ValueToValueMapTy();
auto patched_struct_for_func =
llvm::CloneFunction(struct_for_func, value_map);

int replaced_alloca_types = 0;

// Find the "1" in "char tls_buffer[1]" and replace it with
// "tls_buffer_size"
for (auto &bb : *patched_struct_for_func) {
for (llvm::Instruction &inst : bb) {
auto alloca = llvm::dyn_cast<AllocaInst>(&inst);
if (!alloca || alloca->getAlignment() != 8)
continue;
auto alloca_type = alloca->getAllocatedType();
auto char_type = llvm::Type::getInt8Ty(*llvm_context);
// Allocated type should be array [1 x i8]
if (alloca_type->isArrayTy() &&
alloca_type->getArrayNumElements() == 1 &&
alloca_type->getArrayElementType() == char_type) {
auto new_type = llvm::ArrayType::get(char_type, stmt->tls_size);
alloca->setAllocatedType(new_type);
replaced_alloca_types += 1;
}
}
}

// There should be **exactly** one replacement.
TI_ASSERT(replaced_alloca_types == 1);

struct_for_func = patched_struct_for_func;
}
// Loop over nodes in the element list, in parallel
create_call(
"parallel_struct_for",
struct_for_func,
{get_context(), tlctx->get_constant(leaf_block->id),
tlctx->get_constant(list_element_size), tlctx->get_constant(num_splits),
body, tlctx->get_constant(stmt->num_cpu_threads)});
body, tlctx->get_constant(stmt->tls_size),
tlctx->get_constant(stmt->num_cpu_threads)});
// TODO: why do we need num_cpu_threads on GPUs?
}

Expand Down
4 changes: 3 additions & 1 deletion taichi/llvm/llvm_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -702,7 +702,9 @@ void TaichiLLVMContext::eliminate_unused_functions(
TI_AUTO_PROF
using namespace llvm;
TI_ASSERT(module);
if (0) { // temporary fix for now to make LLVM 8 work with CUDA
if (false) {
// temporary fix for now to make LLVM 8 work with CUDA
// TODO: re-enable this llvm::verifyModule check once it no longer breaks LLVM 8 + CUDA
if (llvm::verifyModule(*module, &llvm::errs())) {
TI_ERROR("Module broken\n");
}
Expand Down
Loading

0 comments on commit 2ccf0f5

Please sign in to comment.