Skip to content

Commit

Permalink
[build] Delete all llvm10 code and TI_LLVM_15 macro (taichi-dev#6685)
Browse files Browse the repository at this point in the history
  • Loading branch information
ailzhang authored Nov 21, 2022
1 parent b218fb5 commit 50de9bc
Show file tree
Hide file tree
Showing 16 changed files with 56 additions and 494 deletions.
1 change: 0 additions & 1 deletion .github/workflows/scripts/win_build.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ if ($llvmVer -eq "10") {
$env:LLVM_DIR = "$libsDir\taichi_llvm_15"
$env:TAICHI_CMAKE_ARGS += " -DCLANG_EXECUTABLE=$($libsDir -replace "\\", "\\")\\taichi_clang_15\\bin\\clang++.exe"
$env:TAICHI_CMAKE_ARGS += " -DLLVM_AS_EXECUTABLE=$($libsDir -replace "\\", "\\")\\taichi_llvm_15\\bin\\llvm-as.exe"
$env:TAICHI_CMAKE_ARGS += " -DTI_LLVM_15:BOOL=ON"
} else {
throw "Unsupported LLVM version"
}
Expand Down
6 changes: 0 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,6 @@ if (WIN32)
# MSVC in `Debug` config because MSVC would try to fill uninitialized memory
# with `0xCC`, but that also breaks `LLVMTableGen`, which almost every
# component in LLVM depends on.
#
# FIXME: (penguinliong) This is fixed in later releases of LLVM so maybe
# someday we can distribute `Debug` libraries, if it's ever needed.
if (NOT TI_LLVM_15)
SET(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDLL)
endif()
message("CMAKE_MSVC_RUNTIME_LIBRARY: ${CMAKE_MSVC_RUNTIME_LIBRARY}")
endif()

Expand Down
1 change: 0 additions & 1 deletion ci/windows/win_build_test.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ if (!$llvmVer.CompareTo("10")) {
} else {
$env:TAICHI_CMAKE_ARGS += " -DCLANG_EXECUTABLE=C:\\taichi_clang_15\\bin\\clang++.exe"
$env:TAICHI_CMAKE_ARGS += " -DLLVM_AS_EXECUTABLE=C:\\taichi_llvm_15\\bin\\llvm-as.exe"
$env:TAICHI_CMAKE_ARGS += " -DTI_LLVM_15:BOOL=ON"
$env:TAICHI_CMAKE_ARGS += " -DTI_WITH_DX12:BOOL=ON"
}

Expand Down
7 changes: 0 additions & 7 deletions cmake/TaichiCore.cmake
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
option(USE_STDCPP "Use -stdlib=libc++" OFF)
option(TI_WITH_LLVM "Build with LLVM backends" ON)
option(TI_LLVM_15 "Switch to LLVM 15" ON)
option(TI_WITH_METAL "Build with the Metal backend" ON)
option(TI_WITH_CUDA "Build with the CUDA backend" ON)
option(TI_WITH_CUDA_TOOLKIT "Build with the CUDA toolkit" OFF)
Expand Down Expand Up @@ -98,12 +97,6 @@ if(TI_WITH_LLVM)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_LLVM")
endif()

if (TI_LLVM_15)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_LLVM_15")
else()
set(TI_WITH_DX12 OFF)
endif()

## This version var is only used to locate slim_libdevice.10.bc
if(NOT CUDA_VERSION)
set(CUDA_VERSION 10.0)
Expand Down
8 changes: 0 additions & 8 deletions taichi/codegen/cpu/codegen_cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,12 +103,8 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {

{
builder->SetInsertPoint(loop_test_bb);
#ifdef TI_LLVM_15
auto *loop_index_load =
builder->CreateLoad(builder->getInt32Ty(), loop_index);
#else
auto *loop_index_load = builder->CreateLoad(loop_index);
#endif
auto cond = builder->CreateICmp(
llvm::CmpInst::Predicate::ICMP_SLT, loop_index_load,
llvm_val[stmt->owned_num_local.find(stmt->major_from_type)
Expand All @@ -123,12 +119,8 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
auto &s = stmt->body->statements[i];
s->accept(this);
}
#ifdef TI_LLVM_15
auto *loop_index_load =
builder->CreateLoad(builder->getInt32Ty(), loop_index);
#else
auto *loop_index_load = builder->CreateLoad(loop_index);
#endif
builder->CreateStore(
builder->CreateAdd(loop_index_load, tlctx->get_constant(1)),
loop_index);
Expand Down
172 changes: 6 additions & 166 deletions taichi/codegen/cuda/codegen_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,7 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
auto value_arr = builder->CreateAlloca(stype);
for (int i = 0; i < values.size(); i++) {
auto value_ptr = builder->CreateGEP(
#ifdef TI_LLVM_15
stype,
#endif
value_arr, {tlctx->get_constant(0), tlctx->get_constant(i)});
stype, value_arr, {tlctx->get_constant(0), tlctx->get_constant(i)});
builder->CreateStore(values[i], value_ptr);
}
return LLVMModuleBuilder::call(
Expand Down Expand Up @@ -259,156 +256,6 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
llvm_val[stmt->val]);
}

// LLVM 15 already supports f16 atomics, see
// https://github.com/llvm/llvm-project/commit/0cb08e448af7167ada767e0526aa44980e72ad08
// (reviewed at https://reviews.llvm.org/D52416),
// so the f16 hack below is limited to LLVM 10 builds.
#ifndef TI_LLVM_15
// A huge hack for supporting f16 atomic add/max/min! Borrowed from
// https://github.com/tensorflow/tensorflow/blob/470d58a83470f8ede3beaa584e6992bc71b7baa6/tensorflow/compiler/xla/service/gpu/ir_emitter.cc#L378-L490
// The reason is that LLVM10 does not support generating atomicCAS for f16 on
// NVPTX backend.
//
// Implements atomic binary operations using atomic compare-and-swap
// (atomicCAS) as follows:
// 1. Reads the value from the memory pointed to by output_address and
// records it as old_output.
// 2. Uses old_output as one of the source operand to perform the binary
// operation and stores the result in new_output.
// 3. Calls atomicCAS which implements compare-and-swap as an atomic
// operation. In particular, atomicCAS reads the value from the memory
// pointed to by output_address, and compares the value with old_output.
// If the two values equal, new_output is written to the same memory
// location and true is returned to indicate that the atomic operation
// succeeds. Otherwise, the new value read from the memory is returned. In
// this case, the new value is copied to old_output, and steps 2. and 3.
// are repeated until atomicCAS succeeds.
//
// int32 is used for the atomicCAS operation. So atomicCAS reads and writes 32
// bit values from the memory, which is larger than the memory size required
// by the original atomic binary operation. We mask off the last two bits of
// the output_address and use the result as an address to read the 32 bit
// values from the memory.
//
// This can avoid out of bound memory accesses, based on the assumption:
// All buffers are 4 byte aligned and have a size of 4N.
//
// The pseudo code is shown below.
//
// cas_new_output_address = alloca(32);
// cas_old_output_address = alloca(32);
// atomic_address = output_address & ((int64)(-4));
// new_output_address = cas_new_output_address + (output_address & 3);
//
// *cas_old_output_address = *atomic_address;
// do {
// *cas_new_output_address = *cas_old_output_address;
// *new_output_address = operation(*new_output_address, *source_address);
// (*cas_old_output_address, success) =
// atomicCAS(atomic_address, *cas_old_output_address,
// *cas_new_output_address);
// } while (!success);
//

// Emits a compare-and-swap retry loop that applies `op` to the element stored
// at `output_address`, using a 32-bit cmpxchg on the 4-byte-aligned word that
// contains that element.
//
// output_address: pointer to the element being updated; its LLVM type must be
//                 a pointer type (asserted below).
// val:            second operand handed to `op`.
// op:             callback that emits the binary operation. It is invoked
//                 once, while the loop body is being built, as
//                 op(<current element value>, val).
//
// Returns `output_address` unchanged. When this function returns, the
// builder's insert point is the loop exit block.
llvm::Value *atomic_op_using_cas(
    llvm::Value *output_address,
    llvm::Value *val,
    std::function<llvm::Value *(llvm::Value *, llvm::Value *)> op) override {
  llvm::PointerType *output_address_type =
      llvm::dyn_cast<llvm::PointerType>(output_address->getType());
  TI_ASSERT(output_address_type != nullptr);

  // Pointer type used to address the element the binary operation works on.
  llvm::Type *elem_ty = output_address_type->getPointerElementType();
  llvm::Type *elem_ptr_ty = elem_ty->getPointerTo();

  // The cmpxchg itself always operates on a 32-bit integer living in the same
  // address space as the output.
  const int cas_bits = 32;
  llvm::Type *word_ty = builder->getIntNTy(cas_bits);
  llvm::Type *word_ptr_ty =
      word_ty->getPointerTo(output_address_type->getPointerAddressSpace());

  // Scratch slots holding the expected ("old") and desired ("new") words
  // across iterations of the retry loop.
  llvm::Value *old_word_slot = builder->CreateAlloca(word_ty, nullptr);
  llvm::Value *new_word_slot = builder->CreateAlloca(word_ty, nullptr);

  // Split the output address into the aligned word address (low two bits
  // cleared) and the byte offset of the element inside that word.
  llvm::Type *intptr_ty =
      module->getDataLayout().getIntPtrType(output_address_type);
  llvm::Value *raw_addr = builder->CreatePtrToInt(output_address, intptr_ty);
  llvm::Value *low_bits = llvm::ConstantInt::get(intptr_ty, 3);
  llvm::Value *byte_offset = builder->CreateAnd(raw_addr, low_bits);
  llvm::Value *high_bits = llvm::ConstantInt::get(intptr_ty, -4);
  llvm::Value *aligned_addr = builder->CreateAnd(raw_addr, high_bits);
  llvm::Value *word_ptr = builder->CreateIntToPtr(aligned_addr, word_ptr_ty);

  // Address of the element inside the "new" scratch word, at the same byte
  // offset the element has inside the real word.
  llvm::Value *new_slot_addr = builder->CreatePtrToInt(new_word_slot, intptr_ty);
  llvm::Value *elem_addr = builder->CreateAdd(new_slot_addr, byte_offset);
  llvm::Value *elem_ptr = builder->CreateIntToPtr(elem_addr, elem_ptr_ty);

  // Seed the "old" slot with the word currently in memory.
  llvm::Value *seed_word = builder->CreateLoad(word_ptr, "cas_old_output");
  builder->CreateStore(seed_word, old_word_slot);

  llvm::BasicBlock *retry_bb =
      BasicBlock::Create(*llvm_context, "atomic_op_loop_body", func);
  llvm::BasicBlock *done_bb =
      BasicBlock::Create(*llvm_context, "loop_exit_bb", func);
  builder->CreateBr(retry_bb);
  builder->SetInsertPoint(retry_bb);

  // --- one compare-and-swap attempt ---

  // Start the "new" word as a copy of the expected word.
  llvm::Value *expected_word =
      builder->CreateLoad(old_word_slot, "cas_old_output");
  builder->CreateStore(expected_word, new_word_slot);

  // Apply the binary operation to the element embedded in the "new" word.
  llvm::Value *current_elem = builder->CreateLoad(elem_ptr);
  auto updated_elem = op(current_elem, val);
  builder->CreateStore(updated_elem, elem_ptr);

  llvm::Value *desired_word =
      builder->CreateLoad(new_word_slot, "cas_new_output");

  // (observed word, success) = cmpxchg(word_ptr, expected, desired)
  llvm::Value *cas_result = builder->CreateAtomicCmpXchg(
      word_ptr, expected_word, desired_word,
      llvm::AtomicOrdering::SequentiallyConsistent,
      llvm::AtomicOrdering::SequentiallyConsistent);

  // The word observed by the cmpxchg becomes the expected word for the next
  // attempt.
  llvm::Value *observed_word =
      builder->CreateExtractValue(cas_result, 0, "cas_old_output");
  builder->CreateStore(observed_word, old_word_slot);

  // Leave the loop on success, otherwise retry.
  llvm::Value *succeeded =
      builder->CreateExtractValue(cas_result, 1, "success");
  builder->CreateCondBr(succeeded, done_bb, retry_bb);

  builder->SetInsertPoint(done_bb);

  return output_address;
}
#endif

void visit(RangeForStmt *for_stmt) override {
create_naive_range_for(for_stmt);
}
Expand Down Expand Up @@ -475,11 +322,7 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
builder->SetInsertPoint(loop_test_bb);
auto cond = builder->CreateICmp(
llvm::CmpInst::Predicate::ICMP_SLT,
builder->CreateLoad(
#ifdef TI_LLVM_15
i32_ty,
#endif
loop_index),
builder->CreateLoad(i32_ty, loop_index),
llvm_val[stmt->owned_num_local.find(stmt->major_from_type)
->second]);
builder->CreateCondBr(cond, loop_body_bb, func_exit);
Expand All @@ -492,13 +335,10 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
auto &s = stmt->body->statements[i];
s->accept(this);
}
builder->CreateStore(builder->CreateAdd(builder->CreateLoad(
#ifdef TI_LLVM_15
i32_ty,
#endif
loop_index),
block_dim),
loop_index);
builder->CreateStore(
builder->CreateAdd(builder->CreateLoad(i32_ty, loop_index),
block_dim),
loop_index);
builder->CreateBr(loop_test_bb);
builder->SetInsertPoint(func_exit);
}
Expand Down
17 changes: 5 additions & 12 deletions taichi/codegen/dx12/codegen_dx12.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,7 @@ class TaskCodeGenLLVMDX12 : public TaskCodeGenLLVM {
builder->SetInsertPoint(loop_test_bb);
auto cond = builder->CreateICmp(
llvm::CmpInst::Predicate::ICMP_SLT,
builder->CreateLoad(
#ifdef TI_LLVM_15
i32_ty,
#endif
loop_index),
builder->CreateLoad(i32_ty, loop_index),
llvm_val[stmt->owned_num_local.find(stmt->major_from_type)
->second]);
builder->CreateCondBr(cond, loop_body_bb, func_exit);
Expand All @@ -108,13 +104,10 @@ class TaskCodeGenLLVMDX12 : public TaskCodeGenLLVM {
auto &s = stmt->body->statements[i];
s->accept(this);
}
builder->CreateStore(builder->CreateAdd(builder->CreateLoad(
#ifdef TI_LLVM_15
i32_ty,
#endif
loop_index),
block_dim),
loop_index);
builder->CreateStore(
builder->CreateAdd(builder->CreateLoad(i32_ty, loop_index),
block_dim),
loop_index);
builder->CreateBr(loop_test_bb);
builder->SetInsertPoint(func_exit);
}
Expand Down
Loading

0 comments on commit 50de9bc

Please sign in to comment.