Commit ab88e21

[gui] GGUI 7.5/n: Avoid requiring CUDA toolchains to compile GGUI (taichi-dev#2821)

* fix windows

* fix setup.py

* resolve convo

* format
AmesingFlank authored Aug 27, 2021
1 parent d71cee2 commit ab88e21
Showing 20 changed files with 672 additions and 542 deletions.
8 changes: 8 additions & 0 deletions CMakeLists.txt
@@ -101,6 +101,14 @@ foreach(arch IN LISTS HOST_ARCH CUDA_ARCH)
add_dependencies(${CORE_LIBRARY_NAME} "generate_llvm_runtime_${arch}")
endforeach()

add_custom_target(
"generate_ui_interop_kernels_cuda"
COMMAND ${CLANG_EXECUTABLE} -S ui_kernels.cpp -o "ui_kernels_cuda.ll" -fno-exceptions -emit-llvm -std=c++17 -D "ARCH_${arch}" -I ${PROJECT_SOURCE_DIR};
COMMAND ${LLVM_AS_EXECUTABLE} "ui_kernels_cuda.ll" -o "ui_kernels_cuda.bc"
WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/taichi/ui/backends/vulkan"
)
add_dependencies(${CORE_LIBRARY_NAME} "generate_ui_interop_kernels_cuda")

FILE(WRITE ${CMAKE_CURRENT_LIST_DIR}/taichi/common/version.h
"#pragma once\n"
"#define TI_VERSION_MAJOR \"${TI_VERSION_MAJOR}\"\n"
8 changes: 3 additions & 5 deletions cmake/TaichiCore.cmake
@@ -62,24 +62,22 @@ file(GLOB TAICHI_VULKAN_SOURCE "taichi/backends/vulkan/*.h" "taichi/backends/vul
file(GLOB TAICHI_GGUI_SOURCE
"taichi/ui/*.cpp" "taichi/ui/*/*.cpp" "taichi/ui/*/*/*.cpp" "taichi/ui/*/*/*/*.cpp" "taichi/ui/*/*/*/*/*.cpp"
"taichi/ui/*.h" "taichi/ui/*/*.h" "taichi/ui/*/*/*.h" "taichi/ui/*/*/*/*.h" "taichi/ui/*/*/*/*/*.h"
"taichi/ui/backends/vulkan/renderables/kernels.cu"
)
list(REMOVE_ITEM TAICHI_CORE_SOURCE ${TAICHI_GGUI_SOURCE})


if(TI_WITH_GGUI)
add_definitions(-DTI_WITH_GGUI)

enable_language(CUDA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -use_fast_math -std=c++17" )
list(APPEND TAICHI_CORE_SOURCE ${TAICHI_GGUI_SOURCE})

include_directories(SYSTEM external/glm)

endif()



# These files are compiled into .bc and loaded dynamically as LLVM modules.
# They should not be compiled into libtaichi, so they are removed here.
file(GLOB BYTECODE_SOURCE "taichi/runtime/llvm/runtime.cpp" "taichi/runtime/llvm/ui_kernels.cpp")
list(REMOVE_ITEM TAICHI_CORE_SOURCE ${BYTECODE_SOURCE})


# These are required, regardless of whether Vulkan is enabled or not
6 changes: 6 additions & 0 deletions setup.py
@@ -191,6 +191,12 @@ def prepare_package(self):
print(f"Fetching runtime file {f} to {target} folder")
shutil.copy(os.path.join(llvm_runtime_dir, f), target)

ui_kernel_dir = 'taichi/ui/backends/vulkan'
for f in os.listdir(ui_kernel_dir):
if f.endswith('.bc'):
print(f"Fetching ui kernel file {f} to {target} folder")
shutil.copy(os.path.join(ui_kernel_dir, f), target)


setup(name=project_name,
packages=packages,
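The packaged .bc files are what let GGUI ship without a CUDA toolchain: at runtime the prebuilt bitcode is loaded back into an LLVM module rather than compiled from .cu sources. A minimal sketch of such loading with the standard LLVM API (this helper is illustrative, not this commit's code):

#include <memory>
#include <string>

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"

// Illustrative helper: read a packaged ui_kernels_cuda.bc back into an
// llvm::Module. parseIRFile accepts both .ll and .bc inputs.
std::unique_ptr<llvm::Module> load_ui_kernels(llvm::LLVMContext &ctx,
                                              const std::string &path) {
  llvm::SMDiagnostic err;
  auto mod = llvm::parseIRFile(path, err, ctx);
  if (!mod)
    err.print("load_ui_kernels", llvm::errs());
  return mod;
}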
10 changes: 8 additions & 2 deletions taichi/backends/cuda/cuda_context.cpp
@@ -83,8 +83,14 @@ void CUDAContext::launch(void *func,
// Make sure there are not too many threads for the device.
// Note that the CUDA random number generator does not allow more than
// [saturating_grid_dim * max_block_dim] threads.
TI_ASSERT(grid_dim <= get_current_program().config.saturating_grid_dim);
TI_ASSERT(block_dim <= get_current_program().config.max_block_dim);

// These asserts are currently removed so that when GGUI calls CUDA kernels,
// the grid and block dims are not limited by the limits set by Program. With
// these limits, GGUI would have to use kernels with grid-strided loops, which
// is harmful to performance. A simple example of rendering a bunny can drop
// from 2000FPS to 1000FPS because of this.
// TI_ASSERT(grid_dim <= get_current_program().config.saturating_grid_dim);
// TI_ASSERT(block_dim <= get_current_program().config.max_block_dim);

if (grid_dim > 0) {
std::lock_guard<std::mutex> _(lock);
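For context, a hedged illustration of the trade-off the new comment describes (both kernels are hypothetical, not from this commit): with grid_dim capped, a kernel must grid-stride over the data; with the caps removed, GGUI can map one thread per element.

// With grid_dim capped by Program's config, each thread loops over
// multiple elements (a grid-stride loop):
__global__ void fill_strided(float *buf, int n) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    buf[i] = 1.0f;
  }
}

// With the asserts removed, the grid can be sized to the data, so each
// thread touches exactly one element and the per-thread loop disappears:
__global__ void fill_direct(float *buf, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    buf[i] = 1.0f;
}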
1 change: 1 addition & 0 deletions taichi/backends/cuda/cuda_driver_functions.inc.h
@@ -22,6 +22,7 @@ PER_CUDA_FUNCTION(stream_create, cuStreamCreate, void **, uint32);
// Memory management
PER_CUDA_FUNCTION(memcpy_host_to_device, cuMemcpyHtoD_v2, void *, void *, std::size_t);
PER_CUDA_FUNCTION(memcpy_device_to_host, cuMemcpyDtoH_v2, void *, void *, std::size_t);
PER_CUDA_FUNCTION(memcpy_device_to_device, cuMemcpyDtoD_v2, void *, void *, std::size_t);
PER_CUDA_FUNCTION(memcpy_host_to_device_async, cuMemcpyHtoDAsync_v2, void *, void *, std::size_t, void *);
PER_CUDA_FUNCTION(memcpy_device_to_host_async, cuMemcpyDtoHAsync_v2, void *, void *, std::size_t, void*);
PER_CUDA_FUNCTION(malloc, cuMemAlloc_v2, void **, std::size_t);
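The new memcpy_device_to_device wrapper gives GGUI a direct GPU-to-GPU copy path. A hedged sketch of how it might be called, following the CUDADriver::get_instance() call style visible in jit_cuda.cpp below (the helper and buffer names are illustrative):

#include <cstddef>

// Illustrative: copy CUDA simulation output into a Vulkan-interop buffer
// without a round trip through host memory.
void copy_device_buffer(void *dst, void *src, std::size_t bytes) {
  CUDADriver::get_instance().memcpy_device_to_device(dst, src, bytes);
}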
172 changes: 41 additions & 131 deletions taichi/backends/cuda/jit_cuda.cpp
@@ -1,139 +1,49 @@
#include <memory>

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"

#include "taichi/backends/cuda/cuda_context.h"
#include "taichi/backends/cuda/cuda_driver.h"
#include "taichi/jit/jit_session.h"
#include "taichi/lang_util.h"
#include "taichi/program/program.h"
#include "taichi/system/timer.h"
#include "taichi/util/file_sequence_writer.h"

#define TI_RUNTIME_HOST
#include "taichi/program/context.h"
#undef TI_RUNTIME_HOST
#include "taichi/backends/cuda/jit_cuda.h"

TLANG_NAMESPACE_BEGIN

#if defined(TI_WITH_CUDA)
class JITModuleCUDA : public JITModule {
private:
void *module;

public:
explicit JITModuleCUDA(void *module) : module(module) {
}

void *lookup_function(const std::string &name) override {
// TODO: figure out why using the guard leads to wrong test results
// auto context_guard = CUDAContext::get_instance().get_guard();
CUDAContext::get_instance().make_current();
void *func = nullptr;
auto t = Time::get_time();
auto err = CUDADriver::get_instance().module_get_function.call_with_warning(
&func, module, name.c_str());
if (err) {
TI_ERROR("Cannot look up function {}", name);
}
t = Time::get_time() - t;
TI_TRACE("CUDA module_get_function {} costs {} ms", name, t * 1000);
TI_ASSERT(func != nullptr);
return func;
}

void call(const std::string &name,
const std::vector<void *> &arg_pointers) override {
launch(name, 1, 1, 0, arg_pointers);
}

virtual void launch(const std::string &name,
std::size_t grid_dim,
std::size_t block_dim,
std::size_t shared_mem_bytes,
const std::vector<void *> &arg_pointers) override {
auto func = lookup_function(name);
CUDAContext::get_instance().launch(func, name, arg_pointers, grid_dim,
block_dim, shared_mem_bytes);
}

bool direct_dispatch() const override {
return false;
}
};

class JITSessionCUDA : public JITSession {
public:
llvm::DataLayout data_layout;

explicit JITSessionCUDA(llvm::DataLayout data_layout)
: data_layout(data_layout) {
}

virtual JITModule *add_module(std::unique_ptr<llvm::Module> M,
int max_reg) override {
auto ptx = compile_module_to_ptx(M);
if (get_current_program().config.print_kernel_nvptx) {
static FileSequenceWriter writer("taichi_kernel_nvptx_{:04d}.ptx",
"module NVPTX");
writer.write(ptx);
}
// TODO: figure out why using the guard leads to wrong test results
// auto context_guard = CUDAContext::get_instance().get_guard();
CUDAContext::get_instance().make_current();
// Create module for object
void *cuda_module;
TI_TRACE("PTX size: {:.2f}KB", ptx.size() / 1024.0);
auto t = Time::get_time();
TI_TRACE("Loading module...");
[[maybe_unused]] auto &&_ =
std::move(CUDAContext::get_instance().get_lock_guard());

constexpr int max_num_options = 8;
int num_options = 0;
uint32 options[max_num_options];
void *option_values[max_num_options];

// Insert options
if (max_reg != 0) {
options[num_options] = CU_JIT_MAX_REGISTERS;
option_values[num_options] = &max_reg;
num_options++;
}

TI_ASSERT(num_options <= max_num_options);

CUDADriver::get_instance().module_load_data_ex(
&cuda_module, ptx.c_str(), num_options, options, option_values);
TI_TRACE("CUDA module load time : {}ms", (Time::get_time() - t) * 1000);
// cudaModules.push_back(cudaModule);
modules.push_back(std::make_unique<JITModuleCUDA>(cuda_module));
return modules.back().get();
}

virtual llvm::DataLayout get_data_layout() override {
return data_layout;
}

static std::string compile_module_to_ptx(
std::unique_ptr<llvm::Module> &module);
};
JITModule *JITSessionCUDA::add_module(std::unique_ptr<llvm::Module> M,
                                      int max_reg) {
auto ptx = compile_module_to_ptx(M);
if (get_current_program().config.print_kernel_nvptx) {
static FileSequenceWriter writer("taichi_kernel_nvptx_{:04d}.ptx",
"module NVPTX");
writer.write(ptx);
}
// TODO: figure out why using the guard leads to wrong test results
// auto context_guard = CUDAContext::get_instance().get_guard();
CUDAContext::get_instance().make_current();
// Create module for object
void *cuda_module;
TI_TRACE("PTX size: {:.2f}KB", ptx.size() / 1024.0);
auto t = Time::get_time();
TI_TRACE("Loading module...");
[[maybe_unused]] auto &&_ =
std::move(CUDAContext::get_instance().get_lock_guard());

constexpr int max_num_options = 8;
int num_options = 0;
uint32 options[max_num_options];
void *option_values[max_num_options];

// Insert options
if (max_reg != 0) {
options[num_options] = CU_JIT_MAX_REGISTERS;
option_values[num_options] = &max_reg;
num_options++;
}

TI_ASSERT(num_options <= max_num_options);

CUDADriver::get_instance().module_load_data_ex(
&cuda_module, ptx.c_str(), num_options, options, option_values);
TI_TRACE("CUDA module load time : {}ms", (Time::get_time() - t) * 1000);
// cudaModules.push_back(cudaModule);
modules.push_back(std::make_unique<JITModuleCUDA>(cuda_module));
return modules.back().get();
}

std::string cuda_mattrs() {
return "+ptx63";
104 changes: 104 additions & 0 deletions taichi/backends/cuda/jit_cuda.h
@@ -0,0 +1,104 @@
#include <memory>

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"

#include "taichi/backends/cuda/cuda_context.h"
#include "taichi/backends/cuda/cuda_driver.h"
#include "taichi/jit/jit_session.h"
#include "taichi/lang_util.h"
#include "taichi/program/program.h"
#include "taichi/system/timer.h"
#include "taichi/util/file_sequence_writer.h"

#define TI_RUNTIME_HOST
#include "taichi/program/context.h"
#undef TI_RUNTIME_HOST

TLANG_NAMESPACE_BEGIN

#if defined(TI_WITH_CUDA)
class JITModuleCUDA : public JITModule {
private:
void *module;

public:
explicit JITModuleCUDA(void *module) : module(module) {
}

void *lookup_function(const std::string &name) override {
// TODO: figure out why using the guard leads to wrong test results
// auto context_guard = CUDAContext::get_instance().get_guard();
CUDAContext::get_instance().make_current();
void *func = nullptr;
auto t = Time::get_time();
auto err = CUDADriver::get_instance().module_get_function.call_with_warning(
&func, module, name.c_str());
if (err) {
TI_ERROR("Cannot look up function {}", name);
}
t = Time::get_time() - t;
TI_TRACE("CUDA module_get_function {} costs {} ms", name, t * 1000);
TI_ASSERT(func != nullptr);
return func;
}

void call(const std::string &name,
const std::vector<void *> &arg_pointers) override {
launch(name, 1, 1, 0, arg_pointers);
}

virtual void launch(const std::string &name,
std::size_t grid_dim,
std::size_t block_dim,
std::size_t shared_mem_bytes,
const std::vector<void *> &arg_pointers) override {
auto func = lookup_function(name);
CUDAContext::get_instance().launch(func, name, arg_pointers, grid_dim,
block_dim, shared_mem_bytes);
}

bool direct_dispatch() const override {
return false;
}
};

class JITSessionCUDA : public JITSession {
public:
llvm::DataLayout data_layout;

explicit JITSessionCUDA(llvm::DataLayout data_layout)
: data_layout(data_layout) {
}

virtual JITModule *add_module(std::unique_ptr<llvm::Module> M,
int max_reg) override;

virtual llvm::DataLayout get_data_layout() override {
return data_layout;
}

static std::string compile_module_to_ptx(
std::unique_ptr<llvm::Module> &module);
};

#endif

std::unique_ptr<JITSession> create_llvm_jit_session_cuda(Arch arch);

TLANG_NAMESPACE_END
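A hedged usage sketch of the API this header now exposes (the module, kernel name, and argument are placeholders; the surrounding glue is assumed):

// Illustrative only: compile an LLVM module to PTX, load it into the CUDA
// context, then launch a kernel by name with an uncapped grid.
void run_demo(std::unique_ptr<llvm::Module> mod, void *arg) {
  auto session = create_llvm_jit_session_cuda(Arch::cuda);
  JITModule *jm = session->add_module(std::move(mod), /*max_reg=*/0);
  jm->launch("demo_kernel", /*grid_dim=*/1024, /*block_dim=*/128,
             /*shared_mem_bytes=*/0, std::vector<void *>{&arg});
}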
26 changes: 26 additions & 0 deletions taichi/inc/cuda_kernel_utils.inc.h
@@ -0,0 +1,26 @@
extern "C" {

int thread_idx() {
return 0;
}

int warp_size() {
return 32;
}

int warp_idx() {
return thread_idx() % warp_size();
}

int block_idx() {
return 0;
}

int block_dim() {
return 0;
}

int grid_dim() {
return 0;
}
}
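These host-side stubs let clang compile the UI interop kernels to LLVM bitcode without any CUDA toolchain; presumably the calls are remapped to the real CUDA intrinsics when the bitcode is JIT-loaded, as with taichi/runtime/llvm/runtime.cpp. A hedged sketch of a kernel written against them (illustrative, not part of this commit):

extern "C" void copy_colors(float *dst, float *src, int n) {
  // A grid-stride loop over the helpers above. Under the plain-C++ stub
  // values this only needs to produce valid bitcode; the CUDA backend
  // supplies the real thread/block indices at runtime.
  for (int i = block_idx() * block_dim() + thread_idx(); i < n;
       i += grid_dim() * block_dim()) {
    dst[i] = src[i];
  }
}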