Commit ab88e21

[gui] GGUI 7.5/n: Avoid requiring CUDA toolchains to compile GGUI (taichi-dev#2821)

* fix windows

* fix setup.py

* resolve convo

* format
AmesingFlank authored Aug 27, 2021
1 parent d71cee2 commit ab88e21
Showing 20 changed files with 672 additions and 542 deletions.
8 changes: 8 additions & 0 deletions CMakeLists.txt
@@ -101,6 +101,14 @@ foreach(arch IN LISTS HOST_ARCH CUDA_ARCH)
add_dependencies(${CORE_LIBRARY_NAME} "generate_llvm_runtime_${arch}")
endforeach()

add_custom_target(
"generate_ui_interop_kernels_cuda"
COMMAND ${CLANG_EXECUTABLE} -S ui_kernels.cpp -o "ui_kernels_cuda.ll" -fno-exceptions -emit-llvm -std=c++17 -D "ARCH_${arch}" -I ${PROJECT_SOURCE_DIR};
COMMAND ${LLVM_AS_EXECUTABLE} "ui_kernels_cuda.ll" -o "ui_kernels_cuda.bc"
WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/taichi/ui/backends/vulkan"
)
add_dependencies(${CORE_LIBRARY_NAME} "generate_ui_interop_kernels_cuda")

FILE(WRITE ${CMAKE_CURRENT_LIST_DIR}/taichi/common/version.h
"#pragma once\n"
"#define TI_VERSION_MAJOR \"${TI_VERSION_MAJOR}\"\n"
8 changes: 3 additions & 5 deletions cmake/TaichiCore.cmake
@@ -62,24 +62,22 @@ file(GLOB TAICHI_VULKAN_SOURCE "taichi/backends/vulkan/*.h" "taichi/backends/vul
file(GLOB TAICHI_GGUI_SOURCE
"taichi/ui/*.cpp" "taichi/ui/*/*.cpp" "taichi/ui/*/*/*.cpp" "taichi/ui/*/*/*/*.cpp" "taichi/ui/*/*/*/*/*.cpp"
"taichi/ui/*.h" "taichi/ui/*/*.h" "taichi/ui/*/*/*.h" "taichi/ui/*/*/*/*.h" "taichi/ui/*/*/*/*/*.h"
"taichi/ui/backends/vulkan/renderables/kernels.cu"
)
list(REMOVE_ITEM TAICHI_CORE_SOURCE ${TAICHI_GGUI_SOURCE})


if(TI_WITH_GGUI)
add_definitions(-DTI_WITH_GGUI)

enable_language(CUDA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -use_fast_math -std=c++17" )
list(APPEND TAICHI_CORE_SOURCE ${TAICHI_GGUI_SOURCE})

include_directories(SYSTEM external/glm)

endif()



# These files are compiled into .bc and loaded dynamically as LLVM modules.
# They should not be compiled into libtaichi, so they are removed here.
file(GLOB BYTECODE_SOURCE "taichi/runtime/llvm/runtime.cpp" "taichi/runtime/llvm/ui_kernels.cpp")
list(REMOVE_ITEM TAICHI_CORE_SOURCE ${BYTECODE_SOURCE})


# These are required, regardless of whether Vulkan is enabled or not
6 changes: 6 additions & 0 deletions setup.py
@@ -191,6 +191,12 @@ def prepare_package(self):
print(f"Fetching runtime file {f} to {target} folder")
shutil.copy(os.path.join(llvm_runtime_dir, f), target)

ui_kernel_dir = 'taichi/ui/backends/vulkan'
for f in os.listdir(ui_kernel_dir):
if f.endswith('.bc'):
print(f"Fetching ui kernel file {f} to {target} folder")
shutil.copy(os.path.join(ui_kernel_dir, f), target)


setup(name=project_name,
packages=packages,
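The packaged .bc files are what let GGUI ship without a CUDA toolchain: at runtime the prebuilt bitcode is loaded back into an LLVM module rather than compiled from .cu sources. A minimal sketch of such loading with the standard LLVM API (this helper is illustrative, not this commit's code):

#include <memory>
#include <string>

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"

// Illustrative helper: read a packaged ui_kernels_cuda.bc back into an
// llvm::Module. parseIRFile accepts both .ll and .bc inputs.
std::unique_ptr<llvm::Module> load_ui_kernels(llvm::LLVMContext &ctx,
                                              const std::string &path) {
  llvm::SMDiagnostic err;
  auto mod = llvm::parseIRFile(path, err, ctx);
  if (!mod)
    err.print("load_ui_kernels", llvm::errs());
  return mod;
}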
10 changes: 8 additions & 2 deletions taichi/backends/cuda/cuda_context.cpp
@@ -83,8 +83,14 @@ void CUDAContext::launch(void *func,
// Make sure there are not too many threads for the device.
// Note that the CUDA random number generator does not allow more than
// [saturating_grid_dim * max_block_dim] threads.
TI_ASSERT(grid_dim <= get_current_program().config.saturating_grid_dim);
TI_ASSERT(block_dim <= get_current_program().config.max_block_dim);

// These asserts are currently removed so that when GGUI calls CUDA kernels,
// the grid and block dims are not limited by the limits set by Program. With
// these limits, GGUI would have to use kernels with grid-strided loops, which
// is harmful to performance. A simple example of rendering a bunny can drop
// from 2000FPS to 1000FPS because of this.
// TI_ASSERT(grid_dim <= get_current_program().config.saturating_grid_dim);
// TI_ASSERT(block_dim <= get_current_program().config.max_block_dim);

if (grid_dim > 0) {
std::lock_guard<std::mutex> _(lock);
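For context, a hedged illustration of the trade-off the new comment describes (both kernels are hypothetical, not from this commit): with grid_dim capped, a kernel must grid-stride over the data; with the caps removed, GGUI can map one thread per element.

// With grid_dim capped by Program's config, each thread loops over
// multiple elements (a grid-stride loop):
__global__ void fill_strided(float *buf, int n) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    buf[i] = 1.0f;
  }
}

// With the asserts removed, the grid can be sized to the data, so each
// thread touches exactly one element and the per-thread loop disappears:
__global__ void fill_direct(float *buf, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    buf[i] = 1.0f;
}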
1 change: 1 addition & 0 deletions taichi/backends/cuda/cuda_driver_functions.inc.h
@@ -22,6 +22,7 @@ PER_CUDA_FUNCTION(stream_create, cuStreamCreate, void **, uint32);
// Memory management
PER_CUDA_FUNCTION(memcpy_host_to_device, cuMemcpyHtoD_v2, void *, void *, std::size_t);
PER_CUDA_FUNCTION(memcpy_device_to_host, cuMemcpyDtoH_v2, void *, void *, std::size_t);
PER_CUDA_FUNCTION(memcpy_device_to_device, cuMemcpyDtoD_v2, void *, void *, std::size_t);
PER_CUDA_FUNCTION(memcpy_host_to_device_async, cuMemcpyHtoDAsync_v2, void *, void *, std::size_t, void *);
PER_CUDA_FUNCTION(memcpy_device_to_host_async, cuMemcpyDtoHAsync_v2, void *, void *, std::size_t, void*);
PER_CUDA_FUNCTION(malloc, cuMemAlloc_v2, void **, std::size_t);
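The new memcpy_device_to_device wrapper gives GGUI a direct GPU-to-GPU copy path. A hedged sketch of how it might be called, following the CUDADriver::get_instance() call style visible in jit_cuda.cpp below (the helper and buffer names are illustrative):

#include <cstddef>

// Illustrative: copy CUDA simulation output into a Vulkan-interop buffer
// without a round trip through host memory.
void copy_device_buffer(void *dst, void *src, std::size_t bytes) {
  CUDADriver::get_instance().memcpy_device_to_device(dst, src, bytes);
}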
172 changes: 41 additions & 131 deletions taichi/backends/cuda/jit_cuda.cpp
@@ -1,139 +1,49 @@
#include <memory>

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"

#include "taichi/backends/cuda/cuda_context.h"
#include "taichi/backends/cuda/cuda_driver.h"
#include "taichi/jit/jit_session.h"
#include "taichi/lang_util.h"
#include "taichi/program/program.h"
#include "taichi/system/timer.h"
#include "taichi/util/file_sequence_writer.h"

#define TI_RUNTIME_HOST
#include "taichi/program/context.h"
#undef TI_RUNTIME_HOST
#include "taichi/backends/cuda/jit_cuda.h"

TLANG_NAMESPACE_BEGIN

#if defined(TI_WITH_CUDA)
class JITModuleCUDA : public JITModule {
private:
void *module;

public:
explicit JITModuleCUDA(void *module) : module(module) {
}

void *lookup_function(const std::string &name) override {
// TODO: figure out why using the guard leads to wrong test results
// auto context_guard = CUDAContext::get_instance().get_guard();
CUDAContext::get_instance().make_current();
void *func = nullptr;
auto t = Time::get_time();
auto err = CUDADriver::get_instance().module_get_function.call_with_warning(
&func, module, name.c_str());
if (err) {
TI_ERROR("Cannot look up function {}", name);
}
t = Time::get_time() - t;
TI_TRACE("CUDA module_get_function {} costs {} ms", name, t * 1000);
TI_ASSERT(func != nullptr);
return func;
}

void call(const std::string &name,
const std::vector<void *> &arg_pointers) override {
launch(name, 1, 1, 0, arg_pointers);
}

virtual void launch(const std::string &name,
std::size_t grid_dim,
std::size_t block_dim,
std::size_t shared_mem_bytes,
const std::vector<void *> &arg_pointers) override {
auto func = lookup_function(name);
CUDAContext::get_instance().launch(func, name, arg_pointers, grid_dim,
block_dim, shared_mem_bytes);
}

bool direct_dispatch() const override {
return false;
}
};

class JITSessionCUDA : public JITSession {
public:
llvm::DataLayout data_layout;

explicit JITSessionCUDA(llvm::DataLayout data_layout)
: data_layout(data_layout) {
}

virtual JITModule *add_module(std::unique_ptr<llvm::Module> M,
int max_reg) override {
auto ptx = compile_module_to_ptx(M);
if (get_current_program().config.print_kernel_nvptx) {
static FileSequenceWriter writer("taichi_kernel_nvptx_{:04d}.ptx",
"module NVPTX");
writer.write(ptx);
}
// TODO: figure out why using the guard leads to wrong test results
// auto context_guard = CUDAContext::get_instance().get_guard();
CUDAContext::get_instance().make_current();
// Create module for object
void *cuda_module;
TI_TRACE("PTX size: {:.2f}KB", ptx.size() / 1024.0);
auto t = Time::get_time();
TI_TRACE("Loading module...");
[[maybe_unused]] auto &&_ =
std::move(CUDAContext::get_instance().get_lock_guard());

constexpr int max_num_options = 8;
int num_options = 0;
uint32 options[max_num_options];
void *option_values[max_num_options];

// Insert options
if (max_reg != 0) {
options[num_options] = CU_JIT_MAX_REGISTERS;
option_values[num_options] = &max_reg;
num_options++;
}

TI_ASSERT(num_options <= max_num_options);

CUDADriver::get_instance().module_load_data_ex(
&cuda_module, ptx.c_str(), num_options, options, option_values);
TI_TRACE("CUDA module load time : {}ms", (Time::get_time() - t) * 1000);
// cudaModules.push_back(cudaModule);
modules.push_back(std::make_unique<JITModuleCUDA>(cuda_module));
return modules.back().get();
}

virtual llvm::DataLayout get_data_layout() override {
return data_layout;
}

static std::string compile_module_to_ptx(
std::unique_ptr<llvm::Module> &module);
};
JITModule *JITSessionCUDA::add_module(std::unique_ptr<llvm::Module> M,
                                      int max_reg) {
auto ptx = compile_module_to_ptx(M);
if (get_current_program().config.print_kernel_nvptx) {
static FileSequenceWriter writer("taichi_kernel_nvptx_{:04d}.ptx",
"module NVPTX");
writer.write(ptx);
}
// TODO: figure out why using the guard leads to wrong test results
// auto context_guard = CUDAContext::get_instance().get_guard();
CUDAContext::get_instance().make_current();
// Create module for object
void *cuda_module;
TI_TRACE("PTX size: {:.2f}KB", ptx.size() / 1024.0);
auto t = Time::get_time();
TI_TRACE("Loading module...");
[[maybe_unused]] auto &&_ =
std::move(CUDAContext::get_instance().get_lock_guard());

constexpr int max_num_options = 8;
int num_options = 0;
uint32 options[max_num_options];
void *option_values[max_num_options];

// Insert options
if (max_reg != 0) {
options[num_options] = CU_JIT_MAX_REGISTERS;
option_values[num_options] = &max_reg;
num_options++;
}

TI_ASSERT(num_options <= max_num_options);

CUDADriver::get_instance().module_load_data_ex(
&cuda_module, ptx.c_str(), num_options, options, option_values);
TI_TRACE("CUDA module load time : {}ms", (Time::get_time() - t) * 1000);
// cudaModules.push_back(cudaModule);
modules.push_back(std::make_unique<JITModuleCUDA>(cuda_module));
return modules.back().get();
}

std::string cuda_mattrs() {
return "+ptx63";
104 changes: 104 additions & 0 deletions taichi/backends/cuda/jit_cuda.h
@@ -0,0 +1,104 @@
#include <memory>

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"

#include "taichi/backends/cuda/cuda_context.h"
#include "taichi/backends/cuda/cuda_driver.h"
#include "taichi/jit/jit_session.h"
#include "taichi/lang_util.h"
#include "taichi/program/program.h"
#include "taichi/system/timer.h"
#include "taichi/util/file_sequence_writer.h"

#define TI_RUNTIME_HOST
#include "taichi/program/context.h"
#undef TI_RUNTIME_HOST

TLANG_NAMESPACE_BEGIN

#if defined(TI_WITH_CUDA)
class JITModuleCUDA : public JITModule {
private:
void *module;

public:
explicit JITModuleCUDA(void *module) : module(module) {
}

void *lookup_function(const std::string &name) override {
// TODO: figure out why using the guard leads to wrong test results
// auto context_guard = CUDAContext::get_instance().get_guard();
CUDAContext::get_instance().make_current();
void *func = nullptr;
auto t = Time::get_time();
auto err = CUDADriver::get_instance().module_get_function.call_with_warning(
&func, module, name.c_str());
if (err) {
TI_ERROR("Cannot look up function {}", name);
}
t = Time::get_time() - t;
TI_TRACE("CUDA module_get_function {} costs {} ms", name, t * 1000);
TI_ASSERT(func != nullptr);
return func;
}

void call(const std::string &name,
const std::vector<void *> &arg_pointers) override {
launch(name, 1, 1, 0, arg_pointers);
}

virtual void launch(const std::string &name,
std::size_t grid_dim,
std::size_t block_dim,
std::size_t shared_mem_bytes,
const std::vector<void *> &arg_pointers) override {
auto func = lookup_function(name);
CUDAContext::get_instance().launch(func, name, arg_pointers, grid_dim,
block_dim, shared_mem_bytes);
}

bool direct_dispatch() const override {
return false;
}
};

class JITSessionCUDA : public JITSession {
public:
llvm::DataLayout data_layout;

explicit JITSessionCUDA(llvm::DataLayout data_layout)
: data_layout(data_layout) {
}

virtual JITModule *add_module(std::unique_ptr<llvm::Module> M,
int max_reg) override;

virtual llvm::DataLayout get_data_layout() override {
return data_layout;
}

static std::string compile_module_to_ptx(
std::unique_ptr<llvm::Module> &module);
};

#endif

std::unique_ptr<JITSession> create_llvm_jit_session_cuda(Arch arch);

TLANG_NAMESPACE_END
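A hedged usage sketch of the API this header now exposes (the module, kernel name, and argument are placeholders; the surrounding glue is assumed):

// Illustrative only: compile an LLVM module to PTX, load it into the CUDA
// context, then launch a kernel by name with an uncapped grid.
void run_demo(std::unique_ptr<llvm::Module> mod, void *arg) {
  auto session = create_llvm_jit_session_cuda(Arch::cuda);
  JITModule *jm = session->add_module(std::move(mod), /*max_reg=*/0);
  jm->launch("demo_kernel", /*grid_dim=*/1024, /*block_dim=*/128,
             /*shared_mem_bytes=*/0, std::vector<void *>{&arg});
}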
26 changes: 26 additions & 0 deletions taichi/inc/cuda_kernel_utils.inc.h
@@ -0,0 +1,26 @@
extern "C" {

int thread_idx() {
return 0;
}

int warp_size() {
return 32;
}

int warp_idx() {
return thread_idx() % warp_size();
}

int block_idx() {
return 0;
}

int block_dim() {
return 0;
}

int grid_dim() {
return 0;
}
}
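These host-side stubs let clang compile the UI interop kernels to LLVM bitcode without any CUDA toolchain; presumably the calls are remapped to the real CUDA intrinsics when the bitcode is JIT-loaded, as with taichi/runtime/llvm/runtime.cpp. A hedged sketch of a kernel written against them (illustrative, not part of this commit):

extern "C" void copy_colors(float *dst, float *src, int n) {
  // A grid-stride loop over the helpers above. Under the plain-C++ stub
  // values this only needs to produce valid bitcode; the CUDA backend
  // supplies the real thread/block indices at runtime.
  for (int i = block_idx() * block_dim() + thread_idx(); i < n;
       i += grid_dim() * block_dim()) {
    dst[i] = src[i];
  }
}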