Skip to content

Commit

Permalink
[Performance] Use allocator from PyTorch if possible (dmlc#2328)
Browse files Browse the repository at this point in the history
* first commit

* some thoughts

* move around

* more commit

* more fixes

* now it uses torch allocator

* fix symbol export error

* fix

* fixes

* test fix

* add script

* building separate library per version

* fix for vs2019

* more fixes

* fix on windows build

* update jenkinsfile

* auto copy built dlls for windows

* lint and installation guide update

* fix

* specify conda environment

* set environment for ci

* fix

* fix

* fix

* fix again

* revert

* fix cmake

* fix

* switch to using python interpreter path

* remove scripts

* debug

* oops sorry

* Update index.rst

* Update index.rst

* copies automatically, no need for this

* do not print message if library not found

* tiny fixes

* debug on nightly

* replace add_compile_definitions to make CMake 3.5 happy

* fix linking to wrong lib for multiple pytorch envs

* changed building strategy

* fix nightly

* fix windows

* fix windows again

* setup bugfix

* address comments

* change README
  • Loading branch information
BarclayII authored Dec 25, 2020
1 parent 4444a43 commit 9a7235f
Show file tree
Hide file tree
Showing 26 changed files with 616 additions and 58 deletions.
49 changes: 47 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,13 @@ include_directories("third_party/minigun/minigun")
include_directories("third_party/minigun/third_party/moderngpu/src")
include_directories("third_party/phmap/")
include_directories("third_party/xbyak/")
include_directories("tensoradapter/include")

# initial variables
set(DGL_LINKER_LIBS "")
if(NOT MSVC)
  # Use CMAKE_DL_LIBS instead of hardcoding "dl": it expands to the platform's
  # dlopen library ("dl" on Linux) and stays empty on platforms where dlopen
  # lives in libc (e.g. FreeBSD), so the link line is portable.
  set(DGL_LINKER_LIBS ${CMAKE_DL_LIBS})
endif(NOT MSVC)

if(MSVC OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")
set(DGL_RUNTIME_LINKER_LIBS "")
else(MSVC OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")
Expand Down Expand Up @@ -110,7 +114,8 @@ if(USE_OPENMP)
endif(USE_OPENMP)

if(USE_AVX)
  # Deliberately appended to the global flag strings rather than set with
  # add_compile_definitions (which requires CMake >= 3.12) or per-target
  # commands: CUDA.cmake later rebuilds the NVCC host flags from
  # CMAKE_CXX_FLAGS, so USE_AVX must live in these variables to also reach
  # the host-side compilation of CUDA sources.
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_AVX")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_AVX")
endif(USE_AVX)

# To compile METIS correct for DGL.
Expand Down Expand Up @@ -183,6 +188,46 @@ if (LIBCXX_ENABLE_PARALLEL_ALGORITHMS)
endif(LIBCXX_ENABLE_PARALLEL_ALGORITHMS)

target_link_libraries(dgl ${DGL_LINKER_LIBS} ${DGL_RUNTIME_LINKER_LIBS})
if(MSVC)
  # Copy the freshly built DLL up into the build directory root so downstream
  # steps find it in a config-independent location. Use CMake's portable copy
  # with a generator expression instead of "cmd.exe /c COPY Release\dgl.dll .",
  # which hardcodes the Release configuration and breaks for Debug or
  # RelWithDebInfo builds.
  add_custom_command(
    TARGET dgl POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy_if_different
            $<TARGET_FILE:dgl> ${CMAKE_CURRENT_BINARY_DIR}
    VERBATIM)
endif(MSVC)

# Tensor adapter libraries
# Linking against LibTorch involves linking against a bunch of other libraries
# returned by PyTorch's CMake (e.g. C10 or NVTools). Because CMake caches
# the libraries found by find_library(), it will often pick up libraries from
# the wrong version when everything is built in the same CMake process. As
# a result, we launch a separate CMake build for every PyTorch version.
if(BUILD_TORCH)
  # Native-path versions of the build directory and the cmake executable,
  # handed to the per-PyTorch-version build scripts via environment variables.
  file(TO_NATIVE_PATH ${CMAKE_CURRENT_BINARY_DIR} BINDIR)
  file(TO_NATIVE_PATH ${CMAKE_COMMAND} CMAKE_CMD)
  if(MSVC)
    file(TO_NATIVE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/tensoradapter/pytorch/build.bat BUILD_SCRIPT)
    add_custom_target(
      tensoradapter_pytorch
      ${CMAKE_COMMAND} -E env
      CMAKE_COMMAND=${CMAKE_CMD}
      CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
      BINDIR=${BINDIR}
      cmd /e:on /c ${BUILD_SCRIPT} ${TORCH_PYTHON_INTERPS}
      DEPENDS ${BUILD_SCRIPT}
      # CMAKE_CURRENT_SOURCE_DIR (not CMAKE_SOURCE_DIR) so this also works when
      # DGL is included as a subproject of a larger build.
      WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/tensoradapter/pytorch)
  else(MSVC)
    file(TO_NATIVE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/tensoradapter/pytorch/build.sh BUILD_SCRIPT)
    add_custom_target(
      tensoradapter_pytorch
      ${CMAKE_COMMAND} -E env
      CMAKE_COMMAND=${CMAKE_CMD}
      CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
      # Reuse the native-path BINDIR computed above, mirroring the MSVC branch
      # (on POSIX the native path is identical to CMAKE_CURRENT_BINARY_DIR).
      BINDIR=${BINDIR}
      bash ${BUILD_SCRIPT} ${TORCH_PYTHON_INTERPS}
      DEPENDS ${BUILD_SCRIPT}
      WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/tensoradapter/pytorch)
  endif(MSVC)
  # Ordering-only dependency: dgl does not link the adapter; it loads it at
  # runtime, so add_dependencies (not target_link_libraries) is correct here.
  add_dependencies(dgl tensoradapter_pytorch)
endif(BUILD_TORCH)

# Installation rules
install(TARGETS dgl DESTINATION lib${LIB_SUFFIX})
Expand Down
4 changes: 2 additions & 2 deletions Jenkinsfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/usr/bin/env groovy

dgl_linux_libs = "build/libdgl.so, build/runUnitTests, python/dgl/_ffi/_cy3/core.cpython-36m-x86_64-linux-gnu.so"
dgl_linux_libs = "build/libdgl.so, build/runUnitTests, python/dgl/_ffi/_cy3/core.cpython-36m-x86_64-linux-gnu.so, build/tensoradapter/pytorch/*.so"
// Currently DGL on Windows is not working with Cython yet
dgl_win64_libs = "build\\dgl.dll, build\\runUnitTests.exe"
dgl_win64_libs = "build\\dgl.dll, build\\runUnitTests.exe, build\\tensoradapter\\pytorch\\*.dll"

def init_git() {
sh "rm -rf *"
Expand Down
4 changes: 0 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -288,9 +288,7 @@ Right now, DGL works on [PyTorch](https://pytorch.org) 1.5.0+, [MXNet](https://m

```
conda install -c dglteam dgl # cpu version
conda install -c dglteam dgl-cuda9.0 # CUDA 9.0
conda install -c dglteam dgl-cuda9.2 # CUDA 9.2
conda install -c dglteam dgl-cuda10.0 # CUDA 10.0
conda install -c dglteam dgl-cuda10.1 # CUDA 10.1
conda install -c dglteam dgl-cuda10.2 # CUDA 10.2
conda install -c dglteam dgl-cuda11.0 # CUDA 11.0
Expand All @@ -302,9 +300,7 @@ conda install -c dglteam dgl-cuda11.0 # CUDA 11.0
| | Latest Nightly Build Version | Stable Version |
|-----------|-------------------------------|-------------------------|
| CPU | `pip install --pre dgl` | `pip install dgl` |
| CUDA 9.0 | `pip install --pre dgl-cu90` | `pip install dgl-cu90` |
| CUDA 9.2 | `pip install --pre dgl-cu92` | `pip install dgl-cu92` |
| CUDA 10.0 | `pip install --pre dgl-cu100` | `pip install dgl-cu100` |
| CUDA 10.1 | `pip install --pre dgl-cu101` | `pip install dgl-cu101` |
| CUDA 10.2 | `pip install --pre dgl-cu102` | `pip install dgl-cu102` |
| CUDA 11.0 | `pip install --pre dgl-cu110` | `pip install dgl-cu110` |
Expand Down
3 changes: 3 additions & 0 deletions cmake/config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,6 @@ set(USE_OPENMP ON)

# Whether to enable Intel's avx optimized kernel
set(USE_AVX ON)

# Whether to build PyTorch plugins
set(BUILD_TORCH ON)
5 changes: 4 additions & 1 deletion cmake/modules/CUDA.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,9 @@ macro(dgl_config_cuda out_variable)
# 0. Add host flags
message(STATUS "${CMAKE_CXX_FLAGS}")
string(REGEX REPLACE "[ \t\n\r]" "," CXX_HOST_FLAGS "${CMAKE_CXX_FLAGS}")
if(MSVC AND NOT USE_MSVC_MT)
string(CONCAT CXX_HOST_FLAGS ${CXX_HOST_FLAGS} ",/MD")
endif()
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler ,${CXX_HOST_FLAGS}")

# 1. Add arch flags
Expand All @@ -260,7 +263,7 @@ macro(dgl_config_cuda out_variable)
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14)
string(REPLACE "-std=c++11" "" CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
list(APPEND CUDA_NVCC_FLAGS "--std=c++14")
list(APPEND CUDA_NVCC_FLAGS "-std=c++14")

message(STATUS "CUDA flags: ${CUDA_NVCC_FLAGS}")

Expand Down
52 changes: 25 additions & 27 deletions docs/source/install/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ DGL works with the following operating systems:
* macOS X
* Windows 10

DGL requires Python version 3.6 or later.
DGL requires Python version 3.6, 3.7, 3.8 or 3.9.

DGL supports multiple tensor libraries as backends, e.g., PyTorch, MXNet. For requirements on backends and how to select one, see :ref:`backends`.

Expand Down Expand Up @@ -121,34 +121,32 @@ install the Python binding for DGL.
Windows
```````

The Windows source build is tested with CMake and MinGW/GCC. We highly recommend
using CMake and GCC from `conda installations <https://conda.io/miniconda.html>`_. To
get started, run the following:

.. code:: bash
conda install cmake m2w64-gcc m2w64-make
Build the shared library and install the Python binding.
You can build DGL with MSBuild. With `MS Build Tools <https://go.microsoft.com/fwlink/?linkid=840931>`_
and `CMake on Windows <https://cmake.org/download/>`_ installed, run the following
in VS2019 x64 Native tools command prompt.

.. code::
- CPU only build
.. code::
md build
cd build
cmake -DCMAKE_CXX_FLAGS="-DDMLC_LOG_STACK_TRACE=0 -DDGL_EXPORTS" -DCMAKE_MAKE_PROGRAM=mingw32-make .. -G "MSYS Makefiles"
mingw32-make
cd ..\python
python setup.py install
MD build
CD build
cmake -DCMAKE_CXX_FLAGS="/DDGL_EXPORTS" -DCMAKE_CONFIGURATION_TYPES="Release" -DDMLC_FORCE_SHARED_CRT=ON .. -G "Visual Studio 16 2019"
msbuild dgl.sln /m
CD ..\python
python setup.py install
- CUDA build
.. code::
You can also build DGL with MSBuild. With `MS Build Tools <https://go.microsoft.com/fwlink/?linkid=840931>`_
and `CMake on Windows <https://cmake.org/download/>`_ installed, run the following
in VS2017 x64 Native tools command prompt.
MD build
CD build
cmake -DCMAKE_CXX_FLAGS="/DDGL_EXPORTS" -DCMAKE_CONFIGURATION_TYPES="Release" -DDMLC_FORCE_SHARED_CRT=ON -DUSE_CUDA=ON .. -G "Visual Studio 16 2019"
msbuild dgl.sln /m
CD ..\python
python setup.py install
.. code::
Optional Flags
``````````````

MD build
CD build
cmake -DCMAKE_CXX_FLAGS="/DDGL_EXPORTS" -DCMAKE_CONFIGURATION_TYPES="Release" .. -G "Visual Studio 15 2017 Win64"
msbuild dgl.sln
cd ..\python
python setup.py install
- If you are using PyTorch, you can add ``-DBUILD_TORCH=ON`` flag in CMake
to build PyTorch plugins for further performance optimization. This applies for Linux,
Windows, and Mac.
5 changes: 5 additions & 0 deletions include/dgl/runtime/c_runtime_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,11 @@ DGL_DLL int DGLStreamStreamSynchronize(int device_type,
DGLStreamHandle src,
DGLStreamHandle dst);

/*!
 * \brief Sets the path to the tensoradapter library.
 * \param path_cstr NUL-terminated path string pointing at the tensoradapter
 *        shared library to load.
 */
DGL_DLL void DGLSetTAPath(const char *path_cstr);

/*!
* \brief Bug report macro.
*
Expand Down
24 changes: 24 additions & 0 deletions include/dgl/runtime/env.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/*!
 * Copyright (c) 2017 by Contributors
 * \file dgl/runtime/env.h
 * \brief Structure for holding DGL global environment variables
 */

#ifndef DGL_RUNTIME_ENV_H_
#define DGL_RUNTIME_ENV_H_

#include <string>

/*!
 * \brief Global environment variables.
 *
 * Process-wide mutable state; access it through Env::Global().
 */
struct Env {
  /*! \brief Returns the process-wide singleton instance (function-local static). */
  static Env* Global() {
    static Env inst;
    return &inst;
  }
  /*! \brief the path to the tensoradapter library */
  std::string ta_path;
};

#endif  // DGL_RUNTIME_ENV_H_
114 changes: 114 additions & 0 deletions include/dgl/runtime/tensordispatch.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/*!
* Copyright (c) 2020 by Contributors
* \file array/tensordispatch.h
* \brief This file defines the dispatcher of tensor operators to framework-specific
* implementations.
*
* The dispatcher consists of a TensorDispatcher singleton in DGL C library and
* one separately-built shared library per supported backend.
*
* Those shared libraries contain wrappers of the framework-specific operators.
* The wrappers have almost the same signatures as functions in aten namespace,
* except that they accept and return DLManagedTensors instead of NDArrays.
* The wrappers are defined with extern "C", meaning that the C++ compiler will
* not do name mangling for those functions so that DGL can conveniently locate
* them using dlsym(3) (or GetProcAddress in Windows).
*
* The TensorDispatcher singleton maintains a mapping from an array operator to
* the address of the corresponding symbol in the shared library. During
* initialization, the TensorDispatcher checks which backend DGL is using.
* It then locates and opens the corresponding shared library using dlopen(3) (or
* LoadLibrary in Windows), and populates the said mapping above with dlsym(3)
* (or GetProcAddress in Windows).
*
* A tensor operator in TensorDispatcher first checks whether the corresponding symbol
* address is found in the mapping. If so, it calls the function located at the
* symbol address instead, translating NDArrays to DLManagedTensors using
* NDArray::ToDLPack(), and translates the DLManagedTensors in the return values
* back to NDArrays using NDArray::FromDLPack(). If not, it falls back to the
* implementation in dgl::aten namespace.
*/

#ifndef DGL_RUNTIME_TENSORDISPATCH_H_
#define DGL_RUNTIME_TENSORDISPATCH_H_

#include <dlpack/dlpack.h>
#include <tensoradapter.h>
#if defined(WIN32) || defined(_WIN32)
#include <windows.h>
#endif // WIN32
#include <vector>
#include "ndarray.h"

/*! \brief Casts a pointer \c entry to a function pointer with signature of \c func */
#define FUNCCAST(func, entry) (*reinterpret_cast<decltype(&(func))>(entry))

namespace dgl {
namespace runtime {

/*!
 * \brief Dispatcher that delegates the function calls to framework-specific C++ APIs.
 *
 * Process-wide singleton; obtain it through TensorDispatcher::Global().
 */
class TensorDispatcher {
 public:
  /*! \brief Get the singleton instance. */
  static TensorDispatcher* Global() {
    static TensorDispatcher inst;
    return &inst;
  }

  /*! \brief Whether an adapter library is available */
  inline bool IsAvailable() {
    return available_;
  }

  /*!
   * \brief Allocate an empty tensor.
   *
   * Used in NDArray::Empty().
   *
   * \param shape Dimensions of the tensor to allocate.
   * \param dtype Element data type.
   * \param ctx Device context to allocate on.
   * \return The allocated tensor wrapped as an NDArray (via NDArray::FromDLPack).
   *
   * \note The entrypoint is invoked without a null check here, so callers are
   *       expected to consult IsAvailable() first — TODO confirm all call
   *       sites do.
   */
  inline NDArray Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx) const {
    auto entry = entrypoints_[Op::kEmpty];
    auto result = FUNCCAST(tensoradapter::TAempty, entry)(shape, dtype, ctx);
    return NDArray::FromDLPack(result);
  }

 private:
  /*! \brief ctor (defined out of line in the runtime library) */
  TensorDispatcher();
  /*! \brief dtor (defined out of line in the runtime library) */
  ~TensorDispatcher();

  /*!
   * \brief List of symbols in the adapter library.
   *
   * Must match the functions in tensoradapter/include/tensoradapter.h.
   */
  static constexpr const char *names_[] = {
    "TAempty",
  };

  /*! \brief Index of each function to the symbol list */
  class Op {
   public:
    static constexpr int kEmpty = 0;
  };

  /*! \brief Number of functions, derived from names_ so the two stay in sync */
  static constexpr int num_entries_ = sizeof(names_) / sizeof(names_[0]);

  /*! \brief Entrypoints of each function; nullptr until resolved */
  void* entrypoints_[num_entries_] = {nullptr};

  /*! \brief Whether an adapter library has been loaded successfully */
  bool available_ = false;
  /*! \brief OS handle of the loaded adapter shared library */
#if defined(WIN32) || defined(_WIN32)
  HINSTANCE handle_;
#else  // !WIN32
  void* handle_;
#endif  // WIN32
};

}; // namespace runtime
}; // namespace dgl

#endif // DGL_RUNTIME_TENSORDISPATCH_H_
3 changes: 1 addition & 2 deletions python/dgl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
# This initializes Winsock and performs cleanup at termination as required
import socket

# Need to ensure that the backend framework is imported before load dgl libs,
# otherwise weird cuda problem happens
# Should import backend before importing anything else
from .backend import load_backend, backend_name

from . import function
Expand Down
Loading

0 comments on commit 9a7235f

Please sign in to comment.