Skip to content

Commit

Permalink
[Performance] Use allocator from PyTorch if possible (dmlc#2328)
Browse files Browse the repository at this point in the history
* first commit

* some thoughts

* move around

* more commit

* more fixes

* now it uses torch allocator

* fix symbol export error

* fix

* fixes

* test fix

* add script

* building separate library per version

* fix for vs2019

* more fixes

* fix on windows build

* update jenkinsfile

* auto copy built dlls for windows

* lint and installation guide update

* fix

* specify conda environment

* set environment for ci

* fix

* fix

* fix

* fix again

* revert

* fix cmake

* fix

* switch to using python interpreter path

* remove scripts

* debug

* oops sorry

* Update index.rst

* Update index.rst

* copies automatically, no need for this

* do not print message if library not found

* tiny fixes

* debug on nightly

* replace add_compile_definitions to make CMake 3.5 happy

* fix linking to wrong lib for multiple pytorch envs

* changed building strategy

* fix nightly

* fix windows

* fix windows again

* setup bugfix

* address comments

* change README
  • Loading branch information
BarclayII authored Dec 25, 2020
1 parent 4444a43 commit 9a7235f
Show file tree
Hide file tree
Showing 26 changed files with 616 additions and 58 deletions.
49 changes: 47 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,13 @@ include_directories("third_party/minigun/minigun")
include_directories("third_party/minigun/third_party/moderngpu/src")
include_directories("third_party/phmap/")
include_directories("third_party/xbyak/")
include_directories("tensoradapter/include")

# initial variables
set(DGL_LINKER_LIBS "")
if(NOT MSVC)
  # Use CMAKE_DL_LIBS instead of hardcoding "dl": it expands to the platform's
  # dlopen library ("dl" on Linux) and stays empty on platforms where dlopen
  # lives in libc (e.g. FreeBSD), so the link line is portable.
  set(DGL_LINKER_LIBS ${CMAKE_DL_LIBS})
endif(NOT MSVC)

if(MSVC OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")
set(DGL_RUNTIME_LINKER_LIBS "")
else(MSVC OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")
Expand Down Expand Up @@ -110,7 +114,8 @@ if(USE_OPENMP)
endif(USE_OPENMP)

if(USE_AVX)
  # Deliberately appended to the global flag strings rather than set with
  # add_compile_definitions (which requires CMake >= 3.12) or per-target
  # commands: CUDA.cmake later rebuilds the NVCC host flags from
  # CMAKE_CXX_FLAGS, so USE_AVX must live in these variables to also reach
  # the host-side compilation of CUDA sources.
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_AVX")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_AVX")
endif(USE_AVX)

# To compile METIS correct for DGL.
Expand Down Expand Up @@ -183,6 +188,46 @@ if (LIBCXX_ENABLE_PARALLEL_ALGORITHMS)
endif(LIBCXX_ENABLE_PARALLEL_ALGORITHMS)

target_link_libraries(dgl ${DGL_LINKER_LIBS} ${DGL_RUNTIME_LINKER_LIBS})
if(MSVC)
  # Copy the freshly built DLL up into the build directory root so downstream
  # steps find it in a config-independent location. Use CMake's portable copy
  # with a generator expression instead of "cmd.exe /c COPY Release\dgl.dll .",
  # which hardcodes the Release configuration and breaks for Debug or
  # RelWithDebInfo builds.
  add_custom_command(
    TARGET dgl POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy_if_different
            $<TARGET_FILE:dgl> ${CMAKE_CURRENT_BINARY_DIR}
    VERBATIM)
endif(MSVC)

# Tensor adapter libraries
# Linking against LibTorch involves linking against a bunch of other libraries
# returned by PyTorch's CMake (e.g. C10 or NVTools). Because CMake caches
# the libraries found by find_library(), it will often pick up libraries from
# the wrong version when everything is built in the same CMake process. As
# a result, we launch a separate CMake build for every PyTorch version.
if(BUILD_TORCH)
  # Native-path versions of the build directory and the cmake executable,
  # handed to the per-PyTorch-version build scripts via environment variables.
  file(TO_NATIVE_PATH ${CMAKE_CURRENT_BINARY_DIR} BINDIR)
  file(TO_NATIVE_PATH ${CMAKE_COMMAND} CMAKE_CMD)
  if(MSVC)
    file(TO_NATIVE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/tensoradapter/pytorch/build.bat BUILD_SCRIPT)
    add_custom_target(
      tensoradapter_pytorch
      ${CMAKE_COMMAND} -E env
      CMAKE_COMMAND=${CMAKE_CMD}
      CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
      BINDIR=${BINDIR}
      cmd /e:on /c ${BUILD_SCRIPT} ${TORCH_PYTHON_INTERPS}
      DEPENDS ${BUILD_SCRIPT}
      # CMAKE_CURRENT_SOURCE_DIR (not CMAKE_SOURCE_DIR) so this also works when
      # DGL is included as a subproject of a larger build.
      WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/tensoradapter/pytorch)
  else(MSVC)
    file(TO_NATIVE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/tensoradapter/pytorch/build.sh BUILD_SCRIPT)
    add_custom_target(
      tensoradapter_pytorch
      ${CMAKE_COMMAND} -E env
      CMAKE_COMMAND=${CMAKE_CMD}
      CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
      # Reuse the native-path BINDIR computed above, mirroring the MSVC branch
      # (on POSIX the native path is identical to CMAKE_CURRENT_BINARY_DIR).
      BINDIR=${BINDIR}
      bash ${BUILD_SCRIPT} ${TORCH_PYTHON_INTERPS}
      DEPENDS ${BUILD_SCRIPT}
      WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/tensoradapter/pytorch)
  endif(MSVC)
  # Ordering-only dependency: dgl does not link the adapter; it loads it at
  # runtime, so add_dependencies (not target_link_libraries) is correct here.
  add_dependencies(dgl tensoradapter_pytorch)
endif(BUILD_TORCH)

# Installation rules
install(TARGETS dgl DESTINATION lib${LIB_SUFFIX})
Expand Down
4 changes: 2 additions & 2 deletions Jenkinsfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/usr/bin/env groovy

dgl_linux_libs = "build/libdgl.so, build/runUnitTests, python/dgl/_ffi/_cy3/core.cpython-36m-x86_64-linux-gnu.so"
dgl_linux_libs = "build/libdgl.so, build/runUnitTests, python/dgl/_ffi/_cy3/core.cpython-36m-x86_64-linux-gnu.so, build/tensoradapter/pytorch/*.so"
// Currently DGL on Windows is not working with Cython yet
dgl_win64_libs = "build\\dgl.dll, build\\runUnitTests.exe"
dgl_win64_libs = "build\\dgl.dll, build\\runUnitTests.exe, build\\tensoradapter\\pytorch\\*.dll"

def init_git() {
sh "rm -rf *"
Expand Down
4 changes: 0 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -288,9 +288,7 @@ Right now, DGL works on [PyTorch](https://pytorch.org) 1.5.0+, [MXNet](https://m

```
conda install -c dglteam dgl # cpu version
conda install -c dglteam dgl-cuda9.0 # CUDA 9.0
conda install -c dglteam dgl-cuda9.2 # CUDA 9.2
conda install -c dglteam dgl-cuda10.0 # CUDA 10.0
conda install -c dglteam dgl-cuda10.1 # CUDA 10.1
conda install -c dglteam dgl-cuda10.2 # CUDA 10.2
conda install -c dglteam dgl-cuda11.0 # CUDA 11.0
Expand All @@ -302,9 +300,7 @@ conda install -c dglteam dgl-cuda11.0 # CUDA 11.0
| | Latest Nightly Build Version | Stable Version |
|-----------|-------------------------------|-------------------------|
| CPU | `pip install --pre dgl` | `pip install dgl` |
| CUDA 9.0 | `pip install --pre dgl-cu90` | `pip install dgl-cu90` |
| CUDA 9.2 | `pip install --pre dgl-cu92` | `pip install dgl-cu92` |
| CUDA 10.0 | `pip install --pre dgl-cu100` | `pip install dgl-cu100` |
| CUDA 10.1 | `pip install --pre dgl-cu101` | `pip install dgl-cu101` |
| CUDA 10.2 | `pip install --pre dgl-cu102` | `pip install dgl-cu102` |
| CUDA 11.0 | `pip install --pre dgl-cu110` | `pip install dgl-cu110` |
Expand Down
3 changes: 3 additions & 0 deletions cmake/config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,6 @@ set(USE_OPENMP ON)

# Whether to enable Intel's avx optimized kernel
set(USE_AVX ON)

# Whether to build PyTorch plugins
set(BUILD_TORCH ON)
5 changes: 4 additions & 1 deletion cmake/modules/CUDA.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,9 @@ macro(dgl_config_cuda out_variable)
# 0. Add host flags
message(STATUS "${CMAKE_CXX_FLAGS}")
string(REGEX REPLACE "[ \t\n\r]" "," CXX_HOST_FLAGS "${CMAKE_CXX_FLAGS}")
if(MSVC AND NOT USE_MSVC_MT)
string(CONCAT CXX_HOST_FLAGS ${CXX_HOST_FLAGS} ",/MD")
endif()
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler ,${CXX_HOST_FLAGS}")

# 1. Add arch flags
Expand All @@ -260,7 +263,7 @@ macro(dgl_config_cuda out_variable)
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14)
string(REPLACE "-std=c++11" "" CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
list(APPEND CUDA_NVCC_FLAGS "--std=c++14")
list(APPEND CUDA_NVCC_FLAGS "-std=c++14")

message(STATUS "CUDA flags: ${CUDA_NVCC_FLAGS}")

Expand Down
52 changes: 25 additions & 27 deletions docs/source/install/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ DGL works with the following operating systems:
* macOS X
* Windows 10

DGL requires Python version 3.6 or later.
DGL requires Python version 3.6, 3.7, 3.8 or 3.9.

DGL supports multiple tensor libraries as backends, e.g., PyTorch, MXNet. For requirements on backends and how to select one, see :ref:`backends`.

Expand Down Expand Up @@ -121,34 +121,32 @@ install the Python binding for DGL.
Windows
```````

The Windows source build is tested with CMake and MinGW/GCC. We highly recommend
using CMake and GCC from `conda installations <https://conda.io/miniconda.html>`_. To
get started, run the following:

.. code:: bash
conda install cmake m2w64-gcc m2w64-make
Build the shared library and install the Python binding.
You can build DGL with MSBuild. With `MS Build Tools <https://go.microsoft.com/fwlink/?linkid=840931>`_
and `CMake on Windows <https://cmake.org/download/>`_ installed, run the following
in VS2019 x64 Native tools command prompt.

.. code::
- CPU only build
.. code::
md build
cd build
cmake -DCMAKE_CXX_FLAGS="-DDMLC_LOG_STACK_TRACE=0 -DDGL_EXPORTS" -DCMAKE_MAKE_PROGRAM=mingw32-make .. -G "MSYS Makefiles"
mingw32-make
cd ..\python
python setup.py install
MD build
CD build
cmake -DCMAKE_CXX_FLAGS="/DDGL_EXPORTS" -DCMAKE_CONFIGURATION_TYPES="Release" -DDMLC_FORCE_SHARED_CRT=ON .. -G "Visual Studio 16 2019"
msbuild dgl.sln /m
CD ..\python
python setup.py install
- CUDA build
.. code::
You can also build DGL with MSBuild. With `MS Build Tools <https://go.microsoft.com/fwlink/?linkid=840931>`_
and `CMake on Windows <https://cmake.org/download/>`_ installed, run the following
in VS2017 x64 Native tools command prompt.
MD build
CD build
cmake -DCMAKE_CXX_FLAGS="/DDGL_EXPORTS" -DCMAKE_CONFIGURATION_TYPES="Release" -DDMLC_FORCE_SHARED_CRT=ON -DUSE_CUDA=ON .. -G "Visual Studio 16 2019"
msbuild dgl.sln /m
CD ..\python
python setup.py install
.. code::
Optional Flags
``````````````

MD build
CD build
cmake -DCMAKE_CXX_FLAGS="/DDGL_EXPORTS" -DCMAKE_CONFIGURATION_TYPES="Release" .. -G "Visual Studio 15 2017 Win64"
msbuild dgl.sln
cd ..\python
python setup.py install
- If you are using PyTorch, you can add ``-DBUILD_TORCH=ON`` flag in CMake
to build PyTorch plugins for further performance optimization. This applies for Linux,
Windows, and Mac.
5 changes: 5 additions & 0 deletions include/dgl/runtime/c_runtime_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,11 @@ DGL_DLL int DGLStreamStreamSynchronize(int device_type,
DGLStreamHandle src,
DGLStreamHandle dst);

/*!
 * \brief Sets the path to the tensoradapter library.
 * \param path_cstr NUL-terminated path string pointing at the tensoradapter
 *        shared library to load.
 */
DGL_DLL void DGLSetTAPath(const char *path_cstr);

/*!
* \brief Bug report macro.
*
Expand Down
24 changes: 24 additions & 0 deletions include/dgl/runtime/env.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/*!
 * Copyright (c) 2017 by Contributors
 * \file dgl/runtime/env.h
 * \brief Structure for holding DGL global environment variables
 */

#ifndef DGL_RUNTIME_ENV_H_
#define DGL_RUNTIME_ENV_H_

#include <string>

/*!
 * \brief Global environment variables.
 *
 * Process-wide mutable state; access it through Env::Global().
 */
struct Env {
  /*! \brief Returns the process-wide singleton instance (function-local static). */
  static Env* Global() {
    static Env inst;
    return &inst;
  }
  /*! \brief the path to the tensoradapter library */
  std::string ta_path;
};

#endif  // DGL_RUNTIME_ENV_H_
114 changes: 114 additions & 0 deletions include/dgl/runtime/tensordispatch.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/*!
* Copyright (c) 2020 by Contributors
* \file array/tensordispatch.h
* \brief This file defines the dispatcher of tensor operators to framework-specific
* implementations.
*
* The dispatcher consists of a TensorDispatcher singleton in DGL C library and
* one separately-built shared library per supported backend.
*
* Those shared libraries contain wrappers of the framework-specific operators.
* The wrappers have almost the same signatures as functions in aten namespace,
* except that they accept and return DLManagedTensors instead of NDArrays.
* The wrappers are defined with extern "C", meaning that the C++ compiler will
* not do name mangling for those functions so that DGL can conveniently locate
* them using dlsym(3) (or GetProcAddress in Windows).
*
* The TensorDispatcher singleton maintains a mapping from an array operator to
* the address of the corresponding symbol in the shared library. During
* initialization, the TensorDispatcher checks which backend DGL is using.
* It then locates and opens the corresponding shared library using dlopen(3) (or
* LoadLibrary in Windows), and populates the said mapping above with dlsym(3)
* (or GetProcAddress in Windows).
*
* A tensor operator in TensorDispatcher first checks whether the corresponding symbol
* address is found in the mapping. If so, it calls the function located at the
* symbol address instead, translating NDArrays to DLManagedTensors using
* NDArray::ToDLPack(), and translates the DLManagedTensors in the return values
* back to NDArrays using NDArray::FromDLPack(). If not, it falls back to the
* implementation in dgl::aten namespace.
*/

#ifndef DGL_RUNTIME_TENSORDISPATCH_H_
#define DGL_RUNTIME_TENSORDISPATCH_H_

#include <dlpack/dlpack.h>
#include <tensoradapter.h>
#if defined(WIN32) || defined(_WIN32)
#include <windows.h>
#endif // WIN32
#include <vector>
#include "ndarray.h"

/*! \brief Casts a pointer \c entry to a function pointer with signature of \c func */
#define FUNCCAST(func, entry) (*reinterpret_cast<decltype(&(func))>(entry))

namespace dgl {
namespace runtime {

/*!
 * \brief Dispatcher that delegates the function calls to framework-specific C++ APIs.
 *
 * Process-wide singleton; obtain it through TensorDispatcher::Global().
 */
class TensorDispatcher {
 public:
  /*! \brief Get the singleton instance. */
  static TensorDispatcher* Global() {
    static TensorDispatcher inst;
    return &inst;
  }

  /*! \brief Whether an adapter library is available */
  inline bool IsAvailable() {
    return available_;
  }

  /*!
   * \brief Allocate an empty tensor.
   *
   * Used in NDArray::Empty().
   *
   * \param shape Dimensions of the tensor to allocate.
   * \param dtype Element data type.
   * \param ctx Device context to allocate on.
   * \return The allocated tensor wrapped as an NDArray (via NDArray::FromDLPack).
   *
   * \note The entrypoint is invoked without a null check here, so callers are
   *       expected to consult IsAvailable() first — TODO confirm all call
   *       sites do.
   */
  inline NDArray Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx) const {
    auto entry = entrypoints_[Op::kEmpty];
    auto result = FUNCCAST(tensoradapter::TAempty, entry)(shape, dtype, ctx);
    return NDArray::FromDLPack(result);
  }

 private:
  /*! \brief ctor (defined out of line in the runtime library) */
  TensorDispatcher();
  /*! \brief dtor (defined out of line in the runtime library) */
  ~TensorDispatcher();

  /*!
   * \brief List of symbols in the adapter library.
   *
   * Must match the functions in tensoradapter/include/tensoradapter.h.
   */
  static constexpr const char *names_[] = {
    "TAempty",
  };

  /*! \brief Index of each function to the symbol list */
  class Op {
   public:
    static constexpr int kEmpty = 0;
  };

  /*! \brief Number of functions, derived from names_ so the two stay in sync */
  static constexpr int num_entries_ = sizeof(names_) / sizeof(names_[0]);

  /*! \brief Entrypoints of each function; nullptr until resolved */
  void* entrypoints_[num_entries_] = {nullptr};

  /*! \brief Whether an adapter library has been loaded successfully */
  bool available_ = false;
  /*! \brief OS handle of the loaded adapter shared library */
#if defined(WIN32) || defined(_WIN32)
  HINSTANCE handle_;
#else  // !WIN32
  void* handle_;
#endif  // WIN32
};

}; // namespace runtime
}; // namespace dgl

#endif // DGL_RUNTIME_TENSORDISPATCH_H_
3 changes: 1 addition & 2 deletions python/dgl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
# This initializes Winsock and performs cleanup at termination as required
import socket

# Need to ensure that the backend framework is imported before load dgl libs,
# otherwise weird cuda problem happens
# Should import backend before importing anything else
from .backend import load_backend, backend_name

from . import function
Expand Down
Loading

0 comments on commit 9a7235f

Please sign in to comment.