Skip to content

Commit

Permalink
[RPC] Use tensorpipe for rpc communication (dmlc#3335)
Browse files Browse the repository at this point in the history
* not sure whether this works

* add change

* fix

* fix

* fix

* remove

* revert

* lint

* lint

* fix

* revert

* lint

* fix

* only build rpc on linux

* lint

* lint

* fix build on windows

* fix windows

* remove old test

* fix cmake

* Revert "remove old test"

This reverts commit f1ea75c.

* fix windows

* fix

* fix

* fix indent

* fix indent

* address comment

* fix

* fix

* fix

* fix

* fix

* lint

* fix indent

* fix lint

* add introduction

* fix

* lint

* lint

* add more logs

* fix

* update xbyak for C++14 with gcc5

* Remove channels

* fix

* add test script

* fix

* remove unused file

* fix lint

* add timeout
  • Loading branch information
VoVAllen authored Dec 6, 2021
1 parent 987db37 commit a3ce780
Show file tree
Hide file tree
Showing 15 changed files with 850 additions and 221 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
[submodule "third_party/libxsmm"]
path = third_party/libxsmm
url = https://github.com/hfp/libxsmm.git
[submodule "third_party/tensorpipe"]
path = third_party/tensorpipe
url = https://github.com/pytorch/tensorpipe
[submodule "third_party/thrust"]
path = third_party/thrust
url = https://github.com/NVIDIA/thrust.git
21 changes: 20 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -166,11 +166,17 @@ file(GLOB_RECURSE DGL_SRC_1
src/api/*.cc
src/graph/*.cc
src/scheduler/*.cc
src/rpc/*.cc
)

list(APPEND DGL_SRC ${DGL_SRC_1})

if (NOT MSVC)
file(GLOB_RECURSE DGL_RPC_SRC src/rpc/*.cc)
else()
file(GLOB_RECURSE DGL_RPC_SRC src/rpc/network/*.cc)
endif()
list(APPEND DGL_SRC ${DGL_RPC_SRC})

# Configure cuda
if(USE_CUDA)
dgl_config_cuda(DGL_CUDA_SRC)
Expand Down Expand Up @@ -198,6 +204,8 @@ else(USE_CUDA)
add_library(dgl SHARED ${DGL_SRC})
endif(USE_CUDA)

set_property(TARGET dgl PROPERTY CXX_STANDARD 14)

# include directories
target_include_directories(dgl PRIVATE "include")
target_include_directories(dgl PRIVATE "third_party/dlpack/include")
Expand All @@ -209,6 +217,7 @@ target_include_directories(dgl PRIVATE "tensoradapter/include")
target_include_directories(dgl PRIVATE "third_party/nanoflann/include")
target_include_directories(dgl PRIVATE "third_party/libxsmm/include")


# For serialization
if (USE_HDFS)
option(DMLC_HDFS_SHARED "dgl has to build with dynamic hdfs library" ON)
Expand Down Expand Up @@ -242,6 +251,16 @@ if((NOT MSVC) AND USE_LIBXSMM)
list(APPEND DGL_LINKER_LIBS -L${CMAKE_SOURCE_DIR}/third_party/libxsmm/lib/ xsmm)
endif((NOT MSVC) AND USE_LIBXSMM)

if(NOT MSVC)
# Only build tensorpipe with non-MSVC toolchains (e.g. on Linux); it is skipped for MSVC builds
string(REPLACE "-pedantic" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
set(TP_BUILD_LIBUV ON)
set(TP_STATIC_OR_SHARED STATIC)
add_subdirectory(third_party/tensorpipe)
list(APPEND DGL_LINKER_LIBS tensorpipe)
target_include_directories(dgl PRIVATE third_party/tensorpipe)
endif(NOT MSVC)

# Compile TVM Runtime and Featgraph
# (NOTE) We compile a dynamic library called featgraph_runtime, which the DGL library links to.
# Kernels are packed in a separate dynamic library called featgraph_kernels, which DGL
Expand Down
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def unit_test_linux(backend, dev) {
def unit_test_win64(backend, dev) {
init_git_win64()
unpack_lib("dgl-${dev}-win64", dgl_win64_libs)
timeout(time: 10, unit: 'MINUTES') {
timeout(time: 20, unit: 'MINUTES') {
bat "CALL tests\\scripts\\task_unit_test.bat ${backend}"
}
}
Expand Down
4 changes: 2 additions & 2 deletions python/dgl/distributed/rpc_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ def start_server(server_id, ip_config, num_servers, num_clients, server_state, \
"""
assert server_id >= 0, 'server_id (%d) cannot be a negative number.' % server_id
assert num_servers > 0, 'num_servers (%d) must be a positive number.' % num_servers
assert num_clients >= 0, 'num_client (%d) cannot be a negative number.' % num_client
assert max_queue_size > 0, 'queue_size (%d) cannot be a negative number.' % queue_size
assert num_clients >= 0, 'num_client (%d) cannot be a negative number.' % num_clients
assert max_queue_size > 0, 'queue_size (%d) cannot be a negative number.' % max_queue_size
assert net_type in ('socket'), 'net_type (%s) can only be \'socket\'' % net_type
# Register signal handler.
rpc.register_sig_handler()
Expand Down
2 changes: 1 addition & 1 deletion src/graph/serialize/zerocopy_serializer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ struct RawDataTensorCtx {

void RawDataTensoDLPackDeleter(DLManagedTensor* tensor) {
auto ctx = static_cast<RawDataTensorCtx*>(tensor->manager_ctx);
free(ctx->tensor.dl_tensor.data);
delete[] ctx->tensor.dl_tensor.data;
delete ctx;
}

Expand Down
Loading

0 comments on commit a3ce780

Please sign in to comment.