[RPC] Refactoring networking APIs (dmlc#496)
* Refactoring network API

* update demo

* update

* update demo

* update demo

* add num_sender

* update

* fix lint

* fix lint

* fix lint

* update
aksnzhy authored Apr 16, 2019
1 parent 688a922 commit b89dcce
Showing 12 changed files with 426 additions and 383 deletions.
2 changes: 1 addition & 1 deletion examples/mxnet/sampling/dis_sampling/README.md
@@ -1,6 +1,6 @@
### Demo for Distributed Sampler

First we need to change the `--ip` and `--port` in `run_trainer.sh` and `run_sampler.sh` for your own environment.
First we need to change the `--ip` in `run_trainer.sh` and `run_sampler.sh` for your own environment.

Then we need to start trainer node:

14 changes: 6 additions & 8 deletions examples/mxnet/sampling/dis_sampling/gcn_ns_sampler.py
@@ -15,7 +15,8 @@ def worker(self, args):
"""User-defined worker function
"""
# Start sender
sender = dgl.contrib.sampling.SamplerSender(ip=args.ip, port=args.port)
namebook = { 0:args.ip }
sender = dgl.contrib.sampling.SamplerSender(namebook)

# load and preprocess dataset
data = load_data(args)
@@ -41,10 +42,9 @@ def worker(self, args):
num_hops=args.n_layers+1,
seed_nodes=train_nid):
print("send train nodeflow: %d" %(idx))
sender.send(nf)
sender.send(nf, 0)
idx += 1


def main(args):
pool = MySamplerPool()
pool.start(args.num_sender, args)
@@ -64,12 +64,10 @@ def main(args):
help="graph self-loop (default=False)")
parser.add_argument("--n-layers", type=int, default=1,
help="number of hidden gcn layers")
parser.add_argument("--ip", type=str, default='127.0.0.1',
help="ip address of remote trainer machine")
parser.add_argument("--port", type=int, default=2049,
help="listen port of remote trainer machine")
parser.add_argument("--ip", type=str, default='127.0.0.1:50051',
help="IP address of remote trainer machine")
parser.add_argument("--num-sender", type=int, default=1,
help="total number of sampler sender")
help="Number of sampler sender machine")
args = parser.parse_args()

print(args)
6 changes: 2 additions & 4 deletions examples/mxnet/sampling/dis_sampling/gcn_trainer.py
@@ -123,7 +123,7 @@ def main(args):
data.graph.add_edges_from([(i,i) for i in range(len(data.graph))])

# Create sampler receiver
receiver = dgl.contrib.sampling.SamplerReceiver(ip=args.ip, port=args.port, num_sender=args.num_sender)
receiver = dgl.contrib.sampling.SamplerReceiver(addr=args.ip, num_sender=args.num_sender)

train_nid = mx.nd.array(np.nonzero(data.train_mask)[0]).astype(np.int64).as_in_context(ctx)
test_nid = mx.nd.array(np.nonzero(data.test_mask)[0]).astype(np.int64).as_in_context(ctx)
@@ -255,10 +255,8 @@ def main(args):
help="graph self-loop (default=False)")
parser.add_argument("--weight-decay", type=float, default=5e-4,
help="Weight for L2 loss")
parser.add_argument("--ip", type=str, default='127.0.0.1',
parser.add_argument("--ip", type=str, default='127.0.0.1:50051',
help="IP address of sampler receiver machine")
parser.add_argument("--port", type=int, default=2049,
help="Listening port of sampler receiver machine")
parser.add_argument("--num-sender", type=int, default=1,
help="Number of sampler sender machine")
args = parser.parse_args()
2 changes: 1 addition & 1 deletion examples/mxnet/sampling/dis_sampling/run_sampler.sh
@@ -1 +1 @@
DGLBACKEND=mxnet python3 gcn_ns_sampler.py --ip 127.0.0.1 --port 2049 --num-sender=5 --dataset reddit-self-loop --num-neighbors 2 --batch-size 1000 --test-batch-size 500
DGLBACKEND=mxnet python3 gcn_ns_sampler.py --ip 127.0.0.1:2049 --num-sender=1 --dataset reddit-self-loop --num-neighbors 2 --batch-size 1000 --test-batch-size 500
2 changes: 1 addition & 1 deletion examples/mxnet/sampling/dis_sampling/run_trainer.sh
@@ -1 +1 @@
DGLBACKEND=mxnet python3 gcn_trainer.py --ip 127.0.0.1 --port 2049 --num-sender=5 --dataset reddit-self-loop --num-neighbors 2 --batch-size 1000 --test-batch-size 500 --n-hidden 64
DGLBACKEND=mxnet python3 gcn_trainer.py --ip 127.0.0.1:2049 --num-sender=1 --dataset reddit-self-loop --num-neighbors 2 --batch-size 1000 --test-batch-size 500 --n-hidden 64
90 changes: 51 additions & 39 deletions python/dgl/contrib/sampling/dis_sampler.py
@@ -1,7 +1,8 @@
# This file contains DGL distributed samplers APIs.
from ...network import _send_subgraph, _recv_subgraph
from ...network import _send_nodeflow, _recv_nodeflow
from ...network import _create_sender, _create_receiver
from ...network import _finalize_sender, _finalize_receiver
from ...network import _add_receiver_addr, _sender_connect, _receiver_wait

from multiprocessing import Pool
from abc import ABCMeta, abstractmethod
@@ -11,7 +12,8 @@ class SamplerPool(object):
should be implemented by users. SamplerPool will fork() N (N = num_worker)
child processes, and each process will perform worker() method independently.
Note that, the fork() API will use shared memory for N processes and the OS will
perform copy-on-write only when developers write that piece of memory.
perform copy-on-write only when developers write that piece of memory. So forking N
processes and loading N copies of the graph will not increase the memory overhead.
Users can use this class like this:
@@ -36,7 +38,7 @@ def start(self, num_worker, args):
num_worker : int
number of worker (number of child process)
args : arguments
arguments passed by user
any arguments passed by user
"""
p = Pool()
for i in range(num_worker):
@@ -48,74 +50,84 @@ def start(self, num_worker, args):

@abstractmethod
def worker(self, args):
"""User-defined function
Parameters
----------
args : arguments
any arguments passed by user
"""
pass
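For orientation, here is a minimal sketch of how a user might subclass SamplerPool, modeled on the demo in gcn_ns_sampler.py; the class name, the worker body, and the assumption that SamplerPool is importable from dgl.contrib.sampling are illustrative only, not part of this commit.

```python
import dgl

class MySamplerPool(dgl.contrib.sampling.SamplerPool):
    """Hypothetical pool; each forked child process runs worker() independently."""

    def worker(self, args):
        # In the real demo (gcn_ns_sampler.py) this builds a NeighborSampler
        # and a SamplerSender; here only the control flow is shown.
        print("child sampler started with args:", args)

if __name__ == '__main__':
    pool = MySamplerPool()
    pool.start(4, args=None)  # fork four child sampler processes
```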

class SamplerSender(object):
"""Sender of DGL distributed sampler.
"""SamplerSender for DGL distributed training.
Users use SamplerSender class to send sampled
subgraph (NodeFlow) to remote trainer. Note that, SamplerSender
class will try to connect to SamplerReceiver in a loop until the
SamplerReceiver started.
Users use SamplerSender to send sampled subgraph (NodeFlow)
to remote SamplerReceiver. Note that a SamplerSender can connect
to multiple SamplerReceivers.
Parameters
----------
ip : str
ip address of remote trainer machine
port : int
port of remote trainer machine
namebook : dict
address namebook of SamplerReceiver, where
key is receiver's ID and value is receiver's address, e.g.,
{ 0:'168.12.23.45:50051',
1:'168.12.23.21:50051',
2:'168.12.46.12:50051' }
"""
def __init__(self, ip, port):
self._ip = ip
self._port = port
self._sender = _create_sender(ip, port)
def __init__(self, namebook):
assert len(namebook) > 0, 'namebook cannot be empty.'
self._namebook = namebook
self._sender = _create_sender()
for ID, addr in self._namebook.items():
vec = addr.split(':')
_add_receiver_addr(self._sender, vec[0], int(vec[1]), ID)
_sender_connect(self._sender)

def __del__(self):
"""Finalize Sender
"""
# _finalize_sender will send a special message
# to tell the remote trainer machine that it has finished its job.
_finalize_sender(self._sender)

def send(self, nodeflow):
def send(self, nodeflow, recv_id):
"""Send sampled subgraph (NodeFlow) to remote trainer.
Parameters
----------
nodeflow : NodeFlow
sampled NodeFlow object
recv_id : int
receiver ID
"""
_send_subgraph(self._sender, nodeflow)
_send_nodeflow(self._sender, nodeflow, recv_id)
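As a quick illustration of the namebook-based interface, a hedged sketch of the sender side; the addresses come from the docstring above, while run_sampler and its nodeflows argument are placeholders.

```python
import dgl

def run_sampler(nodeflows):
    """Push pre-sampled NodeFlows to two remote trainers (sketch only)."""
    # Hypothetical namebook: receiver ID -> 'ip:port' of a SamplerReceiver.
    namebook = {0: '168.12.23.45:50051',
                1: '168.12.23.21:50051'}
    sender = dgl.contrib.sampling.SamplerSender(namebook)
    for i, nf in enumerate(nodeflows):
        sender.send(nf, i % 2)  # round-robin over the two receivers
```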

class SamplerReceiver(object):
"""Receiver of DGL distributed sampler.
"""SamplerReceiver for DGL distributed training.
Users use SamplerReceiver class to receive sampled
subgraph (NodeFlow) from remote samplers. Note that SamplerReceiver
can receive messages from multiple senders concurrently, by given
the num_sender parameter, and only when all senders connect to SamplerReceiver,
the SamplerReceiver can start its job.
Users use SamplerReceiver to receive sampled subgraphs (NodeFlow)
from remote SamplerSenders. Note that SamplerReceiver can receive messages
from multiple SamplerSenders concurrently, as specified by the num_sender parameter.
Note that the receiver can start its job only after all SamplerSenders have
connected to it.
Parameters
----------
ip : str
ip address of current trainer machine
port : int
port of current trainer machine
addr : str
address of SamplerReceiver, e.g., '127.0.0.1:50051'
num_sender : int
total number of sampler nodes, use 1 by default
total number of SamplerSenders
"""
def __init__(self, ip, port, num_sender=1):
self._ip = ip
self._port = port
def __init__(self, addr, num_sender):
self._addr = addr
self._num_sender = num_sender
self._receiver = _create_receiver(ip, port, num_sender)
self._receiver = _create_receiver()
vec = self._addr.split(':')
_receiver_wait(self._receiver, vec[0], int(vec[1]), self._num_sender)

def __del__(self):
"""Finalize Receiver
_finalize_receiver method will clean up the
back-end threads started by the SamplerReceiver.
"""
_finalize_receiver(self._receiver)

@@ -132,4 +144,4 @@ def recv(self, graph):
NodeFlow
received NodeFlow object
"""
return _recv_subgraph(self._receiver, graph)
return _recv_nodeflow(self._receiver, graph)
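A matching sketch for one trainer process (receiver ID 0 in the namebook above); run_trainer, its arguments, and the address are again placeholders.

```python
import dgl

def run_trainer(g, num_batches):
    """Pull NodeFlows sampled from the parent graph ``g`` (sketch only)."""
    # Blocks until all num_sender SamplerSenders have connected.
    receiver = dgl.contrib.sampling.SamplerReceiver(addr='168.12.23.45:50051',
                                                    num_sender=1)
    for _ in range(num_batches):
        nf = receiver.recv(g)  # returns the next received NodeFlow
        # ... run a training step on nf ...
```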
103 changes: 63 additions & 40 deletions python/dgl/network.py
@@ -8,62 +8,105 @@

_init_api("dgl.network")

def _create_sender(ip_addr, port):
"""Create a sender communicator via C socket.
def _create_sender():
"""Create a Sender communicator via C api
"""
return _CAPI_DGLSenderCreate()

def _finalize_sender(sender):
"""Finalize Sender communicator
Parameters
----------
ip_addr : str
ip address of remote trainer
port : int
port of remote trainer
sender : ctypes.c_void_p
C Sender handle
"""
return _CAPI_DGLSenderCreate(ip_addr, port)
_CAPI_DGLFinalizeSender(sender)

def _create_receiver(ip_addr, port, num_sender):
"""Create a receiver communicator via C socket.
def _add_receiver_addr(sender, ip_addr, port, recv_id):
"""Add Receiver IP address to namebook
Parameters
----------
sender : ctypes.c_void_p
C Sender handle
ip_addr : str
ip address of remote trainer
IP address of Receiver
port : int
listen port of remote trainer
num_sender : int
total number of sampler nodes
listen port of Receiver
recv_id : int
Receiver ID
"""
_CAPI_DGLSenderAddReceiver(sender, ip_addr, port, recv_id)

def _sender_connect(sender):
"""Connect to all the Receiver
Parameters
----------
sender : ctypes.c_void_p
C Sender handle
"""
return _CAPI_DGLReceiverCreate(ip_addr, port, num_sender)
_CAPI_DGLSenderConnect(sender)

def _send_subgraph(sender, nodeflow):
"""Send sampled subgraph (Nodeflow) to remote trainer.
def _send_nodeflow(sender, nodeflow, recv_id):
"""Send sampled subgraph (Nodeflow) to remote Receiver.
Parameters
----------
sender : ctypes.c_void_p
C sender handle
C Sender handle
nodeflow : NodeFlow
NodeFlow object
recv_id : int
Receiver ID
"""
graph_handle = nodeflow._graph._handle
node_mapping = nodeflow._node_mapping.todgltensor()
edge_mapping = nodeflow._edge_mapping.todgltensor()
# Can we convert NDArray to tensor directly, instead of using toindex()?
layers_offsets = utils.toindex(nodeflow._layer_offsets).todgltensor()
flows_offsets = utils.toindex(nodeflow._block_offsets).todgltensor()
_CAPI_SenderSendSubgraph(sender,
recv_id,
graph_handle,
node_mapping,
edge_mapping,
layers_offsets,
flows_offsets)

def _recv_subgraph(receiver, graph):
def _create_receiver():
"""Create a Receiver communicator via C api
"""
return _CAPI_DGLReceiverCreate()

def _finalize_receiver(receiver):
"""Finalize Receiver Communicator
"""
_CAPI_DGLFinalizeReceiver(receiver)

def _receiver_wait(receiver, ip_addr, port, num_sender):
"""Wait all Sender to connect..
Parameters
----------
receiver : ctypes.c_void_p
C Receiver handle
ip_addr : str
IP address of Receiver
port : int
port of Receiver
num_sender : int
total number of Sender
"""
_CAPI_DGLReceiverWait(receiver, ip_addr, port, num_sender)

def _recv_nodeflow(receiver, graph):
"""Receive sampled subgraph (NodeFlow) from remote sampler.
Parameters
----------
receiver : ctypes.c_void_p
C receiver handle
C Receiver handle
graph : DGLGraph
The parent graph
Expand All @@ -75,23 +118,3 @@ def _recv_subgraph(receiver, graph):
# hdl is a list of ptr
hdl = unwrap_to_ptr_list(_CAPI_ReceiverRecvSubgraph(receiver))
return NodeFlow(graph, hdl[0])

def _finalize_sender(sender):
"""Finalize Sender communicator
Parameters
----------
sender : ctypes.c_void_p
C sender handle
"""
_CAPI_DGLFinalizeCommunicator(sender)

def _finalize_receiver(receiver):
"""Finalize Receiver communicator
Parameters
----------
receiver : ctypes.c_void_p
C receiver handle
"""
_CAPI_DGLFinalizeCommunicator(receiver)
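For reference, the call order that SamplerSender and SamplerReceiver wrap is roughly the sketch below. The address, receiver ID, and the two helper functions are illustrative only, and in practice the two halves run in separate processes or machines.

```python
from dgl.network import (_create_sender, _add_receiver_addr, _sender_connect,
                         _send_nodeflow, _finalize_sender, _create_receiver,
                         _receiver_wait, _recv_nodeflow, _finalize_receiver)

def receiver_side(parent_graph, num_sender=1):
    """Trainer process: wait for all senders, then pull one NodeFlow (sketch)."""
    receiver = _create_receiver()
    # Blocks until `num_sender` senders have connected to 127.0.0.1:2049.
    _receiver_wait(receiver, '127.0.0.1', 2049, num_sender)
    nf = _recv_nodeflow(receiver, parent_graph)
    _finalize_receiver(receiver)
    return nf

def sender_side(nodeflow):
    """Sampler process: register the receiver, connect, send, finalize (sketch)."""
    sender = _create_sender()
    _add_receiver_addr(sender, '127.0.0.1', 2049, 0)  # receiver ID 0
    _sender_connect(sender)
    _send_nodeflow(sender, nodeflow, 0)
    _finalize_sender(sender)
```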