Skip to content

Commit

Permalink
[KVStore] add init_data() on client (dmlc#1466)
Browse files Browse the repository at this point in the history
* add init_data on client

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* fix lint

* fix lint

* update

* update

* update

* update

* update

* update
  • Loading branch information
aksnzhy authored Apr 26, 2020
1 parent 0c0a897 commit 27520bc
Show file tree
Hide file tree
Showing 5 changed files with 277 additions and 28 deletions.
145 changes: 145 additions & 0 deletions python/dgl/contrib/dis_kvstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,7 @@ def start(self):
name=str(client_id),
id=None,
data=None,
shape=None,
c_ptr=None)
_send_kv_msg(self._sender, msg, client_id)

Expand All @@ -435,6 +436,7 @@ def start(self):
name=shared_tensor,
id=None,
data=None,
shape=None,
c_ptr=None)

for client_id in range(len(self._client_namebook)):
Expand Down Expand Up @@ -471,8 +473,33 @@ def start(self):
name=msg.name,
id=msg.id,
data=res_tensor,
shape=None,
c_ptr=None)
_send_kv_msg(self._sender, back_msg, msg.rank)
# Init new data
elif msg.type == KVMsgType.INIT:
assert msg.rank == 0
data_str, target_name = msg.name.split('|')
data_name, data_type = self._deserialize_shared_tensor(data_str)
dtype = F.data_type_dict[data_type]
data_shape = F.asnumpy(msg.shape).tolist()
if self._server_id % self._group_count == 0: # master server
data_tensor = F.zeros(data_shape, dtype, F.cpu())
self.init_data(name=data_name, data_tensor=data_tensor)
else: # backup server
self.init_data(name=data_name)
g2l = self._data_store[target_name+'-g2l-']
self._data_store[data_name+'-g2l-'] = g2l
self._has_data.add(data_name+'-g2l-')
back_msg = KVStoreMsg(
type=KVMsgType.INIT,
rank=self._server_id,
name=msg.name,
id=None,
data=None,
shape=msg.shape,
c_ptr=None)
_send_kv_msg(self._sender, back_msg, 0)
# Barrier message
elif msg.type == KVMsgType.BARRIER:
self._barrier_count += 1
Expand All @@ -483,6 +510,7 @@ def start(self):
name=None,
id=None,
data=None,
shape=None,
c_ptr=None)
for client_id in range(self._client_count):
_send_kv_msg(self._sender, back_msg, client_id)
Expand Down Expand Up @@ -522,6 +550,28 @@ def _serialize_shared_tensor(self, name, dtype):
return str_data


def _deserialize_shared_tensor(self, data):
"""Deserialize shared tensor information sent from server
Parameters
----------
data : str
serialized string
Returns
-------
str
tensor name
str
data type
"""
data_list = data.split('/')
tensor_name = data_list[0]
data_type = data_list[-1]

return tensor_name, data_type


def _write_data_shape_type(self, filename, data):
"""Write data shape to a temp file.
Expand Down Expand Up @@ -720,6 +770,7 @@ def connect(self):
name=self._addr,
id=None,
data=None,
shape=None,
c_ptr=None)

for server_id in range(self._server_count):
Expand Down Expand Up @@ -757,6 +808,72 @@ def connect(self):
print("KVClient %d connect to kvstore successfully!" % self.get_id())


def init_data(self, name, shape, dtype, target_name):
    """Send message to kvserver to initialize new data and
    get corresponded shared-tensor (e.g., partition_book, g2l) on kvclient.

    The new data will be initialized to zeros.

    Note that, this API must be invoked after the connect() API.

    Parameters
    ----------
    name : str
        data name
    shape : list or tuple of int
        global data shape; shape[0] must equal the length of the
        partition_book registered under target_name
    dtype : dtype
        data type
    target_name : str
        target name is used to find existing partition_book and g2l mapping.
    """
    assert len(name) > 0, 'name cannot be empty.'
    assert len(shape) > 0, 'shape cannot be empty.'
    assert len(target_name) > 0, 'target_name cannot be empty.'

    if self._client_id == 0: # only client_0 send message to server
        partition_book = self._data_store[target_name+'-part-']
        # machines: ids that appear in the partition book,
        # count: how many rows each of those machines owns.
        machines, count = np.unique(F.asnumpy(partition_book), return_counts=True)
        assert shape[0] == len(partition_book), \
            'shape[0] must be equal to the length of partition_book.'
        # The message name does not depend on the target machine, so build it once.
        data_str = self._serialize_shared_tensor(name, dtype) + '|' + target_name
        sent_count = 0
        # send message to all of the server nodes
        for m_id, row_count in zip(machines, count):
            # list(shape) copies any sequence, so tuple shapes work as well as lists.
            partitioned_shape = list(shape)
            partitioned_shape[0] = row_count
            for n in range(self._group_count):
                server_id = m_id * self._group_count + n
                msg = KVStoreMsg(
                    type=KVMsgType.INIT,
                    rank=0,
                    name=data_str,
                    id=None,
                    data=None,
                    shape=F.tensor(partitioned_shape),
                    c_ptr=None)
                _send_kv_msg(self._sender, msg, server_id)
                sent_count += 1
        # recv confirmation message from server nodes: wait for exactly as
        # many replies as requests were sent, so a machine that owns no rows
        # (and therefore was never contacted) cannot block this loop forever.
        for _ in range(sent_count):
            msg = _recv_kv_msg(self._receiver)
            assert msg.type == KVMsgType.INIT
    self.barrier() # wait all the client and server finish its job
    # The new data shares the g2l mapping and partition book of the target data.
    g2l = self._data_store[target_name+'-g2l-']
    partition_book = self._data_store[target_name+'-part-']
    self._data_store[name+'-g2l-'] = g2l
    self._data_store[name+'-part-'] = partition_book
    self._has_data.add(name+'-g2l-')
    self._has_data.add(name+'-part-')
    # Read new data from shared-memory created by server
    local_shape, data_type = self._read_data_shape_type(
        name+'-data-shape-'+str(self._machine_id))
    assert data_type == get_type_str(dtype)
    shared_data = empty_shared_mem(name+'-data-', False, local_shape, data_type)
    dlpack = shared_data.to_dlpack()
    self._data_store[name+'-data-'] = F.zerocopy_from_dlpack(dlpack)
    self._has_data.add(name+'-data-')
    self._data_name_list.append(name)


def print(self):
"""Print client information (Used by debug)
"""
Expand Down Expand Up @@ -886,6 +1003,7 @@ def push(self, name, id_tensor, data_tensor):
name=name,
id=partial_id,
data=partial_data,
shape=None,
c_ptr=None)
# randomly select a server node in target machine for load-balance
s_id = random.randint(machine[idx]*self._group_count, (machine[idx]+1)*self._group_count-1)
Expand Down Expand Up @@ -967,6 +1085,7 @@ def pull(self, name, id_tensor):
name=name,
id=partial_id,
data=None,
shape=None,
c_ptr=None)
# randomly select a server node in target machine for load-balance
s_id = random.randint(machine[idx]*self._group_count, (machine[idx]+1)*self._group_count-1)
Expand All @@ -985,6 +1104,7 @@ def pull(self, name, id_tensor):
name=name,
id=None,
data=local_data,
shape=None,
c_ptr=None)
msg_list.append(local_msg)
self._garbage_msg.append(local_msg)
Expand Down Expand Up @@ -1013,6 +1133,7 @@ def barrier(self):
name=None,
id=None,
data=None,
shape=None,
c_ptr=None)

for server_id in range(self._server_count):
Expand All @@ -1035,6 +1156,7 @@ def shut_down(self):
name=None,
id=None,
data=None,
shape=None,
c_ptr=None)
_send_kv_msg(self._sender, msg, server_id)

Expand Down Expand Up @@ -1102,6 +1224,29 @@ def _local_ip4_addr_list(self):
return nic


def _serialize_shared_tensor(self, name, dtype):
    """Encode a tensor name and its data type into a single string.

    The result has the form '<name>/<type string>', where the type string
    is obtained from get_type_str(dtype).

    Parameters
    ----------
    name : str
        tensor name
    dtype : dtype
        data type

    Returns
    -------
    str
        serialized string
    """
    assert len(name) > 0, 'data name cannot be empty.'

    return name + '/' + get_type_str(dtype)


def _deserialize_shared_tensor(self, data):
"""Deserialize shared tensor information sent from server
Expand Down
27 changes: 26 additions & 1 deletion python/dgl/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ class KVMsgType(Enum):
IP_ID = 7


KVStoreMsg = namedtuple("KVStoreMsg", "type rank name id data c_ptr")
KVStoreMsg = namedtuple("KVStoreMsg", "type rank name id data shape c_ptr")
"""Message of DGL kvstore
Data Field
Expand Down Expand Up @@ -234,6 +234,15 @@ def _send_kv_msg(sender, msg, recv_id):
msg.rank,
msg.name,
tensor_id)
elif msg.type == KVMsgType.INIT:
tensor_shape = F.zerocopy_to_dgl_ndarray(msg.shape)
_CAPI_SenderSendKVMsg(
sender,
int(recv_id),
msg.type.value,
msg.rank,
msg.name,
tensor_shape)
elif msg.type == KVMsgType.IP_ID:
_CAPI_SenderSendKVMsg(
sender,
Expand Down Expand Up @@ -284,6 +293,19 @@ def _recv_kv_msg(receiver):
name=name,
id=tensor_id,
data=None,
shape=None,
c_ptr=msg_ptr)
return msg
elif msg_type == KVMsgType.INIT:
name = _CAPI_ReceiverGetKVMsgName(msg_ptr)
tensor_shape = F.zerocopy_from_dgl_ndarray(_CAPI_ReceiverGetKVMsgShape(msg_ptr))
msg = KVStoreMsg(
type=msg_type,
rank=rank,
name=name,
id=None,
data=None,
shape=tensor_shape,
c_ptr=msg_ptr)
return msg
elif msg_type == KVMsgType.IP_ID:
Expand All @@ -294,6 +316,7 @@ def _recv_kv_msg(receiver):
name=name,
id=None,
data=None,
shape=None,
c_ptr=msg_ptr)
return msg
elif msg_type in (KVMsgType.FINAL, KVMsgType.BARRIER):
Expand All @@ -303,6 +326,7 @@ def _recv_kv_msg(receiver):
name=None,
id=None,
data=None,
shape=None,
c_ptr=msg_ptr)
return msg
else:
Expand All @@ -315,6 +339,7 @@ def _recv_kv_msg(receiver):
name=name,
id=tensor_id,
data=data,
shape=None,
c_ptr=msg_ptr)
return msg

Expand Down
Loading

0 comments on commit 27520bc

Please sign in to comment.