From 0fc649523bf56c483f5e3953521f83890ed2b615 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Fri, 19 Feb 2021 10:48:05 -0800
Subject: [PATCH] [Doc] Distributed heterogeneous graph training (#2654)

* update DistGraph docstrings.
* add user guide.
* add doc string.
* fix.
* fix.
* fix.

Co-authored-by: Jinjing Zhou
---
 docs/source/api/python/dgl.distributed.rst  |  4 +-
 docs/source/guide/distributed-hetero.rst    | 93 +++++++++++++++++++
 docs/source/guide/distributed-tools.rst     |  2 +-
 docs/source/guide/distributed.rst           |  2 +
 python/dgl/distributed/dist_tensor.py       | 12 ++-
 .../dgl/distributed/graph_partition_book.py | 60 ++++++++++++
 6 files changed, 165 insertions(+), 8 deletions(-)
 create mode 100644 docs/source/guide/distributed-hetero.rst

diff --git a/docs/source/api/python/dgl.distributed.rst b/docs/source/api/python/dgl.distributed.rst
index b860caab86dd..ec2f6ca7f710 100644
--- a/docs/source/api/python/dgl.distributed.rst
+++ b/docs/source/api/python/dgl.distributed.rst
@@ -17,7 +17,7 @@ Distributed Graph
 -----------------
 .. autoclass:: DistGraph
-    :members: ndata, edata, idtype, device, ntypes, etypes, number_of_nodes, number_of_edges, node_attr_schemes, edge_attr_schemes, rank, find_edges, get_partition_book, barrier, local_partition, num_nodes, num_edges
+    :members: ndata, edata, idtype, device, ntypes, etypes, number_of_nodes, number_of_edges, node_attr_schemes, edge_attr_schemes, rank, find_edges, get_partition_book, barrier, local_partition, num_nodes, num_edges, get_node_partition_policy, get_edge_partition_policy, get_etype_id, get_ntype_id, nodes, edges
 Distributed Tensor
 ------------------
@@ -73,7 +73,7 @@ Graph partition book
 .. currentmodule:: dgl.distributed.graph_partition_book
 .. autoclass:: GraphPartitionBook
-    :members: shared_memory, num_partitions, metadata, nid2partid, eid2partid, partid2nids, partid2eids, nid2localnid, eid2localeid, partid
+    :members: shared_memory, num_partitions, metadata, nid2partid, eid2partid, partid2nids, partid2eids, nid2localnid, eid2localeid, partid, map_to_per_ntype, map_to_per_etype, map_to_homo_nid, map_to_homo_eid
 .. autoclass:: PartitionPolicy
    :members: policy_str, part_id, partition_book, to_local, to_partid, get_part_size, get_size

diff --git a/docs/source/guide/distributed-hetero.rst b/docs/source/guide/distributed-hetero.rst
new file mode 100644
index 000000000000..844ae8c5c107
--- /dev/null
+++ b/docs/source/guide/distributed-hetero.rst
@@ -0,0 +1,93 @@
.. _guide-distributed-hetero:

7.3 Distributed Heterogeneous graph training
--------------------------------------------

DGL v0.6.0 provides experimental support for distributed training on heterogeneous graphs.
In DGL, a node or edge in a heterogeneous graph has a unique ID within its own node type or edge type.
DGL identifies a node or edge with a tuple: node/edge type and type-wise ID. In distributed training,
a node or edge can also be identified by a homogeneous ID, in addition to the tuple of node/edge type
and type-wise ID. The homogeneous ID is unique regardless of the node type and edge type.
DGL arranges nodes and edges so that all nodes of the same type have contiguous
homogeneous IDs.

Below is an example adjacency matrix of a heterogeneous graph showing the homogeneous ID assignment.
Here, the graph has two types of nodes (`T0` and `T1`), and four types of edges (`R0`, `R1`, `R2`, `R3`).
There are a total of 400 nodes in the graph and each type has 200 nodes. Nodes
of `T0` have IDs in [0, 200), while nodes of `T1` have IDs in [200, 400).
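
To make the relation between the two ID schemes concrete, below is a small plain-Python sketch
(not a DGL API); the offsets are assumptions that match this example layout, where `T0` occupies
homogeneous IDs [0, 200) and `T1` occupies [200, 400).

.. code:: python

    # Plain-Python sketch (not a DGL API): how type-wise IDs relate to homogeneous IDs
    # under the assumed layout of this example.
    ntype_offsets = {'T0': 0, 'T1': 200}

    def to_homogeneous_id(ntype, type_wise_id):
        """(node type, type-wise ID) -> homogeneous ID."""
        return ntype_offsets[ntype] + type_wise_id

    def to_type_wise_id(homogeneous_id):
        """Homogeneous ID -> (node type, type-wise ID)."""
        ntype = 'T0' if homogeneous_id < 200 else 'T1'
        return ntype, homogeneous_id - ntype_offsets[ntype]

    assert to_homogeneous_id('T1', 5) == 205
    assert to_type_wise_id(205) == ('T1', 5)
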
In this example, if we use a tuple to identify the nodes, nodes of `T0` are identified as
(T0, type-wise ID), where the type-wise ID falls in [0, 200); nodes of `T1` are identified as
(T1, type-wise ID), where the type-wise ID also falls in [0, 200).

.. figure:: https://data.dgl.ai/tutorial/hetero/heterograph_ids.png
   :alt: Imgur

7.3.1 Access distributed graph data
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

For distributed training, :class:`~dgl.distributed.DistGraph` supports the heterogeneous graph API
in :class:`~dgl.DGLGraph`. Below is an example of getting the node data of some `T0` nodes
by using their type-wise node IDs. When accessing data in :class:`~dgl.distributed.DistGraph`, a user
needs to use type-wise IDs together with the corresponding node types or edge types.

.. code:: python

    import dgl
    g = dgl.distributed.DistGraph('graph_name', part_config='data/graph_name.json')
    # type_wise_ids is a tensor of type-wise IDs of some nodes of type 'T0'.
    feat = g.nodes['T0'].data['feat'][type_wise_ids]

A user can create distributed tensors and distributed embeddings for a particular node type or
edge type. Distributed tensors and embeddings are split and stored on multiple machines. To create
one, a user needs to specify how it is partitioned with :class:`~dgl.distributed.PartitionPolicy`.
By default, DGL chooses the right partition policy based on the size of the first dimension.
However, if multiple node types or edge types have the same number of nodes or edges, DGL cannot
determine the partition policy automatically. In that case, a user needs to specify the partition
policy explicitly. Below is an example of creating a distributed tensor for node type `T0` with the
partition policy of `T0` and storing it as node data of `T0`.

.. code:: python

    import torch as th
    g.nodes['T0'].data['feat1'] = dgl.distributed.DistTensor((g.number_of_nodes('T0'), 1), th.float32, 'feat1',
                                                             part_policy=g.get_node_partition_policy('T0'))

The partition policies used for creating distributed tensors and embeddings are initialized when a heterogeneous
graph is loaded into the graph server. A user cannot create a new partition policy at runtime. Therefore, a user
can only create distributed tensors or embeddings for the node types and edge types that exist in the graph.
Accessing distributed tensors and embeddings also requires type-wise IDs.

7.3.2 Distributed sampling
^^^^^^^^^^^^^^^^^^^^^^^^^^

DGL v0.6 uses homogeneous IDs in distributed sampling. **Note**: this may change in a future release.
DGL provides four APIs to convert node IDs and edge IDs between homogeneous IDs and type-wise IDs:

* :func:`~dgl.distributed.GraphPartitionBook.map_to_per_ntype`: convert a homogeneous node ID to a type-wise ID and node type ID.
* :func:`~dgl.distributed.GraphPartitionBook.map_to_per_etype`: convert a homogeneous edge ID to a type-wise ID and edge type ID.
* :func:`~dgl.distributed.GraphPartitionBook.map_to_homo_nid`: convert a type-wise ID and node type to a homogeneous node ID.
* :func:`~dgl.distributed.GraphPartitionBook.map_to_homo_eid`: convert a type-wise ID and edge type to a homogeneous edge ID.

Below is an example of sampling a subgraph with :func:`~dgl.distributed.sample_neighbors` from a heterogeneous graph
with a node type called `paper`. It first converts type-wise node IDs to homogeneous node IDs. After sampling a subgraph
from the seed nodes, it converts homogeneous node IDs and edge IDs back to type-wise IDs and also stores the type IDs
as node data and edge data.

.. code:: python

    gpb = g.get_partition_book()
    # We need to map the type-wise node IDs to homogeneous IDs.
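    # (`seeds` and `fanout` are assumed to be defined beforehand: `seeds` holds the
    # type-wise IDs of the seed 'paper' nodes, e.g. th.tensor([0, 1, 2]), and
    # `fanout` is the number of neighbors to sample per node, e.g. 10.)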
    cur = gpb.map_to_homo_nid(seeds, 'paper')
    # For a heterogeneous input graph, the returned frontier is stored in
    # the homogeneous graph format.
    frontier = dgl.distributed.sample_neighbors(g, cur, fanout, replace=False)
    block = dgl.to_block(frontier, cur)
    cur = block.srcdata[dgl.NID]

    block.edata[dgl.EID] = frontier.edata[dgl.EID]
    # Map the homogeneous edge IDs to their edge types and type-wise IDs.
    block.edata[dgl.ETYPE], block.edata[dgl.EID] = gpb.map_to_per_etype(block.edata[dgl.EID])
    # Map the homogeneous node IDs to their node types and type-wise IDs.
    block.srcdata[dgl.NTYPE], block.srcdata[dgl.NID] = gpb.map_to_per_ntype(block.srcdata[dgl.NID])
    block.dstdata[dgl.NTYPE], block.dstdata[dgl.NID] = gpb.map_to_per_ntype(block.dstdata[dgl.NID])

From a node/edge type ID, a user can retrieve the node/edge type, for example, `g.ntypes[node_type_id]`.
With node/edge types and type-wise IDs, a user can retrieve node/edge data from `DistGraph` for mini-batch computation.

diff --git a/docs/source/guide/distributed-tools.rst b/docs/source/guide/distributed-tools.rst
index c52b516eef0d..2b0e9899bc24 100644
--- a/docs/source/guide/distributed-tools.rst
+++ b/docs/source/guide/distributed-tools.rst
@@ -1,6 +1,6 @@
 .. _guide-distributed-tools:
-7.3 Tools for launching distributed training/inference
+7.4 Tools for launching distributed training/inference
 ------------------------------------------------------
 :ref:`(中文版) `

diff --git a/docs/source/guide/distributed.rst b/docs/source/guide/distributed.rst
index cb66ce53a1ec..5cca0818e26b 100644
--- a/docs/source/guide/distributed.rst
+++ b/docs/source/guide/distributed.rst
@@ -98,6 +98,7 @@ the following distributed components:
 * :ref:`guide-distributed-preprocessing`
 * :ref:`guide-distributed-apis`
+* :ref:`guide-distributed-hetero`
 * :ref:`guide-distributed-tools`
 .. toctree::
@@ -107,4 +108,5 @@
     distributed-preprocessing
     distributed-apis
+    distributed-hetero
     distributed-tools

diff --git a/python/dgl/distributed/dist_tensor.py b/python/dgl/distributed/dist_tensor.py
index 5492a8f974af..f485395677e8 100644
--- a/python/dgl/distributed/dist_tensor.py
+++ b/python/dgl/distributed/dist_tensor.py
@@ -27,11 +27,13 @@ class DistTensor:
 graph. Therefore, their first dimensions have to be the number of nodes or edges in the graph.
 The tensors are sharded in the first dimension based on the partition policy of nodes or edges.
 When a distributed tensor is created, the partition policy is automatically
-    determined based on the first dimension if the partition policy is not provided: if the first
-    dimension matches the number of nodes, ``DistTensor`` will use the node partition policy;
-    if the first dimension matches the number of edges, ``DistTensor`` wll use the edge partition
-    policy. To determine the partition policy automatically, a DistGraph object has to be created.
-    Users can overwrite the rule by providing a partition policy directly.
+    determined based on the first dimension if the partition policy is not provided. If the first
+    dimension matches the number of nodes of a node type, ``DistTensor`` will use the partition
+    policy for this particular node type; if the first dimension matches the number of edges of
+    an edge type, ``DistTensor`` will use the partition policy for this particular edge type.
+    If DGL cannot determine the partition policy automatically (e.g., multiple node types or
+    edge types have the same number of nodes or edges), users have to explicitly provide
+    the partition policy.
A distributed tensor can be ether named or anonymous. When a distributed tensor has a name, the tensor can be persistent if ``persistent=True``. diff --git a/python/dgl/distributed/graph_partition_book.py b/python/dgl/distributed/graph_partition_book.py index 759d76cf4aeb..db9d0d9088f1 100644 --- a/python/dgl/distributed/graph_partition_book.py +++ b/python/dgl/distributed/graph_partition_book.py @@ -334,6 +334,66 @@ def etypes(self): """Get the list of edge types """ + def map_to_per_ntype(self, ids): + """Map homogeneous node IDs to type-wise IDs and node types. + + Parameters + ---------- + ids : tensor + Homogeneous node IDs. + + Returns + ------- + (tensor, tensor) + node type IDs and type-wise node IDs. + """ + + def map_to_per_etype(self, ids): + """Map homogeneous edge IDs to type-wise IDs and edge types. + + Parameters + ---------- + ids : tensor + Homogeneous edge IDs. + + Returns + ------- + (tensor, tensor) + edge type IDs and type-wise edge IDs. + """ + + def map_to_homo_nid(self, ids, ntype): + """Map type-wise node IDs and type IDs to homogeneous node IDs. + + Parameters + ---------- + ids : tensor + Type-wise node Ids + ntype : str + node type + + Returns + ------- + Tensor + Homogeneous node IDs. + """ + + def map_to_homo_eid(self, ids, etype): + """Map type-wise edge IDs and type IDs to homogeneous edge IDs. + + Parameters + ---------- + ids : tensor + Type-wise edge Ids + etype : str + edge type + + Returns + ------- + Tensor + Homogeneous edge IDs. + """ + class BasicPartitionBook(GraphPartitionBook): """This provides the most flexible way to store parition information.