[Feature] k-hop Subgraph Extraction (dmlc#3458)

* Update * Fix * Fix * Update * Update * Update * Fix CI * Fix * Fix * Fix * Update * Update * Update * Fix * Fix * Fix for TF
standbyme · Nov 4, 2021 · f46080a · f46080a
1 parent 64f20ee
commit f46080a
Show file tree

Hide file tree

Showing 3 changed files with 406 additions and 1 deletion.
diff --git a/docs/source/api/python/dgl.rst b/docs/source/api/python/dgl.rst
@@ -45,6 +45,8 @@ Operators for extracting and returning subgraphs.
     edge_type_subgraph
     in_subgraph
     out_subgraph
+    khop_in_subgraph
+    khop_out_subgraph
 
 .. _api-transform:
 

diff --git a/python/dgl/subgraph.py b/python/dgl/subgraph.py
@@ -15,7 +15,7 @@
 from . import utils
 
 __all__ = ['node_subgraph', 'edge_subgraph', 'node_type_subgraph', 'edge_type_subgraph',
-           'in_subgraph', 'out_subgraph']
+           'in_subgraph', 'out_subgraph', 'khop_in_subgraph', 'khop_out_subgraph']
 
 def node_subgraph(graph, nodes, *, relabel_nodes=True, store_ids=True):
     """Return a subgraph induced on the given nodes.
@@ -552,6 +552,271 @@ def out_subgraph(graph, nodes, *, relabel_nodes=False, store_ids=True):
 
 DGLHeteroGraph.out_subgraph = utils.alias_func(out_subgraph)
 
+def khop_in_subgraph(graph, nodes, k, *, relabel_nodes=True, store_ids=True):
+    """Return the subgraph induced by k-hop in-neighborhood of the specified node(s).
+
+    We can expand a set of nodes by including the predecessors of them. From a
+    specified node set, a k-hop in subgraph is obtained by first repeating the node set
+    expansion for k times and then creating a node induced subgraph. In addition to
+    extracting the subgraph, DGL also copies the features of the extracted nodes and
+    edges to the resulting graph. The copy is *lazy* and incurs data movement only
+    when needed.
+
+    If the graph is heterogeneous, DGL extracts a subgraph per relation and composes
+    them as the resulting graph. Thus the resulting graph has the same set of relations
+    as the input one.
+
+    Parameters
+    ----------
+    graph : DGLGraph
+        The input graph.
+    nodes : nodes or dict[str, nodes]
+        The starting node(s) to expand. The allowed formats are:
+
+        * Int: ID of a single node.
+        * Int Tensor: Each element is a node ID. The tensor must have the same device
+          type and ID data type as the graph's.
+        * iterable[int]: Each element is a node ID.
+
+        If the graph is homogeneous, one can directly pass the above formats.
+        Otherwise, the argument must be a dictionary with keys being node types
+        and values being the node IDs in the above formats.
+    k : int
+        The number of hops.
+    relabel_nodes : bool, optional
+        If True, it will remove the isolated nodes and relabel the rest nodes in the
+        extracted subgraph.
+    store_ids : bool, optional
+        If True, it will store the raw IDs of the extracted edges in the ``edata`` of the
+        resulting graph under name ``dgl.EID``; if ``relabel_nodes`` is ``True``, it will
+        also store the raw IDs of the extracted nodes in the ``ndata`` of the resulting
+        graph under name ``dgl.NID``.
+
+    Returns
+    -------
+    G : DGLGraph
+        The subgraph.
+
+    Notes
+    -----
+
+    When k is 1, the result subgraph is different from the one obtained by
+    :func:`dgl.in_subgraph`. The 1-hop in subgraph also includes the edges
+    among the neighborhood.
+
+    Examples
+    --------
+    The following example uses PyTorch backend.
+
+    >>> import dgl
+    >>> import torch
+
+    Extract a two-hop subgraph from a homogeneous graph.
+
+    >>> g = dgl.graph(([1, 1, 2, 3, 4], [0, 2, 0, 4, 2]))
+    >>> g.edata['w'] = torch.arange(10).view(5, 2)
+    >>> sg = dgl.khop_in_subgraph(g, 0, k=2)
+    >>> sg
+    Graph(num_nodes=4, num_edges=4,
+          ndata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}
+          edata_schemes={'w': Scheme(shape=(2,), dtype=torch.int64),
+                         '_ID': Scheme(shape=(), dtype=torch.int64)})
+    >>> sg.edges()
+    (tensor([1, 1, 2, 3]), tensor([0, 2, 0, 2]))
+    >>> sg.edata[dgl.EID]  # original edge IDs
+    tensor([0, 1, 2, 4])
+    >>> sg.edata['w']  # also extract the features
+    tensor([[0, 1],
+            [2, 3],
+            [4, 5],
+            [8, 9]])
+
+    Extract a subgraph from a heterogeneous graph.
+
+    >>> g = dgl.heterograph({
+    ...     ('user', 'plays', 'game'): ([0, 1, 1, 2], [0, 0, 2, 1]),
+    ...     ('user', 'follows', 'user'): ([0, 1, 1], [1, 2, 2])})
+    >>> sg = dgl.khop_in_subgraph(g, {'game': 0}, k=2)
+    >>> sg
+    Graph(num_nodes={'game': 1, 'user': 2},
+          num_edges={('user', 'follows', 'user'): 1, ('user', 'plays', 'game'): 2},
+          metagraph=[('user', 'user', 'follows'), ('user', 'game', 'plays')])
+
+    See also
+    --------
+    khop_out_subgraph
+    """
+    if graph.is_block:
+        raise DGLError('Extracting subgraph of a block graph is not allowed.')
+
+    if not isinstance(nodes, Mapping):
+        assert len(graph.ntypes) == 1, \
+            'need a dict of node type and IDs for graph with multiple node types'
+        nodes = {graph.ntypes[0]: nodes}
+
+    for nty, nty_nodes in nodes.items():
+        nodes[nty] = utils.prepare_tensor(graph, nty_nodes, 'nodes["{}"]'.format(nty))
+
+    last_hop_nodes = nodes
+    k_hop_nodes_ = [last_hop_nodes]
+    place_holder = F.copy_to(F.tensor([], dtype=graph.idtype), graph.device)
+    for _ in range(k):
+        current_hop_nodes = {nty: [] for nty in graph.ntypes}
+        for cetype in graph.canonical_etypes:
+            srctype, _, dsttype = cetype
+            in_nbrs, _ = graph.in_edges(last_hop_nodes.get(dsttype, place_holder), etype=cetype)
+            current_hop_nodes[srctype].append(in_nbrs)
+        for nty in graph.ntypes:
+            if len(current_hop_nodes[nty]) == 0:
+                current_hop_nodes[nty] = place_holder
+                continue
+            current_hop_nodes[nty] = F.unique(F.cat(current_hop_nodes[nty], dim=0))
+        k_hop_nodes_.append(current_hop_nodes)
+        last_hop_nodes = current_hop_nodes
+
+    k_hop_nodes = dict()
+    for nty in graph.ntypes:
+        k_hop_nodes[nty] = F.unique(F.cat([
+            hop_nodes.get(nty, place_holder)
+            for hop_nodes in k_hop_nodes_], dim=0))
+
+    return node_subgraph(graph, k_hop_nodes, relabel_nodes=relabel_nodes, store_ids=store_ids)
+
+DGLHeteroGraph.khop_in_subgraph = utils.alias_func(khop_in_subgraph)
+
+def khop_out_subgraph(graph, nodes, k, *, relabel_nodes=True, store_ids=True):
+    """Return the subgraph induced by k-hop out-neighborhood of the specified node(s).
+
+    We can expand a set of nodes by including the successors of them. From a
+    specified node set, a k-hop out subgraph is obtained by first repeating the node set
+    expansion for k times and then creating a node induced subgraph. In addition to
+    extracting the subgraph, DGL also copies the features of the extracted nodes and
+    edges to the resulting graph. The copy is *lazy* and incurs data movement only
+    when needed.
+
+    If the graph is heterogeneous, DGL extracts a subgraph per relation and composes
+    them as the resulting graph. Thus the resulting graph has the same set of relations
+    as the input one.
+
+    Parameters
+    ----------
+    graph : DGLGraph
+        The input graph.
+    nodes : nodes or dict[str, nodes]
+        The starting node(s) to expand. The allowed formats are:
+
+        * Int: ID of a single node.
+        * Int Tensor: Each element is a node ID. The tensor must have the same device
+          type and ID data type as the graph's.
+        * iterable[int]: Each element is a node ID.
+
+        If the graph is homogeneous, one can directly pass the above formats.
+        Otherwise, the argument must be a dictionary with keys being node types
+        and values being the node IDs in the above formats.
+    k : int
+        The number of hops.
+    relabel_nodes : bool, optional
+        If True, it will remove the isolated nodes and relabel the rest nodes in the
+        extracted subgraph.
+    store_ids : bool, optional
+        If True, it will store the raw IDs of the extracted edges in the ``edata`` of the
+        resulting graph under name ``dgl.EID``; if ``relabel_nodes`` is ``True``, it will
+        also store the raw IDs of the extracted nodes in the ``ndata`` of the resulting
+        graph under name ``dgl.NID``.
+
+    Returns
+    -------
+    G : DGLGraph
+        The subgraph.
+
+    Notes
+    -----
+
+    When k is 1, the result subgraph is different from the one obtained by
+    :func:`dgl.out_subgraph`. The 1-hop out subgraph also includes the edges
+    among the neighborhood.
+
+    Examples
+    --------
+    The following example uses PyTorch backend.
+
+    >>> import dgl
+    >>> import torch
+
+    Extract a two-hop subgraph from a homogeneous graph.
+
+    >>> g = dgl.graph(([0, 2, 0, 4, 2], [1, 1, 2, 3, 4]))
+    >>> g.edata['w'] = torch.arange(10).view(5, 2)
+    >>> sg = dgl.khop_out_subgraph(g, 0, k=2)
+    >>> sg
+    Graph(num_nodes=4, num_edges=4,
+          ndata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}
+          edata_schemes={'w': Scheme(shape=(2,), dtype=torch.int64),
+                         '_ID': Scheme(shape=(), dtype=torch.int64)})
+    >>> sg.edges()
+    (tensor([0, 0, 2, 2]), tensor([1, 2, 1, 3]))
+    >>> sg.edata[dgl.EID]  # original edge IDs
+    tensor([0, 2, 1, 4])
+    >>> sg.edata['w']  # also extract the features
+    tensor([[0, 1],
+            [4, 5],
+            [2, 3],
+            [8, 9]])
+
+    Extract a subgraph from a heterogeneous graph.
+
+    >>> g = dgl.heterograph({
+    ...     ('user', 'plays', 'game'): ([0, 1, 1, 2], [0, 0, 2, 1]),
+    ...     ('user', 'follows', 'user'): ([0, 1], [1, 3])})
+    >>> sg = dgl.khop_out_subgraph(g, {'user': 0}, k=2)
+    >>> sg
+    Graph(num_nodes={'game': 2, 'user': 3},
+          num_edges={('user', 'follows', 'user'): 2, ('user', 'plays', 'game'): 2},
+          metagraph=[('user', 'user', 'follows'), ('user', 'game', 'plays')])
+
+    See also
+    --------
+    khop_in_subgraph
+    """
+    if graph.is_block:
+        raise DGLError('Extracting subgraph of a block graph is not allowed.')
+
+    if not isinstance(nodes, Mapping):
+        assert len(graph.ntypes) == 1, \
+            'need a dict of node type and IDs for graph with multiple node types'
+        nodes = {graph.ntypes[0]: nodes}
+
+    for nty, nty_nodes in nodes.items():
+        nodes[nty] = utils.prepare_tensor(graph, nty_nodes, 'nodes["{}"]'.format(nty))
+
+    last_hop_nodes = nodes
+    k_hop_nodes_ = [last_hop_nodes]
+    place_holder = F.copy_to(F.tensor([], dtype=graph.idtype), graph.device)
+    for _ in range(k):
+        current_hop_nodes = {nty: [] for nty in graph.ntypes}
+        for cetype in graph.canonical_etypes:
+            srctype, _, dsttype = cetype
+            _, out_nbrs = graph.out_edges(last_hop_nodes.get(
+                srctype, place_holder), etype=cetype)
+            current_hop_nodes[dsttype].append(out_nbrs)
+        for nty in graph.ntypes:
+            if len(current_hop_nodes[nty]) == 0:
+                current_hop_nodes[nty] = place_holder
+                continue
+            current_hop_nodes[nty] = F.unique(F.cat(current_hop_nodes[nty], dim=0))
+        k_hop_nodes_.append(current_hop_nodes)
+        last_hop_nodes = current_hop_nodes
+
+    k_hop_nodes = dict()
+    for nty in graph.ntypes:
+        k_hop_nodes[nty] = F.unique(F.cat([
+            hop_nodes.get(nty, place_holder)
+            for hop_nodes in k_hop_nodes_], dim=0))
+
+    return node_subgraph(graph, k_hop_nodes, relabel_nodes=relabel_nodes, store_ids=store_ids)
+
+DGLHeteroGraph.khop_out_subgraph = utils.alias_func(khop_out_subgraph)
+
 def node_type_subgraph(graph, ntypes):
     """Return the subgraph induced on given node types.