[Feature] Biased Neighbor Sampling (dmlc#2987)

* update * update * update * update * lint * lint * update * update * update * update * update * update * update * update * update * update * update * update * update * update * lint * update * clone * update * update * update * update * replace idarray with ndarray * refactor cpp part * refactor python part * debug * refactor interface * test and doc * lint and test * lint * fix * fix * fix * const * doc * fix * fix * fix * fix * fix & doc * fix * fix * update * update * update * merge * doc * doc * lint * fix * more tests * doc * fix * fix * update * update * update * fix * fix Co-authored-by: Minjie Wang <[email protected]>
lizeyan · Jun 23, 2021 · e56bbaf · e56bbaf
1 parent 7415eaa
commit e56bbaf
Show file tree

Hide file tree

Showing 9 changed files with 572 additions and 1 deletion.
diff --git a/include/dgl/aten/csr.h b/include/dgl/aten/csr.h
@@ -430,6 +430,71 @@ COOMatrix CSRRowWiseTopk(
     FloatArray weight,
     bool ascending = false);
 
+
+
+/*!
+ * \brief Randomly select a fixed number of non-zero entries along each given row independently,
+ *        where the probability of columns to be picked can be biased according to its tag.
+ *
+ * Each column is assigned an integer tag which determines its probability to be sampled.
+ * Users can assign different probability to different tags.
+ *
+ * This function only works with a CSR matrix sorted according to the tag so that entries with
+ * the same column tag are arranged in a consecutive range, and the input `tag_offset` represents
+ * the boundaries of these ranges. However, the function itself will not check if the input matrix
+ * has been sorted. It's the caller's responsibility to ensure the input matrix has been sorted
+ * by `CSRSortByTag` (it will also return a NDArray `tag_offset` which should be used as an input
+ * of this function).
+ *
+ * The picked indices are returned in the form of a COO matrix.
+ *
+ * If replace is false and a row has fewer non-zero values than num_samples,
+ * all the values are picked.
+ *
+ * Examples:
+ *
+ * // csr.num_rows = 4;
+ * // csr.num_cols = 4;
+ * // csr.indptr = [0, 2, 4, 5, 5]
+ * // csr.indices =                [1, 2, 2, 3, 3]
+ * // tag of each element's column: 0, 0, 0, 1, 1
+ * // tag_offset = [[0, 2, 2], [0, 1, 2], [0, 0, 1]]
+ * // csr.data = [2, 3, 0, 1, 4]
+ * // bias = [1.0, 0.0]
+ * CSRMatrix mat = ...;
+ * IdArray rows = ...; //[0, 1]
+ * NDArray tag_offset = ...;
+ * FloatArray bias = ...;
+ * COOMatrix sampled = CSRRowWiseSamplingBiased(mat, rows, 1, bias);
+ * // possible sampled coo matrix:
+ * // sampled.num_rows = 4
+ * // sampled.num_cols = 4
+ * // sampled.rows = [0, 1]
+ * // sampled.cols = [1, 2]
+ * // sampled.data = [2, 0]
+ * // Note that in this case, for row 1, the column 3 will never be picked as it has tag 1 and the
+ * // probability of tag 1 is 0.
+ *
+ *
+ * \param mat Input CSR matrix.
+ * \param rows Rows to sample from.
+ * \param num_samples Number of samples.
+ * \param tag_offset The boundaries of tags. Should be of the shape [num_row, num_tags+1]
+ * \param bias Unnormalized probability array. Should be of length num_tags
+ * \param replace True if sample with replacement
+ * \return A COOMatrix storing the picked row and col indices. Its data field stores the
+ *         the index of the picked elements in the value array.
+ *
+ */
+COOMatrix CSRRowWiseSamplingBiased(
+    CSRMatrix mat,
+    IdArray rows,
+    int64_t num_samples,
+    NDArray tag_offset,
+    FloatArray bias,
+    bool replace = true
+);
+
 /*!
  * \brief Sort the column index according to the tag of each column.
  *

diff --git a/include/dgl/sampling/neighbor.h b/include/dgl/sampling/neighbor.h
@@ -69,6 +69,15 @@ HeteroSubgraph SampleNeighborsTopk(
     const std::vector<FloatArray>& weight,
     bool ascending = false);
 
+HeteroSubgraph SampleNeighborsBiased(
+    const HeteroGraphPtr hg,
+    const IdArray& nodes,
+    const int64_t fanouts,
+    const NDArray& bias,
+    const NDArray& tag_offset,
+    const EdgeDir dir,
+    const bool replace
+);
 }  // namespace sampling
 }  // namespace dgl
 

diff --git a/python/dgl/sampling/neighbor.py b/python/dgl/sampling/neighbor.py
@@ -9,6 +9,7 @@
 
 __all__ = [
     'sample_neighbors',
+    'sample_neighbors_biased',
     'select_topk']
 
 def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False,
@@ -179,6 +180,163 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False,
 
     return ret
 
+def sample_neighbors_biased(g, nodes, fanout, bias, edge_dir='in',
+                            tag_offset_name='_TAG_OFFSET', replace=False,
+                            copy_ndata=True, copy_edata=True):
+    """Sample neighboring edges of the given nodes and return the induced subgraph, where each
+       neighbor's probability to be picked is determined by its tag.
+
+    For each node, a number of inbound (or outbound when ``edge_dir == 'out'``) edges
+    will be randomly chosen.  The graph returned will then contain all the nodes in the
+    original graph, but only the sampled edges.
+
+    This version of neighbor sampling can support the scenario where adjacent nodes with different
+    types might have different probability to be picked. Each node is assigned an integer(tag)
+    which represents its type. Tag is an analogue of node type under the framework of homogeneous
+    graphs. Nodes with the same tag share the same probability.
+
+    For example, assume a node has (a+b) neighbors, and a of them have tag 0 while b of them have
+    tag 1. Assume a node of tag 0 has an unnormalized probability p to be picked while a node of
+    tag 1 has q. This function first chooses a tag according to the unnormalized probability
+    distribution (ap, bq), and then run a uniform sampling within the nodes with the chosen tag.
+
+    In order to sample efficiently, we need to first sort the CSR matrix of the graph
+    according to the tag (See `dgl.transform.sort_in_edges` and `dgl.transform.sort_out_edges`
+    for details), which will arrange the neighbors with the same tag in a consecutive range
+    and store the offset of these ranges in a node feature with tag_offset_name as its name.
+
+    Please make sure that the graph has been sorted by the sorting function corresponding to
+    the edge direction ('in' or 'out'). This function itself will not check whether the graph is
+    sorted. Note that the input `tag_offset_name` should be consistent with that in the sorting
+    function.
+
+    Only homogeneous or bipartite graphs are supported. For bipartite graphs, only candidate
+    frontier nodes have tags(source nodes when edge_dir='in' and destination nodes when
+    edge_dir='out'), and the offset of tags should be stored as a node feature of the seed nodes.
+
+    Node/edge features are not preserved. The original IDs of
+    the sampled edges are stored as the `dgl.EID` feature in the returned graph.
+
+    Parameters
+    ----------
+    g : DGLGraph
+        The graph. Must be homogeneous or bipartite (only one edge type). Must be on CPU.
+    nodes : tensor or list
+        Node IDs to sample neighbors from.
+    fanout : int
+        The number of edges to be sampled for each node on each edge type.
+
+        If -1 is given, all the neighboring edges will be selected.
+    bias : tensor or list
+        The (unnormalized) probabilities associated with each tag. Its length should be equal
+        to the number of tags.
+
+        Entries of this array must be non-negative floats, and the sum of the entries must be
+        positive (though they don't have to sum up to one). Otherwise, the result will be
+        undefined.
+    edge_dir : str, optional
+        Determines whether to sample inbound or outbound edges.
+
+        Can take either ``in`` for inbound edges or ``out`` for outbound edges.
+    tag_offset_name : str, optional
+        The name of the node feature storing tag offsets.
+
+        (Default: "_TAG_OFFSET")
+    replace : bool, optional
+        If True, sample with replacement.
+    copy_ndata: bool, optional
+        If True, the node features of the new graph are copied from
+        the original graph. If False, the new graph will not have any
+        node features.
+
+        (Default: True)
+    copy_edata: bool, optional
+        If True, the edge features of the new graph are copied from
+        the original graph.  If False, the new graph will not have any
+        edge features.
+
+        (Default: True)
+
+    Returns
+    -------
+    DGLGraph
+        A sampled subgraph containing only the sampled neighboring edges.  It is on CPU.
+
+    Notes
+    -----
+    If :attr:`copy_ndata` or :attr:`copy_edata` is True, same tensors are used as
+    the node or edge features of the original graph and the new graph.
+    As a result, users should avoid performing in-place operations
+    on the node features of the new graph to avoid feature corruption.
+
+    Examples
+    --------
+    Assume that you have the following graph
+
+    >>> g = dgl.graph(([0, 0, 1, 1, 2, 2], [1, 2, 0, 1, 2, 0]))
+
+    And the tags
+
+    >>> tag = torch.IntTensor([0, 0, 1])
+
+    Sort the graph (necessary!)
+
+    >>> g_sorted = dgl.transform.sort_out_edges(g, tag)
+    >>> g_sorted.ndata['_TAG_OFFSET']
+    tensor([[0, 1, 2],
+            [0, 2, 2],
+            [0, 1, 2]])
+
+    Set the probability of each tag:
+    >>> bias = torch.tensor([1.0, 0.001])
+        # node 2 is almost impossible to be sampled because it has tag 1.
+
+    To sample one out bound edge for node 0 and node 2:
+
+    >>> sg = dgl.sampling.sample_neighbors_biased(g_sorted, [0, 2], 1, bias, edge_dir='out')
+    >>> sg.edges(order='eid')
+    (tensor([0, 2]), tensor([1, 0]))
+    >>> sg.edata[dgl.EID]
+    tensor([0, 5])
+
+    With ``fanout`` greater than the number of actual neighbors and without replacement,
+    DGL will take all neighbors instead:
+
+    >>> sg = dgl.sampling.sample_neighbors_biased(g_sorted, [0, 2], 3, bias, edge_dir='out')
+    >>> sg.edges(order='eid')
+    (tensor([0, 0, 2, 2]), tensor([1, 2, 0, 2]))
+    """
+    if isinstance(nodes, list):
+        nodes = F.tensor(nodes)
+    if isinstance(bias, list):
+        bias = F.tensor(bias)
+
+    nodes_array = F.to_dgl_nd(nodes)
+    bias_array = F.to_dgl_nd(bias)
+    if edge_dir == 'in':
+        tag_offset_array = F.to_dgl_nd(g.dstdata[tag_offset_name])
+    elif edge_dir == 'out':
+        tag_offset_array = F.to_dgl_nd(g.srcdata[tag_offset_name])
+    else:
+        raise DGLError("edge_dir can only be 'in' or 'out'")
+
+    subgidx = _CAPI_DGLSampleNeighborsBiased(g._graph, nodes_array, fanout, bias_array,
+                                             tag_offset_array, edge_dir, replace)
+    induced_edges = subgidx.induced_edges
+    ret = DGLHeteroGraph(subgidx.graph, g.ntypes, g.etypes)
+
+    if copy_ndata:
+        node_frames = utils.extract_node_subframes(g, None)
+        utils.set_new_frames(ret, node_frames=node_frames)
+
+    if copy_edata:
+        edge_frames = utils.extract_edge_subframes(g, induced_edges)
+        utils.set_new_frames(ret, edge_frames=edge_frames)
+
+    ret.edata[EID] = induced_edges[0]
+    return ret
+
+
 def select_topk(g, k, weight, nodes=None, edge_dir='in', ascending=False,
                 copy_ndata=True, copy_edata=True):
     """Select the neighboring edges with k-largest (or k-smallest) weights of the given

diff --git a/src/array/array.cc b/src/array/array.cc
@@ -580,6 +580,23 @@ COOMatrix CSRRowWiseTopk(
   return ret;
 }
 
+COOMatrix CSRRowWiseSamplingBiased(
+    CSRMatrix mat,
+    IdArray rows,
+    int64_t num_samples,
+    NDArray tag_offset,
+    FloatArray bias,
+    bool replace) {
+  COOMatrix ret;
+  ATEN_CSR_SWITCH(mat, XPU, IdType, "CSRRowWiseSamplingBiased", {
+    ATEN_FLOAT_TYPE_SWITCH(bias->dtype, FloatType, "bias", {
+        ret = impl::CSRRowWiseSamplingBiased<XPU, IdType, FloatType>(
+          mat, rows, num_samples, tag_offset, bias, replace);
+    });
+  });
+  return ret;
+}
+
 
 CSRMatrix UnionCsr(const std::vector<CSRMatrix>& csrs) {
   CSRMatrix ret;

diff --git a/src/array/array_op.h b/src/array/array_op.h
@@ -173,6 +173,16 @@ template <DLDeviceType XPU, typename IdType, typename DType>
 COOMatrix CSRRowWiseTopk(
     CSRMatrix mat, IdArray rows, int64_t k, NDArray weight, bool ascending);
 
+template <DLDeviceType XPU, typename IdType, typename FloatType>
+COOMatrix CSRRowWiseSamplingBiased(
+    CSRMatrix mat,
+    IdArray rows,
+    int64_t num_samples,
+    NDArray tag_offset,
+    FloatArray bias,
+    bool replace
+);
+
 // Union CSRMatrixes
 template <DLDeviceType XPU, typename IdType>
 CSRMatrix UnionCsr(const std::vector<CSRMatrix>& csrs);

diff --git a/src/array/cpu/rowwise_sampling.cc b/src/array/cpu/rowwise_sampling.cc
@@ -59,6 +59,24 @@ inline PickFn<IdxType> GetSamplingUniformPickFn(
     };
   return pick_fn;
 }
+
+template <typename IdxType, typename FloatType>
+inline PickFn<IdxType> GetSamplingBiasedPickFn(
+    int64_t num_samples, IdArray split, FloatArray bias, bool replace) {
+  PickFn<IdxType> pick_fn = [num_samples, split, bias, replace]
+    (IdxType rowid, IdxType off, IdxType len,
+     const IdxType* col, const IdxType* data,
+     IdxType* out_idx) {
+    const IdxType *tag_offset = static_cast<IdxType *>(split->data) + rowid * split->shape[1];
+    RandomEngine::ThreadLocal()->BiasedChoice<IdxType, FloatType>(
+            num_samples, tag_offset, bias, out_idx, replace);
+    for (int64_t j = 0; j < num_samples; ++j) {
+      out_idx[j] += off;
+    }
+  };
+  return pick_fn;
+}
+
 }  // namespace
 
 /////////////////////////////// CSR ///////////////////////////////
@@ -92,6 +110,33 @@ template COOMatrix CSRRowWiseSamplingUniform<kDLCPU, int32_t>(
 template COOMatrix CSRRowWiseSamplingUniform<kDLCPU, int64_t>(
     CSRMatrix, IdArray, int64_t, bool);
 
+template <DLDeviceType XPU, typename IdxType, typename FloatType>
+COOMatrix CSRRowWiseSamplingBiased(
+    CSRMatrix mat,
+    IdArray rows,
+    int64_t num_samples,
+    NDArray tag_offset,
+    FloatArray bias,
+    bool replace
+) {
+  auto pick_fn = GetSamplingBiasedPickFn<IdxType, FloatType>(
+      num_samples, tag_offset, bias, replace);
+  return CSRRowWisePick(mat, rows, num_samples, replace, pick_fn);
+}
+
+template COOMatrix CSRRowWiseSamplingBiased<kDLCPU, int32_t, float>(
+  CSRMatrix, IdArray, int64_t, NDArray, FloatArray, bool);
+
+template COOMatrix CSRRowWiseSamplingBiased<kDLCPU, int64_t, float>(
+  CSRMatrix, IdArray, int64_t, NDArray, FloatArray, bool);
+
+template COOMatrix CSRRowWiseSamplingBiased<kDLCPU, int32_t, double>(
+  CSRMatrix, IdArray, int64_t, NDArray, FloatArray, bool);
+
+template COOMatrix CSRRowWiseSamplingBiased<kDLCPU, int64_t, double>(
+  CSRMatrix, IdArray, int64_t, NDArray, FloatArray, bool);
+
+
 /////////////////////////////// COO ///////////////////////////////
 
 template <DLDeviceType XPU, typename IdxType, typename FloatType>