Skip to content

Commit

Permalink
[Feature] Random walk traces generation (dmlc#392)
Browse files Browse the repository at this point in the history
* random walk traces generation

* remove outdated comments

* oops put in the wrong place

* explicit inline

* moving rand_r to util

* moving random walk to public function

* per-thread seed and openmp support

* type cast styles
  • Loading branch information
BarclayII authored Feb 20, 2019
1 parent f370e62 commit 2e1cbd5
Show file tree
Hide file tree
Showing 9 changed files with 148 additions and 7 deletions.
8 changes: 8 additions & 0 deletions include/dgl/runtime/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@

#include "c_runtime_api.h"

#ifdef _MSC_VER
// rand in MS compiler works well in multi-threading.
static inline int rand_r(unsigned *seed) {
return rand();
}
#define _CRT_RAND_S
#endif

namespace dgl {
namespace runtime {

Expand Down
13 changes: 13 additions & 0 deletions include/dgl/sampler.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,19 @@ class SamplerOp {
static NodeFlow NeighborUniformSample(const ImmutableGraph *graph, IdArray seeds,
const std::string &edge_type,
int num_hops, int expand_factor);

/*!
* \brief Batch-generate random walk traces
* \param seeds The array of starting vertex IDs
* \param num_traces The number of traces to generate for each seed
* \param num_hops The number of hops for each trace
* \return a flat ID array with shape (num_seeds, num_traces, num_hops + 1)
*/
// TODO: move to sampler.cc
static IdArray RandomWalk(const GraphInterface *gptr,
IdArray seeds,
int num_traces,
int num_hops);
};

} // namespace dgl
Expand Down
1 change: 1 addition & 0 deletions python/dgl/contrib/sampling/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .sampler import NeighborSampler
from .randomwalk import *
31 changes: 31 additions & 0 deletions python/dgl/contrib/sampling/randomwalk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@

from ... import utils
from ... import backend as F

__all__ = ['random_walk']


def random_walk(g, seeds, num_traces, num_hops):
"""Batch-generate random walk traces on given graph with the same length.
Parameters
----------
g : DGLGraph
The graph. Must be readonly.
seeds : Tensor
The node ID tensor from which the random walk traces starts.
num_traces : int
Number of traces to generate for each seed.
num_hops : int
Number of hops for each trace.
Returns
-------
traces : Tensor
A 3-dimensional node ID tensor with shape
(num_seeds, num_traces, num_hops + 1)
traces[i, j, 0] are always starting nodes (i.e. seed[i]).
"""
return g._graph.random_walk(utils.toindex(seeds), num_traces, num_hops)
14 changes: 14 additions & 0 deletions python/dgl/graph_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -680,6 +680,20 @@ def neighbor_sampling(self, seed_ids, expand_factor, num_hops, neighbor_type, no
utils.toindex(rst(num_subgs * 3 + i)),
utils.toindex(rst(num_subgs * 4 + i))) for i in range(num_subgs)]

def random_walk(self, seeds, num_traces, num_hops):
"""Random walk sampling.
Returns a user Tensor of random walk traces with shape
(num_seeds, num_traces, num_hops + 1)
"""
if len(seeds) == 0:
return utils.toindex([])

seeds = seeds.todgltensor()
traces = _CAPI_DGLGraphRandomWalk(self._handle, seeds, num_traces, num_hops)

return F.zerocopy_from_dlpack(traces.to_dlpack())

def to_networkx(self):
"""Convert to networkx graph.
Expand Down
11 changes: 11 additions & 0 deletions src/graph/graph_apis.cc
Original file line number Diff line number Diff line change
Expand Up @@ -483,4 +483,15 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphGetAdj")
*rv = ConvertAdjToPackedFunc(res);
});

DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphRandomWalk")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
const IdArray seeds = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
const int num_traces = args[2];
const int num_hops = args[3];
const GraphInterface *ptr = static_cast<const GraphInterface *>(ghandle);

*rv = SamplerOp::RandomWalk(ptr, seeds, num_traces, num_hops);
});

} // namespace dgl
2 changes: 0 additions & 2 deletions src/graph/immutable_graph.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
*/

#include <dgl/immutable_graph.h>
#include <cstdlib>
#include <cmath>

#ifdef _MSC_VER
#define _CRT_RAND_S
Expand Down
54 changes: 49 additions & 5 deletions src/graph/sampler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
#include <dgl/sampler.h>
#include <dgl/immutable_graph.h>
#include <algorithm>
#include <cstdlib>
#include <cmath>
#include <omp.h>

#ifdef _MSC_VER
// rand in MS compiler works well in multi-threading.
Expand Down Expand Up @@ -305,7 +308,7 @@ NodeFlow ConstructNodeFlow(std::vector<dgl_id_t> neighbor_list,
}
layer_off_data[0] = 0;
layer_off_data[1] = layer_offsets[num_hops] - layer_offsets[num_hops - 1];
size_t out_layer_idx = 1;
int out_layer_idx = 1;
for (int layer_id = num_hops - 2; layer_id >= 0; layer_id--) {
std::sort(neigh_pos->begin() + layer_offsets[layer_id],
neigh_pos->begin() + layer_offsets[layer_id + 1],
Expand Down Expand Up @@ -345,14 +348,14 @@ NodeFlow ConstructNodeFlow(std::vector<dgl_id_t> neighbor_list,

// Copy flow offsets.
flow_off_data[0] = 0;
size_t out_flow_idx = 0;
for (int i = 0; i < layer_offsets.size() - 2; i++) {
int out_flow_idx = 0;
for (size_t i = 0; i < layer_offsets.size() - 2; i++) {
size_t num_edges = subg_csr->GetDegree(layer_off_data[i + 1], layer_off_data[i + 2]);
flow_off_data[out_flow_idx + 1] = flow_off_data[out_flow_idx] + num_edges;
out_flow_idx++;
}
CHECK(out_flow_idx == num_hops - 1);
CHECK(flow_off_data[num_hops - 1] == num_edges);
CHECK(flow_off_data[num_hops - 1] == static_cast<uint64_t>(num_edges));

for (size_t i = 0; i < subg_csr->edge_ids.size(); i++) {
subg_csr->edge_ids[i] = i;
Expand Down Expand Up @@ -404,7 +407,7 @@ NodeFlow SampleSubgraph(const ImmutableGraph *graph,

layer_offsets[0] = 0;
layer_offsets[1] = sub_vers.size();
for (size_t layer_id = 1; layer_id < num_hops; layer_id++) {
for (int layer_id = 1; layer_id < num_hops; layer_id++) {
// We need to avoid resampling the same node in a layer, but we allow a node
// to be resampled in multiple layers. We use `sub_ver_map` to keep track of
// sampled nodes in a layer, and clear it when entering a new layer.
Expand Down Expand Up @@ -479,4 +482,45 @@ NodeFlow SamplerOp::NeighborUniformSample(const ImmutableGraph *graph, IdArray s
expand_factor);
}

IdArray SamplerOp::RandomWalk(
const GraphInterface *gptr,
IdArray seeds,
int num_traces,
int num_hops) {
const int num_nodes = seeds->shape[0];
const dgl_id_t *seed_ids = static_cast<dgl_id_t *>(seeds->data);
IdArray traces = IdArray::Empty(
{num_nodes, num_traces, num_hops + 1},
DLDataType{kDLInt, 64, 1},
DLContext{kDLCPU, 0});
dgl_id_t *trace_data = static_cast<dgl_id_t *>(traces->data);

#pragma omp parallel
{
// get per-thread seed
unsigned int random_seed = time(nullptr) ^ omp_get_thread_num();

#pragma omp for
for (int i = 0; i < num_nodes; ++i) {
const dgl_id_t seed_id = seed_ids[i];

for (int j = 0; j < num_traces; ++j) {
dgl_id_t cur = seed_id;
const int kmax = num_hops + 1;

for (int k = 0; k < kmax; ++k) {
const size_t offset = ((size_t)i * num_traces + j) * kmax + k;
trace_data[offset] = cur;

const auto succ = gptr->SuccVec(cur);
const size_t size = succ.size();
cur = succ[rand_r(&random_seed) % size];
}
}
}
}

return traces;
}

} // namespace dgl
21 changes: 21 additions & 0 deletions tests/compute/test_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,29 @@ def test_10neighbor_sampler():
check_10neighbor_sampler(g, seeds=np.unique(np.random.randint(0, g.number_of_nodes(),
size=int(g.number_of_nodes() / 10))))

def test_random_walk():
edge_list = [(0, 1), (1, 2), (2, 3), (3, 4),
(4, 3), (3, 2), (2, 1), (1, 0)]
seeds = [0, 1]
n_traces = 3
n_hops = 4

g = dgl.DGLGraph(edge_list, readonly=True)
traces = dgl.contrib.sampling.random_walk(g, seeds, n_traces, n_hops)
traces = F.zerocopy_to_numpy(traces)

assert traces.shape == (len(seeds), n_traces, n_hops + 1)

for i, seed in enumerate(seeds):
assert (traces[i, :, 0] == seeds[i]).all()

trace_diff = np.diff(traces, axis=-1)
# only nodes with adjacent IDs are connected
assert (np.abs(trace_diff) == 1).all()

if __name__ == '__main__':
test_1neighbor_sampler_all()
test_10neighbor_sampler_all()
test_1neighbor_sampler()
test_10neighbor_sampler()
test_random_walk()

0 comments on commit 2e1cbd5

Please sign in to comment.