Add API benchmarks (dmlc#2522)

* add bench jenkins * instance type * fix * fix * fix * 111 * test * 111 * 111 * fix * test * run * fix * fix * fix * fix * fix * publish results * 111 * regression * launch ec2 script * fix * add * run on master * change * rrr * run gpu * fix * fix * try fix * fix * ff * fix * fix * fix * refactor * fix * fix * update * fix * fix * fix * fix * remove import torchtext * add shm size * update * fix * fix * fix * fix * fix this!!!! * 111 * fix * remove verbose * fix * fix * fix * fix * fix * fix * fix * fix * update readme * fix * fix * fix * change asv default to head * commit sage and rgcn * fix * update * add benchmarks * add * fix * update * remove RandomState * tmp remove Co-authored-by: Minjie Wang <[email protected]>
zeta1999 · Jan 14, 2021 · c29daae · c29daae
1 parent 0778766
commit c29daae
Show file tree

Hide file tree

Showing 6 changed files with 250 additions and 11 deletions.
diff --git a/benchmarks/benchmarks/api/bench_edge_ids.py b/benchmarks/benchmarks/api/bench_edge_ids.py
@@ -0,0 +1,37 @@
+import time
+import dgl
+import torch
+import numpy as np
+
+from .. import utils
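+# NOTE: edge_ids is reportedly not supported on CUDA, but the skip decorator
+# below is commented out, so the GPU parametrization of this benchmark is
+# still active.
+# @utils.skip_if_gpu()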
+@utils.benchmark('time', timeout=1200)
+@utils.parametrize_cpu('graph_name', ['cora', 'livejournal', 'friendster'])
+@utils.parametrize_gpu('graph_name', ['cora', 'livejournal'])
+@utils.parametrize('format', ['csr'])  # csr/csc is not supported
+@utils.parametrize('fraction', [0.01, 0.1])
+@utils.parametrize('return_uv', [True, False])
+def track_time(graph_name, format, fraction, return_uv):
+    device = utils.get_bench_device()
+    graph = utils.get_graph(graph_name, format)
+    coo_graph = utils.get_graph(graph_name, 'coo')
+    graph = graph.to(device)
+    eids = np.random.choice(
+        np.arange(graph.num_edges(), dtype=np.int64), int(graph.num_edges()*fraction))
+    eids = torch.tensor(eids, device="cpu", dtype=torch.int64)
+    u, v = coo_graph.find_edges(eids)
+    del coo_graph, eids
+    u = u.to(device)
+    v = v.to(device)
+    # dry run
+    for i in range(10):
+        out = graph.edge_ids(u[0], v[0])
+
+    # timing
+    t0 = time.time()
+    for i in range(10):
+        edges = graph.edge_ids(u, v, return_uv=return_uv)
+    t1 = time.time()
+
+    return (t1 - t0) / 10
diff --git a/benchmarks/benchmarks/api/bench_find_edges.py b/benchmarks/benchmarks/api/bench_find_edges.py
@@ -0,0 +1,33 @@
+import time
+import dgl
+import torch
+import numpy as np
+
+from .. import utils
+
+
+@utils.benchmark('time', timeout=1200)
+@utils.parametrize_cpu('graph_name', ['cora', 'livejournal', 'friendster'])
+@utils.parametrize_gpu('graph_name', ['cora', 'livejournal'])
+@utils.parametrize('format', ['coo'])  # csc is not supported
+@utils.parametrize('fraction', [0.01, 0.1])
+def track_time(graph_name, format, fraction):
+    device = utils.get_bench_device()
+    graph = utils.get_graph(graph_name, format)
+    graph = graph.to(device)
+    eids = np.random.choice(
+        np.arange(graph.num_edges(), dtype=np.int64), int(graph.num_edges()*fraction))
+    eids = torch.tensor(eids, device=device, dtype=torch.int64)
+    # dry run
+    for i in range(10):
+        out = graph.find_edges(i)
+        out = graph.find_edges(torch.arange(
+            i*10, dtype=torch.int64, device=device))
+
+    # timing
+    t0 = time.time()
+    for i in range(10):
+        edges = graph.find_edges(eids)
+    t1 = time.time()
+
+    return (t1 - t0) / 10
diff --git a/benchmarks/benchmarks/api/bench_format_conversion.py b/benchmarks/benchmarks/api/bench_format_conversion.py
@@ -0,0 +1,31 @@
+import time
+import dgl
+import torch
+import numpy as np
+
+from .. import utils
+
+
+@utils.benchmark('time', timeout=1200)
+@utils.parametrize_cpu('graph_name', ['cora', 'livejournal', 'friendster'])
+@utils.parametrize_gpu('graph_name', ['cora', 'livejournal'])
+@utils.parametrize('format',
+                   [('coo', 'csc'), ('csc', 'coo'),
+                    ('coo', 'csr'), ('csr', 'coo'),
+                    ('csr', 'csc'), ('csc', 'csr')])
+def track_time(graph_name, format):
+    from_format, to_format = format
+    device = utils.get_bench_device()
+    graph = utils.get_graph(graph_name, from_format)
+    graph = graph.to(device)
+    graph = graph.formats([from_format])
+    # dry run
+    graph.formats([to_format])
+
+    # timing
+    t0 = time.time()
+    for i in range(10):
+        gg = graph.formats([to_format])
+    t1 = time.time()
+
+    return (t1 - t0) / 10
diff --git a/benchmarks/benchmarks/api/bench_in_degrees.py b/benchmarks/benchmarks/api/bench_in_degrees.py
@@ -0,0 +1,34 @@
+import time
+import dgl
+import torch
+import numpy as np
+
+from .. import utils
+
+
+@utils.benchmark('time', timeout=1200)
+@utils.parametrize_cpu('graph_name', ['cora', 'livejournal', 'friendster'])
+@utils.parametrize_gpu('graph_name', ['cora', 'livejournal'])
+# in_degrees on coo is not supported on cuda
+@utils.parametrize_cpu('format', ['coo', 'csc'])
+@utils.parametrize_gpu('format', ['csc'])
+@utils.parametrize('fraction', [0.01, 0.1])
+def track_time(graph_name, format, fraction):
+    device = utils.get_bench_device()
+    graph = utils.get_graph(graph_name, format)
+    graph = graph.to(device)
+    nids = np.random.choice(
+        np.arange(graph.num_nodes(), dtype=np.int64), int(graph.num_nodes()*fraction))
+    nids = torch.tensor(nids, device=device, dtype=torch.int64)
+
+    # dry run
+    for i in range(10):
+        out = graph.in_degrees(i)
+
+    # timing
+    t0 = time.time()
+    for i in range(10):
+        edges = graph.in_degrees(nids)
+    t1 = time.time()
+
+    return (t1 - t0) / 10
diff --git a/benchmarks/benchmarks/api/bench_in_edges.py b/benchmarks/benchmarks/api/bench_in_edges.py
@@ -0,0 +1,34 @@
+import time
+import dgl
+import torch
+import numpy as np
+
+from .. import utils
+
+
+@utils.benchmark('time', timeout=1200)
+@utils.parametrize_cpu('graph_name', ['cora', 'livejournal', 'friendster'])
+@utils.parametrize_gpu('graph_name', ['cora', 'livejournal'])
+# in_edges on coo is not supported on cuda
+@utils.parametrize_cpu('format', ['coo', 'csc'])
+@utils.parametrize_gpu('format', ['csc'])
+@utils.parametrize('fraction', [0.01, 0.1])
+def track_time(graph_name, format, fraction):
+    device = utils.get_bench_device()
+    graph = utils.get_graph(graph_name, format)
+    graph = graph.to(device)
+    nids = np.random.choice(
+        np.arange(graph.num_nodes(), dtype=np.int64), int(graph.num_nodes()*fraction))
+    nids = torch.tensor(nids, device=device, dtype=torch.int64)
+
+    # dry run
+    for i in range(10):
+        out = graph.in_edges(i)
+
+    # timing
+    t0 = time.time()
+    for i in range(10):
+        edges = graph.in_edges(nids)
+    t1 = time.time()
+
+    return (t1 - t0) / 10
diff --git a/benchmarks/benchmarks/utils.py b/benchmarks/benchmarks/utils.py
@@ -26,36 +26,70 @@ def _download(url, path, filename):
     print('Download finished.')
 
 
+def get_graph(name, format):
+    g = None
+    if name == 'cora':
+        g = dgl.data.CoraGraphDataset()[0]
+    elif name == 'livejournal':
+        bin_path = "/tmp/dataset/livejournal/livejournal_{}.bin".format(format)
+        if os.path.exists(bin_path):
+            g_list, _ = dgl.load_graphs(bin_path)
+            g = g_list[0]
+        else:
+            g = get_livejournal().formats([format])
+            dgl.save_graphs(bin_path, [g])
+    elif name == "friendster":
+        bin_path = "/tmp/dataset/friendster/friendster_{}.bin".format(format)
+        if os.path.exists(bin_path):
+            g_list, _ = dgl.load_graphs(bin_path)
+            g = g_list[0]
+        else:
+            g = get_friendster().formats([format])
+            dgl.save_graphs(bin_path, [g])
+    elif name == "reddit":
+        bin_path = "/tmp/dataset/reddit/reddit_{}.bin".format(format)
+        if os.path.exists(bin_path):
+            g_list, _ = dgl.load_graphs(bin_path)
+            g = g_list[0]
+        else:
+            g = dgl.data.RedditDataset(self_loop=True)[0].formats([format])
+            dgl.save_graphs(bin_path, [g])
+    else:
+        raise Exception("Unknown dataset")
+    g = g.formats([format])
+    return g
+
+
 def get_livejournal():
     # Same as https://snap.stanford.edu/data/soc-LiveJournal1.txt.gz
     _download('https://dgl-asv-data.s3-us-west-2.amazonaws.com/dataset/livejournal/soc-LiveJournal1.txt.gz',
-              '/tmp/dataset', 'soc-LiveJournal1.txt.gz')
-    df = pandas.read_csv('/tmp/dataset/soc-LiveJournal1.txt.gz', sep='\t', skiprows=4, header=None,
+              '/tmp/dataset/livejournal', 'soc-LiveJournal1.txt.gz')
+    df = pandas.read_csv('/tmp/dataset/livejournal/soc-LiveJournal1.txt.gz', sep='\t', skiprows=4, header=None,
                          names=['src', 'dst'], compression='gzip')
     src = df['src'].values
     dst = df['dst'].values
     print('construct the graph')
     return dgl.graph((src, dst))
 
 
-def get_filmbaster():
+def get_friendster():
     # Same as https://snap.stanford.edu/data/bigdata/communities/com-friendster.ungraph.txt.gz
     _download('https://dgl-asv-data.s3-us-west-2.amazonaws.com/dataset/friendster/com-friendster.ungraph.txt.gz',
-              '/tmp/dataset', 'com-friendster.ungraph.txt.gz')
-    df = pandas.read_csv('/tmp/dataset/com-friendster.ungraph.txt.gz', sep='\t', skiprows=4, header=None,
+              '/tmp/dataset/friendster', 'com-friendster.ungraph.txt.gz')
+    df = pandas.read_csv('/tmp/dataset/friendster/com-friendster.ungraph.txt.gz', sep='\t', skiprows=4, header=None,
                          names=['src', 'dst'], compression='gzip')
     src = df['src'].values
     dst = df['dst'].values
     print('construct the graph')
     return dgl.graph((src, dst))
 
 
-def get_graph(name):
-    if name == 'livejournal':
-        return get_livejournal()
-    else:
-        print(name + " doesn't exist")
-        return None
+# def get_graph(name):
+#     if name == 'livejournal':
+#         return get_livejournal()
+#     else:
+#         print(name + " doesn't exist")
+#         return None
 
 
 class OGBDataset(object):
@@ -79,6 +113,7 @@ def predict_category(self):
     def __getitem__(self, idx):
         return self._g
 
+
 def load_ogb_product():
     name = 'ogbn-products'
     from ogb.nodeproppred import DglNodePropPredDataset
@@ -111,6 +146,7 @@ def load_ogb_product():
 
     return OGBDataset(graph, num_labels)
 
+
 def load_ogb_mag():
     name = 'ogbn-mag'
     from ogb.nodeproppred import DglNodePropPredDataset
@@ -146,6 +182,7 @@ def load_ogb_mag():
     num_classes = dataset.num_classes
     return OGBDataset(hg, num_classes, 'paper')
 
+
 class PinsageDataset:
     def __init__(self, g, user_ntype, item_ntype, textset):
         self._g = g
@@ -334,6 +371,14 @@ def _wrapper(func):
     return _wrapper
 
 
+def noop_decorator(param_name, params):
+    """noop decorator
+    """
+    def _wrapper(func):
+        return func
+    return _wrapper
+
+
 class TestFilter:
     def __init__(self):
         self.conf = None
@@ -367,6 +412,31 @@ def check(self, func):
 filter = TestFilter()
 
 
+device = os.environ.get('DGL_BENCH_DEVICE', 'cpu')
+
+if device == "cpu":
+    parametrize_cpu = parametrize
+    parametrize_gpu = noop_decorator
+elif device == "gpu":
+    parametrize_cpu = noop_decorator
+    parametrize_gpu = parametrize
+else:
+    raise Exception("Unknown device")
+
+
+def skip_if_gpu():
+    """skip if DGL_BENCH_DEVICE is gpu
+    """
+    device = os.environ.get('DGL_BENCH_DEVICE', 'cpu')
+
+    def _wrapper(func):
+        if device == "gpu":
+            # skip if not enabled
+            func.benchmark_name = "skip_" + func.__name__
+        return func
+    return _wrapper
+
+
 def benchmark(track_type, timeout=60):
     """Decorator for indicating the benchmark type.