[Test] Performance benchmarks for DGL kernels (dmlc#2582)
* add initial kernel benchmarks

* finished kernel benchmarks

* add desc
jermainewang authored Jan 27, 2021
1 parent 12f6429 commit 362f72c
Showing 6 changed files with 164 additions and 18 deletions.
6 changes: 3 additions & 3 deletions benchmarks/README.md
@@ -23,10 +23,10 @@ to disk. It does not support specifying branches and commits either. They are only
available under ASV's managed environment.**

To change the device for benchmarking, set the `DGL_BENCH_DEVICE` environment variable.
Any valid PyTorch device strings are allowed.
Allowed values are `"cpu"` or `"gpu"`.

```bash
export DGL_BENCH_DEVICE=cuda:0
export DGL_BENCH_DEVICE=gpu
```

To select which benchmark to run, use the `--bench` flag. For example,
@@ -49,7 +49,7 @@ DGL runs all benchmarks automatically in a Docker container. To run benchmarks in d…
* Use the `publish.sh` script. It accepts two arguments, a name specifying the identity of
the test machine and a device name. For example,
```bash
bash publish.sh dev-machine cuda:0
bash publish.sh dev-machine gpu
```

The script will output two folders `results` and `html`. The `html` folder contains the
Empty file.
40 changes: 40 additions & 0 deletions benchmarks/benchmarks/kernel/bench_gsddmm_u_dot_v.py
@@ -0,0 +1,40 @@
import time
import dgl
import torch

from .. import utils

def calc_gflops(graph, feat_size, num_heads, time):
    return round(2 * graph.num_edges() * feat_size / 1000000000 / time, 2)  # count both mul and add

# The benchmarks include multi-head cases.
# Given feat_size = D and num_heads = H, both node feature tensors have shape
# (H, D // H); u_dot_v reduces the last dimension, producing an (H, 1) score
# per edge. The total FLOP count is controlled by feat_size no matter how
# many heads there are.
# If num_heads = 0, it falls back to plain (feat_size,) features with no
# multi-head reshape.
@utils.benchmark('flops', timeout=600)
@utils.parametrize('graph', ['ogbn-arxiv', 'reddit', 'ogbn-proteins'])
@utils.parametrize('feat_size', [4, 32, 256])
@utils.parametrize('num_heads', [0, 1, 4])
def track_flops(graph, feat_size, num_heads):
    device = utils.get_bench_device()
    graph = utils.get_graph(graph, format='coo').to(device)
    if num_heads == 0:
        x = torch.randn(graph.num_nodes(), feat_size, device=device)
    else:
        x = torch.randn(graph.num_nodes(), num_heads, feat_size // num_heads, device=device)

    # dry run
    for i in range(3):
        y = dgl.ops.u_dot_v(graph, x, x)

    # timing
    accum = 0.
    for i in range(10):
        with utils.TorchOpTimer(device) as timer:
            y = dgl.ops.u_dot_v(graph, x, x)
        accum += timer.time

    return calc_gflops(graph, feat_size, num_heads, accum / 10)
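
For context (not part of the commit): `dgl.ops.u_dot_v` is DGL's generalized SDDMM operator that, for every edge `(u, v)`, takes the dot product of the source and destination node features along the last dimension. A minimal sketch of the shapes this benchmark exercises, using a toy graph in place of the OGB datasets (assumes DGL >= 0.5, where the `dgl.ops` namespace exists):

```python
import dgl
import torch

# Toy graph with 3 nodes and 3 edges: 0->1, 1->2, 2->0.
g = dgl.graph(([0, 1, 2], [1, 2, 0]))

# Multi-head node features of shape (N, H, D // H), here H = 4 and D = 32.
x = torch.randn(3, 4, 8)

# For each edge (u, v), dot x[u] and x[v] along the last dimension.
y = dgl.ops.u_dot_v(g, x, x)
print(y.shape)  # torch.Size([3, 4, 1]) -- one score per edge per head
```

Each dot product costs one multiply and one add per feature element, so the operator performs roughly `2 * num_edges * feat_size` floating-point operations, which is the numerator used by `calc_gflops` above.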
37 changes: 37 additions & 0 deletions benchmarks/benchmarks/kernel/bench_gspmm_copy_u.py
@@ -0,0 +1,37 @@
import time
import dgl
import torch

from .. import utils

def calc_gflops(graph, feat_size, time):
    return round(graph.num_edges() * feat_size / 1000000000 / time, 2)

@utils.benchmark('flops', timeout=600)
@utils.parametrize('graph', ['ogbn-arxiv', 'reddit', 'ogbn-proteins'])
@utils.parametrize('feat_size', [4, 32, 256])
@utils.parametrize('reducer', ['sum', 'max'])
def track_flops(graph, feat_size, reducer):
    device = utils.get_bench_device()
    graph = utils.get_graph(graph, format='csc').to(device)
    x = torch.randn(graph.num_nodes(), feat_size, device=device)

    if reducer == 'sum':
        op = dgl.ops.copy_u_sum
    elif reducer == 'max':
        op = dgl.ops.copy_u_max
    else:
        raise ValueError('Invalid reducer', reducer)

    # dry run
    for i in range(3):
        y = op(graph, x)

    # timing
    accum = 0.
    for i in range(10):
        with utils.TorchOpTimer(device) as timer:
            y = op(graph, x)
        accum += timer.time

    return calc_gflops(graph, feat_size, accum / 10)
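
For context (not part of the commit): `copy_u_sum` is the SpMM-style operator that copies each edge's source-node feature and reduces it at the destination node, one add (or comparison, for `max`) per edge per feature element; hence the `num_edges * feat_size` operation count above. A minimal sketch:

```python
import dgl
import torch

# Toy graph where node 2 has two incoming edges: 0->2 and 1->2.
g = dgl.graph(([0, 1], [2, 2]))
x = torch.tensor([[1.0, 2.0],
                  [3.0, 4.0],
                  [0.0, 0.0]])

# For each node v, sum the features of its in-neighbors u.
y = dgl.ops.copy_u_sum(g, x)
print(y)  # rows 0 and 1 stay zero; row 2 is [4., 6.]
```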
42 changes: 42 additions & 0 deletions benchmarks/benchmarks/kernel/bench_gspmm_u_mul_e_sum.py
@@ -0,0 +1,42 @@
import time
import dgl
import torch

from .. import utils

def calc_gflops(graph, feat_size, num_heads, time):
    return round(2 * graph.num_edges() * feat_size / 1000000000 / time, 2)  # count both mul and add

# The benchmarks include broadcasting cases.
# Given feat_size = D and num_heads = H, the node feature shape will be
# (H, D // H) while the edge feature shape will be (H, 1), so the tested
# operation broadcasts along the last dimension. The total FLOP count is
# controlled by feat_size no matter how many heads there are.
# If num_heads = 0, it falls back to the normal element-wise operation without
# broadcasting.
@utils.benchmark('flops', timeout=600)
@utils.parametrize('graph', ['ogbn-arxiv', 'reddit', 'ogbn-proteins'])
@utils.parametrize('feat_size', [4, 32, 256])
@utils.parametrize('num_heads', [0, 1, 4])
def track_flops(graph, feat_size, num_heads):
    device = utils.get_bench_device()
    graph = utils.get_graph(graph, format='csc').to(device)
    if num_heads == 0:
        x = torch.randn(graph.num_nodes(), feat_size, device=device)
        w = torch.randn(graph.num_edges(), feat_size, device=device)
    else:
        x = torch.randn(graph.num_nodes(), num_heads, feat_size // num_heads, device=device)
        w = torch.randn(graph.num_edges(), num_heads, 1, device=device)

    # dry run
    for i in range(3):
        y = dgl.ops.u_mul_e_sum(graph, x, w)

    # timing
    accum = 0.
    for i in range(10):
        with utils.TorchOpTimer(device) as timer:
            y = dgl.ops.u_mul_e_sum(graph, x, w)
        accum += timer.time

    return calc_gflops(graph, feat_size, num_heads, accum / 10)
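
For context (not part of the commit): in the broadcasting case, `u_mul_e_sum` scales each source-node feature of shape `(H, D // H)` by a per-edge, per-head scalar of shape `(H, 1)` and sums the results over each destination node's incoming edges; this is the aggregation pattern of attention-style models such as GAT. A minimal sketch:

```python
import dgl
import torch

g = dgl.graph(([0, 1, 2], [1, 2, 0]))  # 3 nodes, 3 edges

x = torch.randn(3, 4, 8)  # node features: (N, H, D // H) with H = 4
w = torch.randn(3, 4, 1)  # edge features: (E, H, 1), broadcast over the last dim

# message on edge (u, v) = x[u] * w[e]; then sum messages per destination.
y = dgl.ops.u_mul_e_sum(g, x, w)
print(y.shape)  # torch.Size([3, 4, 8])
```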
57 changes: 42 additions & 15 deletions benchmarks/benchmarks/utils.py
@@ -9,7 +9,8 @@
import pandas
import dgl
import torch

import time
from ogb.nodeproppred import DglNodePropPredDataset

def _download(url, path, filename):
    fn = os.path.join(path, filename)
@@ -54,11 +55,17 @@ def get_graph(name, format):
        else:
            g = dgl.data.RedditDataset(self_loop=True)[0].formats([format])
            dgl.save_graphs(bin_path, [g])
    elif name.startswith("ogb"):
        g = get_ogb_graph(name)
    else:
        raise Exception("Unknown dataset")
    g = g.formats([format])
    return g

def get_ogb_graph(name):
    os.symlink('/tmp/dataset/', os.path.join(os.getcwd(), 'dataset'))
    data = DglNodePropPredDataset(name=name)
    return data[0][0]

def get_livejournal():
    # Same as https://snap.stanford.edu/data/soc-LiveJournal1.txt.gz
@@ -84,14 +91,6 @@ def get_friendster():
    return dgl.graph((src, dst))


# def get_graph(name):
#     if name == 'livejournal':
#         return get_livejournal()
#     else:
#         print(name + " doesn't exist")
#         return None


class OGBDataset(object):
    def __init__(self, g, num_labels, predict_category=None):
        self._g = g
@@ -116,8 +115,6 @@ def __getitem__(self, idx):

def load_ogb_product():
    name = 'ogbn-products'
    from ogb.nodeproppred import DglNodePropPredDataset

    os.symlink('/tmp/dataset/', os.path.join(os.getcwd(), 'dataset'))

    print('load', name)
@@ -149,8 +146,6 @@ def load_ogb_product():

def load_ogb_mag():
    name = 'ogbn-mag'
    from ogb.nodeproppred import DglNodePropPredDataset

    os.symlink('/tmp/dataset/', os.path.join(os.getcwd(), 'dataset'))

    print('load', name)
@@ -296,15 +291,21 @@ def setup_track_acc(*args, **kwargs):
    np.random.seed(42)
    torch.random.manual_seed(42)

def setup_track_flops(*args, **kwargs):
    # fix random seed
    np.random.seed(42)
    torch.random.manual_seed(42)

TRACK_UNITS = {
    'time': 's',
    'acc': '%',
    'flops': 'GFLOPS',
}

TRACK_SETUP = {
    'time': setup_track_time,
    'acc': setup_track_acc,
    'flops': setup_track_flops,
}


@@ -421,7 +422,7 @@ def check(self, func):
    parametrize_cpu = noop_decorator
    parametrize_gpu = parametrize
else:
    raise Exception("Unknown device")
    raise Exception("Unknown device. Must be one of ['cpu', 'gpu'], but got {}".format(device))


def skip_if_gpu():
@@ -447,6 +448,7 @@ def benchmark(track_type, timeout=60):
        - 'time' : For timing. Unit: second.
        - 'acc' : For accuracy. Unit: percentage, value between 0 and 100.
        - 'flops' : Unit: GFlops, number of floating point operations per second.
    timeout : int
        Timeout threshold in seconds.
@@ -458,7 +460,7 @@
        def foo():
            pass
    """
    assert track_type in ['time', 'acc']
    assert track_type in ['time', 'acc', 'flops']

    def _wrapper(func):
        func.unit = TRACK_UNITS[track_type]
@@ -469,3 +471,28 @@ def _wrapper(func):
            func.benchmark_name = "skip_" + func.__name__
        return func
    return _wrapper

#####################################
# Timer
#####################################

class TorchOpTimer:
    def __init__(self, device):
        self.device = device

    def __enter__(self):
        if self.device == 'cuda:0':
            self.start_event = torch.cuda.Event(enable_timing=True)
            self.end_event = torch.cuda.Event(enable_timing=True)
            self.start_event.record()
        else:
            self.tic = time.time()
        return self

    def __exit__(self, type, value, traceback):
        if self.device == 'cuda:0':
            self.end_event.record()
            torch.cuda.synchronize()  # wait for the events to be recorded
            self.time = self.start_event.elapsed_time(self.end_event) / 1e3  # ms -> s
        else:
            self.time = time.time() - self.tic
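
A usage sketch for the timer (not part of the commit). Note that the CUDA branch fires only when the device string is exactly `'cuda:0'`, which is what the benchmarks pass via `get_bench_device()`; CUDA events time the operation on the GPU side, while the CPU branch falls back to wall-clock time:

```python
import torch

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
x = torch.randn(1024, 1024, device=device)

with TorchOpTimer(device) as timer:
    y = x @ x  # the operation being timed

print('elapsed: {:.6f} s'.format(timer.time))  # timer.time is in seconds
```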
