[Test] Performance benchmarks for DGL kernels (dmlc#2582)
* add initial kernel benchmarks

* finished kernel benchmarks

* add desc
jermainewang authored Jan 27, 2021
1 parent 12f6429 commit 362f72c
Showing 6 changed files with 164 additions and 18 deletions.
6 changes: 3 additions & 3 deletions benchmarks/README.md
@@ -23,10 +23,10 @@ to disk. It does not support specifying branches and commits either. They are only
available under ASV's managed environment.**

To change the device for benchmarking, set the `DGL_BENCH_DEVICE` environment variable.
Any valid PyTorch device strings are allowed.
Allowed values are `"cpu"` or `"gpu"`.

```bash
export DGL_BENCH_DEVICE=cuda:0
export DGL_BENCH_DEVICE=gpu
```

To select which benchmark to run, use the `--bench` flag. For example,
@@ -49,7 +49,7 @@ DGL runs all benchmarks automatically in a Docker container. To run benchmarks in d…
* Use the `publish.sh` script. It accepts two arguments, a name specifying the identity of
the test machine and a device name. For example,
```bash
bash publish.sh dev-machine cuda:0
bash publish.sh dev-machine gpu
```

The script will output two folders `results` and `html`. The `html` folder contains the
Empty file.
40 changes: 40 additions & 0 deletions benchmarks/benchmarks/kernel/bench_gsddmm_u_dot_v.py
@@ -0,0 +1,40 @@
import time
import dgl
import torch

from .. import utils

def calc_gflops(graph, feat_size, num_heads, time):
    return round(2 * graph.num_edges() * feat_size / 1000000000 / time, 2)  # count both mul and add

# The benchmarks include multi-head cases.
# Given feat_size = D and num_heads = H, both node feature tensors have shape
# (H, D // H); u_dot_v reduces the last dimension, producing an (H, 1) score
# per edge. The total FLOP count is controlled by feat_size no matter how
# many heads there are.
# If num_heads = 0, it falls back to plain (feat_size,) features with no
# multi-head reshape.
@utils.benchmark('flops', timeout=600)
@utils.parametrize('graph', ['ogbn-arxiv', 'reddit', 'ogbn-proteins'])
@utils.parametrize('feat_size', [4, 32, 256])
@utils.parametrize('num_heads', [0, 1, 4])
def track_flops(graph, feat_size, num_heads):
    device = utils.get_bench_device()
    graph = utils.get_graph(graph, format='coo').to(device)
    if num_heads == 0:
        x = torch.randn(graph.num_nodes(), feat_size, device=device)
    else:
        x = torch.randn(graph.num_nodes(), num_heads, feat_size // num_heads, device=device)

    # dry run
    for i in range(3):
        y = dgl.ops.u_dot_v(graph, x, x)

    # timing
    accum = 0.
    for i in range(10):
        with utils.TorchOpTimer(device) as timer:
            y = dgl.ops.u_dot_v(graph, x, x)
        accum += timer.time

    return calc_gflops(graph, feat_size, num_heads, accum / 10)
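
For context (not part of the commit): `dgl.ops.u_dot_v` is DGL's generalized SDDMM operator that, for every edge `(u, v)`, takes the dot product of the source and destination node features along the last dimension. A minimal sketch of the shapes this benchmark exercises, using a toy graph in place of the OGB datasets (assumes DGL >= 0.5, where the `dgl.ops` namespace exists):

```python
import dgl
import torch

# Toy graph with 3 nodes and 3 edges: 0->1, 1->2, 2->0.
g = dgl.graph(([0, 1, 2], [1, 2, 0]))

# Multi-head node features of shape (N, H, D // H), here H = 4 and D = 32.
x = torch.randn(3, 4, 8)

# For each edge (u, v), dot x[u] and x[v] along the last dimension.
y = dgl.ops.u_dot_v(g, x, x)
print(y.shape)  # torch.Size([3, 4, 1]) -- one score per edge per head
```

Each dot product costs one multiply and one add per feature element, so the operator performs roughly `2 * num_edges * feat_size` floating-point operations, which is the numerator used by `calc_gflops` above.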
37 changes: 37 additions & 0 deletions benchmarks/benchmarks/kernel/bench_gspmm_copy_u.py
@@ -0,0 +1,37 @@
import time
import dgl
import torch

from .. import utils

def calc_gflops(graph, feat_size, time):
    return round(graph.num_edges() * feat_size / 1000000000 / time, 2)

@utils.benchmark('flops', timeout=600)
@utils.parametrize('graph', ['ogbn-arxiv', 'reddit', 'ogbn-proteins'])
@utils.parametrize('feat_size', [4, 32, 256])
@utils.parametrize('reducer', ['sum', 'max'])
def track_flops(graph, feat_size, reducer):
    device = utils.get_bench_device()
    graph = utils.get_graph(graph, format='csc').to(device)
    x = torch.randn(graph.num_nodes(), feat_size, device=device)

    if reducer == 'sum':
        op = dgl.ops.copy_u_sum
    elif reducer == 'max':
        op = dgl.ops.copy_u_max
    else:
        raise ValueError('Invalid reducer', reducer)

    # dry run
    for i in range(3):
        y = op(graph, x)

    # timing
    accum = 0.
    for i in range(10):
        with utils.TorchOpTimer(device) as timer:
            y = op(graph, x)
        accum += timer.time

    return calc_gflops(graph, feat_size, accum / 10)
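
For context (not part of the commit): `copy_u_sum` is the SpMM-style operator that copies each edge's source-node feature and reduces it at the destination node, one add (or comparison, for `max`) per edge per feature element; hence the `num_edges * feat_size` operation count above. A minimal sketch:

```python
import dgl
import torch

# Toy graph where node 2 has two incoming edges: 0->2 and 1->2.
g = dgl.graph(([0, 1], [2, 2]))
x = torch.tensor([[1.0, 2.0],
                  [3.0, 4.0],
                  [0.0, 0.0]])

# For each node v, sum the features of its in-neighbors u.
y = dgl.ops.copy_u_sum(g, x)
print(y)  # rows 0 and 1 stay zero; row 2 is [4., 6.]
```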
42 changes: 42 additions & 0 deletions benchmarks/benchmarks/kernel/bench_gspmm_u_mul_e_sum.py
@@ -0,0 +1,42 @@
import time
import dgl
import torch

from .. import utils

def calc_gflops(graph, feat_size, num_heads, time):
    return round(2 * graph.num_edges() * feat_size / 1000000000 / time, 2)  # count both mul and add

# The benchmarks include broadcasting cases.
# Given feat_size = D and num_heads = H, the node feature shape will be
# (H, D // H) while the edge feature shape will be (H, 1), so the tested
# operation broadcasts along the last dimension. The total FLOP count is
# controlled by feat_size no matter how many heads there are.
# If num_heads = 0, it falls back to the normal element-wise operation without
# broadcasting.
@utils.benchmark('flops', timeout=600)
@utils.parametrize('graph', ['ogbn-arxiv', 'reddit', 'ogbn-proteins'])
@utils.parametrize('feat_size', [4, 32, 256])
@utils.parametrize('num_heads', [0, 1, 4])
def track_flops(graph, feat_size, num_heads):
    device = utils.get_bench_device()
    graph = utils.get_graph(graph, format='csc').to(device)
    if num_heads == 0:
        x = torch.randn(graph.num_nodes(), feat_size, device=device)
        w = torch.randn(graph.num_edges(), feat_size, device=device)
    else:
        x = torch.randn(graph.num_nodes(), num_heads, feat_size // num_heads, device=device)
        w = torch.randn(graph.num_edges(), num_heads, 1, device=device)

    # dry run
    for i in range(3):
        y = dgl.ops.u_mul_e_sum(graph, x, w)

    # timing
    accum = 0.
    for i in range(10):
        with utils.TorchOpTimer(device) as timer:
            y = dgl.ops.u_mul_e_sum(graph, x, w)
        accum += timer.time

    return calc_gflops(graph, feat_size, num_heads, accum / 10)
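
For context (not part of the commit): in the broadcasting case, `u_mul_e_sum` scales each source-node feature of shape `(H, D // H)` by a per-edge, per-head scalar of shape `(H, 1)` and sums the results over each destination node's incoming edges; this is the aggregation pattern of attention-style models such as GAT. A minimal sketch:

```python
import dgl
import torch

g = dgl.graph(([0, 1, 2], [1, 2, 0]))  # 3 nodes, 3 edges

x = torch.randn(3, 4, 8)  # node features: (N, H, D // H) with H = 4
w = torch.randn(3, 4, 1)  # edge features: (E, H, 1), broadcast over the last dim

# message on edge (u, v) = x[u] * w[e]; then sum messages per destination.
y = dgl.ops.u_mul_e_sum(g, x, w)
print(y.shape)  # torch.Size([3, 4, 8])
```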
57 changes: 42 additions & 15 deletions benchmarks/benchmarks/utils.py
@@ -9,7 +9,8 @@
import pandas
import dgl
import torch

import time
from ogb.nodeproppred import DglNodePropPredDataset

def _download(url, path, filename):
    fn = os.path.join(path, filename)
@@ -54,11 +55,17 @@ def get_graph(name, format):
        else:
            g = dgl.data.RedditDataset(self_loop=True)[0].formats([format])
            dgl.save_graphs(bin_path, [g])
    elif name.startswith("ogb"):
        g = get_ogb_graph(name)
    else:
        raise Exception("Unknown dataset")
    g = g.formats([format])
    return g

def get_ogb_graph(name):
    os.symlink('/tmp/dataset/', os.path.join(os.getcwd(), 'dataset'))
    data = DglNodePropPredDataset(name=name)
    return data[0][0]

def get_livejournal():
    # Same as https://snap.stanford.edu/data/soc-LiveJournal1.txt.gz
@@ -84,14 +91,6 @@ def get_friendster():
    return dgl.graph((src, dst))


# def get_graph(name):
#     if name == 'livejournal':
#         return get_livejournal()
#     else:
#         print(name + " doesn't exist")
#         return None


class OGBDataset(object):
    def __init__(self, g, num_labels, predict_category=None):
        self._g = g
@@ -116,8 +115,6 @@ def __getitem__(self, idx):

def load_ogb_product():
    name = 'ogbn-products'
    from ogb.nodeproppred import DglNodePropPredDataset

    os.symlink('/tmp/dataset/', os.path.join(os.getcwd(), 'dataset'))

    print('load', name)
@@ -149,8 +146,6 @@ def load_ogb_product():

def load_ogb_mag():
    name = 'ogbn-mag'
    from ogb.nodeproppred import DglNodePropPredDataset

    os.symlink('/tmp/dataset/', os.path.join(os.getcwd(), 'dataset'))

    print('load', name)
@@ -296,15 +291,21 @@ def setup_track_acc(*args, **kwargs):
    np.random.seed(42)
    torch.random.manual_seed(42)

def setup_track_flops(*args, **kwargs):
    # fix random seed
    np.random.seed(42)
    torch.random.manual_seed(42)

TRACK_UNITS = {
    'time': 's',
    'acc': '%',
    'flops': 'GFLOPS',
}

TRACK_SETUP = {
    'time': setup_track_time,
    'acc': setup_track_acc,
    'flops': setup_track_flops,
}


@@ -421,7 +422,7 @@ def check(self, func):
    parametrize_cpu = noop_decorator
    parametrize_gpu = parametrize
else:
    raise Exception("Unknown device")
    raise Exception("Unknown device. Must be one of ['cpu', 'gpu'], but got {}".format(device))


def skip_if_gpu():
@@ -447,6 +448,7 @@ def benchmark(track_type, timeout=60):
        - 'time' : For timing. Unit: second.
        - 'acc' : For accuracy. Unit: percentage, value between 0 and 100.
        - 'flops' : Unit: GFlops, number of floating point operations per second.
    timeout : int
        Timeout threshold in seconds.
@@ -458,7 +460,7 @@
        def foo():
            pass
    """
    assert track_type in ['time', 'acc']
    assert track_type in ['time', 'acc', 'flops']

    def _wrapper(func):
        func.unit = TRACK_UNITS[track_type]
@@ -469,3 +471,28 @@ def _wrapper(func):
            func.benchmark_name = "skip_" + func.__name__
        return func
    return _wrapper

#####################################
# Timer
#####################################

class TorchOpTimer:
    def __init__(self, device):
        self.device = device

    def __enter__(self):
        if self.device == 'cuda:0':
            self.start_event = torch.cuda.Event(enable_timing=True)
            self.end_event = torch.cuda.Event(enable_timing=True)
            self.start_event.record()
        else:
            self.tic = time.time()
        return self

    def __exit__(self, type, value, traceback):
        if self.device == 'cuda:0':
            self.end_event.record()
            torch.cuda.synchronize()  # wait for the events to be recorded
            self.time = self.start_event.elapsed_time(self.end_event) / 1e3  # ms -> s
        else:
            self.time = time.time() - self.tic
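
A usage sketch for the timer (not part of the commit). Note that the CUDA branch fires only when the device string is exactly `'cuda:0'`, which is what the benchmarks pass via `get_bench_device()`; CUDA events time the operation on the GPU side, while the CPU branch falls back to wall-clock time:

```python
import torch

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
x = torch.randn(1024, 1024, device=device)

with TorchOpTimer(device) as timer:
    y = x @ x  # the operation being timed

print('elapsed: {:.6f} s'.format(timer.time))  # timer.time is in seconds
```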
