[Test] Tweak end2end benchmarks to be more reasonable (dmlc#2643)
* change timeout to reasonable ranges

* rgcn ns

* fix all ns speed tests

Co-authored-by: Jinjing Zhou <[email protected]>
jermainewang and VoVAllen authored Feb 9, 2021
1 parent e4ff484 commit 22ccf43
Showing 9 changed files with 225 additions and 173 deletions.
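
The recurring change in the model_speed benchmarks below: instead of timing a fixed number of full training epochs, each benchmark now does a short dry run and then times a fixed number of mini-batches, returning the average seconds per iteration. A minimal sketch of that pattern, assuming a DGL node dataloader that yields (input_nodes, seeds, blocks); the time_minibatches helper, its warmup/timed arguments, and the model/optimizer/loss_fn placeholders are illustrative, not part of the diff:

import time

def time_minibatches(model, optimizer, loss_fn, dataloader, device,
                     warmup=3, timed=10):
    """Dry-run a few batches, then time a fixed number of iterations."""
    model.train()
    # Dry run: absorb one-off costs (format creation, CUDA context, caches).
    for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
        blocks = [block.int().to(device) for block in blocks]
        batch_inputs = blocks[0].srcdata['features']
        batch_labels = blocks[-1].dstdata['labels']
        loss = loss_fn(model(blocks, batch_inputs), batch_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if step >= warmup:
            break

    # Timed loop: run `timed` iterations and return the per-iteration average.
    t0 = time.time()
    for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
        blocks = [block.int().to(device) for block in blocks]
        batch_inputs = blocks[0].srcdata['features']
        batch_labels = blocks[-1].dstdata['labels']
        loss = loss_fn(model(blocks, batch_inputs), batch_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if step >= timed - 1:
            break
    t1 = time.time()
    return (t1 - t0) / (step + 1)

Timing roughly ten iterations instead of whole epochs is what lets the per-benchmark timeouts drop from 3600 (or 36000) seconds to the 600 or 300 seconds used below.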
53 changes: 29 additions & 24 deletions benchmarks/benchmarks/model_acc/bench_rgcn_ns.py
@@ -182,21 +182,21 @@ def evaluate(model, embed_layer, eval_loader, node_feats):
with th.no_grad():
for sample_data in eval_loader:
th.cuda.empty_cache()
seeds, blocks = sample_data
_, _, blocks = sample_data
feats = embed_layer(blocks[0].srcdata[dgl.NID],
blocks[0].srcdata[dgl.NTYPE],
blocks[0].srcdata['type_id'],
node_feats)
logits = model(blocks, feats)
eval_logits.append(logits.cpu().detach())
eval_seeds.append(seeds.cpu().detach())
eval_seeds.append(blocks[-1].dstdata['type_id'].cpu().detach())
eval_logits = th.cat(eval_logits)
eval_seeds = th.cat(eval_seeds)

return eval_logits, eval_seeds


@utils.benchmark('time', 3600)
@utils.benchmark('time', 3600) # ogbn-mag takes ~1 hour to train
@utils.parametrize('data', ['am', 'ogbn-mag'])
def track_acc(data):
dataset = utils.process_data(data)
@@ -205,9 +205,11 @@ def track_acc(data):
if data == 'am':
n_bases = 40
l2norm = 5e-4
n_epochs = 20
elif data == 'ogbn-mag':
n_bases = 2
l2norm = 0
n_epochs = 20
else:
raise ValueError()

@@ -218,7 +220,6 @@ def track_acc(data):
dropout = 0.5
use_self_loop = True
lr = 0.01
n_epochs = 20
low_mem = True
num_workers = 4

@@ -264,26 +265,28 @@ def track_acc(data):
node_tids = g.ndata[dgl.NTYPE]
loc = (node_tids == category_id)
target_nids = node_ids[loc]
train_nids = target_nids[train_idx]

# Create csr/coo/csc formats before launching training processes with multi-gpu.
# This avoids creating certain formats in each sub-process, which saves memory and CPU.
g.create_formats_()
g = g.formats('csc')
sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts)
collator = dgl.dataloading.NodeCollator(g, train_nids, sampler, return_indices=True)
loader = dgl.dataloading.DataLoader(
collator.dataset, collate_fn=collator.collate,
batch_size=batch_size, shuffle=True, num_workers=4)
# test_sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts)
test_loader = DataLoader(dataset=test_idx.numpy(),
batch_size=batch_size,
collate_fn=collator.collate,
shuffle=False,
num_workers=4)
train_loader = dgl.dataloading.NodeDataLoader(
g,
target_nids[train_idx],
sampler,
batch_size=batch_size,
shuffle=True,
drop_last=False,
num_workers=num_workers)
test_loader = dgl.dataloading.NodeDataLoader(
g,
target_nids[test_idx],
sampler,
batch_size=batch_size,
shuffle=True,
drop_last=False,
num_workers=num_workers)

# node features
# None for one-hot features; if not None, it should be the feature tensor.
#
embed_layer = RelGraphEmbedLayer(device,
g.number_of_nodes(),
node_tids,
@@ -314,28 +317,30 @@ def track_acc(data):
emb_optimizer = th.optim.SparseAdam(list(embed_layer.node_embeds.parameters()), lr=lr)

print("start training...")
t0 = time.time()
for epoch in range(n_epochs):
model.train()
embed_layer.train()

for i, sample_data in enumerate(loader):
input_nodes, output_nodes, seed_idx, blocks = sample_data
for i, sample_data in enumerate(train_loader):
input_nodes, output_nodes, blocks = sample_data
feats = embed_layer(input_nodes,
blocks[0].srcdata['ntype'],
blocks[0].srcdata['type_id'],
node_feats)
logits = model(blocks, feats)
loss = F.cross_entropy(logits, labels[train_idx][seed_idx])
seed_idx = blocks[-1].dstdata['type_id']
loss = F.cross_entropy(logits, labels[seed_idx])
optimizer.zero_grad()
emb_optimizer.zero_grad()

loss.backward()
optimizer.step()
emb_optimizer.step()

print('start testing...')

test_logits, test_seeds = evaluate(model, embed_layer, test_loader, node_feats)
test_loss = F.cross_entropy(test_logits, labels[test_seeds].cpu()).item()
test_acc = th.sum(test_logits.argmax(dim=1) == labels[test_seeds].cpu()).item() / len(test_seeds)
t1 = time.time()

return test_acc
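
Beyond the loader swap above, note the change from g.create_formats_() to g = g.formats('csc'): rather than materializing every sparse format before dataloader workers fork, the benchmark keeps only the CSC structure. A condensed sketch of the two options, under our reading that CSC is the format in-neighbor sampling consumes; the toy graph is purely illustrative:

import dgl
import torch as th

src, dst = th.tensor([0, 1, 2]), th.tensor([1, 2, 3])

# Previous approach: eagerly create csr/coo/csc so that forked dataloader
# workers do not each materialize the formats on demand.
g_all = dgl.graph((src, dst))
g_all.create_formats_()

# Approach in this commit: restrict the graph to CSC only, reducing the
# memory held by each worker process.
g_csc = dgl.graph((src, dst)).formats('csc')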
2 changes: 1 addition & 1 deletion benchmarks/benchmarks/model_acc/bench_sage_ns.py
@@ -118,7 +118,7 @@ def load_subtensor(g, seeds, input_nodes, device):
return batch_inputs, batch_labels


@utils.benchmark('acc', 3600)
@utils.benchmark('acc', 600)
@utils.parametrize('data', ['ogbn-products', "reddit"])
def track_acc(data):
data = utils.process_data(data)
43 changes: 23 additions & 20 deletions benchmarks/benchmarks/model_speed/bench_gat_ns.py
@@ -66,7 +66,7 @@ def load_subtensor(g, seeds, input_nodes, device):
batch_labels = g.ndata['labels'][seeds].to(device)
return batch_inputs, batch_labels

@utils.benchmark('time', 3600)
@utils.benchmark('time', 600)
@utils.parametrize('data', ['reddit', 'ogbn-products'])
def track_time(data):
data = utils.process_data(data)
@@ -82,7 +82,6 @@ def track_time(data):
# This avoids creating certain formats in each sub-process, which saves memory and CPU.
g.create_formats_()

num_epochs = 20
num_hidden = 16
num_heads = 8
num_layers = 2
@@ -113,7 +112,7 @@ def track_time(data):
loss_fcn = loss_fcn.to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

# dry run one epoch
# dry run
for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
# Load the input features as well as output labels
#batch_inputs, batch_labels = load_subtensor(g, seeds, input_nodes, device)
@@ -128,27 +127,31 @@ def track_time(data):
loss.backward()
optimizer.step()

if step >= 3:
break

# Training loop
avg = 0
iter_tput = []
t0 = time.time()
for epoch in range(num_epochs):
# Loop over the dataloader to sample the computation dependency graph as a list of
# blocks.
for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
# Load the input features as well as output labels
#batch_inputs, batch_labels = load_subtensor(g, seeds, input_nodes, device)
blocks = [block.int().to(device) for block in blocks]
batch_inputs = blocks[0].srcdata['features']
batch_labels = blocks[-1].dstdata['labels']

# Compute loss and prediction
batch_pred = model(blocks, batch_inputs)
loss = loss_fcn(batch_pred, batch_labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Loop over the dataloader to sample the computation dependency graph as a list of
# blocks.
for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
# Load the input features as well as output labels
blocks = [block.int().to(device) for block in blocks]
batch_inputs = blocks[0].srcdata['features']
batch_labels = blocks[-1].dstdata['labels']

# Compute loss and prediction
batch_pred = model(blocks, batch_inputs)
loss = loss_fcn(batch_pred, batch_labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()

if step >= 9: # time 10 loops
break

t1 = time.time()

return (t1 - t0) / num_epochs
return (t1 - t0) / (step + 1)
41 changes: 20 additions & 21 deletions benchmarks/benchmarks/model_speed/bench_pinsage.py
@@ -358,7 +358,7 @@ def collate_test(self, samples):
assign_features_to_blocks(blocks, self.g, self.textset, self.ntype)
return blocks

@utils.benchmark('time', 36000)
@utils.benchmark('time', 600)
@utils.parametrize('data', ['nowplaying_rs'])
def track_time(data):
dataset = utils.process_data(data)
@@ -377,8 +377,6 @@ def track_time(data):
num_workers = 0
hidden_dims = 16
lr = 3e-5
num_epochs = 5
batches_per_epoch = 20000

g = dataset[0]
# Sampler
@@ -398,16 +396,14 @@ def track_time(data):
batch_size=batch_size,
collate_fn=collator.collate_test,
num_workers=num_workers)
dataloader_it = iter(dataloader)

# Model
model = PinSAGEModel(g, item_ntype, textset, hidden_dims, num_layers).to(device)
# Optimizer
opt = torch.optim.Adam(model.parameters(), lr=lr)

model.train()
for batch_id in range(batches_per_epoch):
pos_graph, neg_graph, blocks = next(dataloader_it)
for batch_id, (pos_graph, neg_graph, blocks) in enumerate(dataloader):
# Copy to GPU
for i in range(len(blocks)):
blocks[i] = blocks[i].to(device)
@@ -419,24 +415,27 @@ def track_time(data):
loss.backward()
opt.step()

if batch_id >= 3:
break

print("start training...")
t0 = time.time()
# For each batch of head-tail-negative triplets...
for epoch_id in range(num_epochs):
model.train()
for batch_id in range(batches_per_epoch):
pos_graph, neg_graph, blocks = next(dataloader_it)
# Copy to GPU
for i in range(len(blocks)):
blocks[i] = blocks[i].to(device)
pos_graph = pos_graph.to(device)
neg_graph = neg_graph.to(device)

loss = model(pos_graph, neg_graph, blocks).mean()
opt.zero_grad()
loss.backward()
opt.step()
for batch_id, (pos_graph, neg_graph, blocks) in enumerate(dataloader):
# Copy to GPU
for i in range(len(blocks)):
blocks[i] = blocks[i].to(device)
pos_graph = pos_graph.to(device)
neg_graph = neg_graph.to(device)

loss = model(pos_graph, neg_graph, blocks).mean()
opt.zero_grad()
loss.backward()
opt.step()

if batch_id >= 10: # time 10 loops
break

t1 = time.time()

return (t1 - t0) / num_epochs
return (t1 - t0) / (batch_id + 1)
2 changes: 1 addition & 1 deletion benchmarks/benchmarks/model_speed/bench_rgcn.py
@@ -38,7 +38,7 @@ def forward(self, g, h, r, norm):
h = layer(g, h, r, norm)
return h

@utils.benchmark('time', 3600)
@utils.benchmark('time', 300)
@utils.parametrize('data', ['aifb'])
@utils.parametrize('lowmem', [True, False])
@utils.parametrize('use_type_count', [True, False])
77 changes: 38 additions & 39 deletions benchmarks/benchmarks/model_speed/bench_rgcn_hetero_ns.py
@@ -227,7 +227,7 @@ def forward(self, h, blocks):
h = layer(block, h)
return h

@utils.benchmark('time', 3600)
@utils.benchmark('time', 600)
@utils.parametrize('data', ['am', 'ogbn-mag'])
def track_time(data):
dataset = utils.process_data(data)
@@ -249,7 +249,6 @@ def track_time(data):
dropout = 0.5
use_self_loop = True
lr = 0.01
n_epochs = 5

hg = dataset[0]
category = dataset.predict_category
@@ -284,46 +283,46 @@ def track_time(data):
hg, {category: train_idx}, sampler,
batch_size=batch_size, shuffle=True, num_workers=4)

for epoch in range(1):
model.train()
embed_layer.train()
optimizer.zero_grad()
sparse_optimizer.zero_grad()

for i, (input_nodes, seeds, blocks) in enumerate(loader):
blocks = [blk.to(device) for blk in blocks]
seeds = seeds[category] # we only predict the nodes with type "category"
batch_tic = time.time()
emb = embed_layer(blocks[0])
lbl = labels[seeds].to(device)
emb = {k : e.to(device) for k, e in emb.items()}
logits = model(emb, blocks)[category]
loss = F.cross_entropy(logits, lbl)
loss.backward()
optimizer.step()
sparse_optimizer.step()
# dry run
for i, (input_nodes, seeds, blocks) in enumerate(loader):
blocks = [blk.to(device) for blk in blocks]
seeds = seeds[category] # we only predict the nodes with type "category"
batch_tic = time.time()
emb = embed_layer(blocks[0])
lbl = labels[seeds].to(device)
emb = {k : e.to(device) for k, e in emb.items()}
logits = model(emb, blocks)[category]
loss = F.cross_entropy(logits, lbl)
loss.backward()
optimizer.step()
sparse_optimizer.step()

if i >= 3:
break

print("start training...")
model.train()
embed_layer.train()
optimizer.zero_grad()
sparse_optimizer.zero_grad()

t0 = time.time()
for epoch in range(n_epochs):
model.train()
embed_layer.train()
optimizer.zero_grad()
sparse_optimizer.zero_grad()

for i, (input_nodes, seeds, blocks) in enumerate(loader):
blocks = [blk.to(device) for blk in blocks]
seeds = seeds[category] # we only predict the nodes with type "category"
batch_tic = time.time()
emb = embed_layer(blocks[0])
lbl = labels[seeds].to(device)
emb = {k : e.to(device) for k, e in emb.items()}
logits = model(emb, blocks)[category]
loss = F.cross_entropy(logits, lbl)
loss.backward()
optimizer.step()
sparse_optimizer.step()
for i, (input_nodes, seeds, blocks) in enumerate(loader):
blocks = [blk.to(device) for blk in blocks]
seeds = seeds[category] # we only predict the nodes with type "category"
batch_tic = time.time()
emb = embed_layer(blocks[0])
lbl = labels[seeds].to(device)
emb = {k : e.to(device) for k, e in emb.items()}
logits = model(emb, blocks)[category]
loss = F.cross_entropy(logits, lbl)
loss.backward()
optimizer.step()
sparse_optimizer.step()

if i >= 9: # time 10 loops
break

t1 = time.time()

return (t1 - t0) / n_epochs
return (t1 - t0) / (i + 1)