
Commit e57d9e7
[transformer] update tests (NVIDIA#1428)
Aidyn-A authored Jul 25, 2022
1 parent 208d967 commit e57d9e7
Showing 7 changed files with 43 additions and 40 deletions.
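
Note: across all seven files the commit applies the same mechanical migration. Test classes move from unittest.TestCase to torch.testing._internal.common_utils.TestCase, tensor comparisons move from torch.testing.assert_close to self.assertEqual, and entry points move from unittest.main() to common_utils.run_tests(). A minimal sketch of the pattern, assuming only that PyTorch is installed (the class and test names below are illustrative, not taken from the diff):

# Sketch of the before/after pattern applied throughout this commit.
import torch
from torch.testing._internal import common_utils


class TestExample(common_utils.TestCase):  # was: unittest.TestCase
    def test_tensors_match(self):
        a = torch.randn(4, 4)
        b = a.clone()
        # was: torch.testing.assert_close(a, b)
        self.assertEqual(a, b)


if __name__ == "__main__":
    common_utils.run_tests()  # was: unittest.main()
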
8 changes: 4 additions & 4 deletions tests/L0/run_transformer/test_batch_sampler.py
@@ -1,7 +1,7 @@
from itertools import product
-import unittest

import torch
+from torch.testing._internal import common_utils
from torch.utils.data import Dataset
from torch.utils.data import RandomSampler
from torch.utils.data import BatchSampler
@@ -80,7 +80,7 @@ def __iter__(self):

# Samples 8 tensors in total.
# First sample 4 tensors twice, then sample 2 tensors four times.
-class TestBatchSamplerBehavior(unittest.TestCase):
+class TestBatchSamplerBehavior(common_utils.TestCase):
def test_batch_sampler_behavior(self):
dataset = MyIterableDataset(0, 100)

@@ -101,7 +101,7 @@ def test_batch_sampler_behavior(self):
samples2.append(batch)
if i == 4 - 1:
break
-torch.testing.assert_close(torch.cat(samples), torch.cat(samples2))
+self.assertEqual(torch.cat(samples), torch.cat(samples2))

def test_split_batch(self):

@@ -139,4 +139,4 @@ def __getitem__(self, index):


if __name__ == "__main__":
-unittest.main()
+common_utils.run_tests()
4 changes: 2 additions & 2 deletions tests/L0/run_transformer/test_cross_entropy.py
@@ -80,8 +80,8 @@ def test_cross_entropy(self):
batch_size, sequence_length, vocab_size, logits_scale, seed
)

-torch.testing.assert_close(loss_torch, loss_tensor_parallel)
-torch.testing.assert_close(grad_torch, grad_tensor_parallel)
+self.assertEqual(loss_torch, loss_tensor_parallel)
+self.assertEqual(grad_torch, grad_tensor_parallel)

parallel_state.destroy_model_parallel()

2 changes: 1 addition & 1 deletion tests/L0/run_transformer/test_data.py
@@ -51,7 +51,7 @@ def test_broadcast_data(self):

broadcasted_data = data_utils.broadcast_data(keys, data, torch.int64)
for key in keys:
-torch.testing.assert_close(broadcasted_data[key], data_t[key].cuda())
+self.assertEqual(broadcasted_data[key], data_t[key].cuda())

parallel_state.destroy_model_parallel()

15 changes: 9 additions & 6 deletions tests/L0/run_transformer/test_fused_softmax.py
@@ -3,9 +3,9 @@
Ref: https://github.com/NVIDIA/Megatron-LM/blob/40becfc96c4144985458ac0e0fae45dbb111fbd2/megatron/fused_kernels/tests/test_fused_kernels.py
""" # NOQA
import itertools
-import unittest

import torch
+from torch.testing._internal import common_utils

from apex.transformer import AttnMaskType
from apex.transformer.functional import FusedScaleMaskSoftmax
@@ -20,7 +20,7 @@ def attention_mask_func(attention_scores, attention_mask):
)


-class TestFusedScaleMaskSoftmax(unittest.TestCase):
+class TestFusedScaleMaskSoftmax(common_utils.TestCase):
def _setup_fused_softmax(
self,
input_in_fp16,
@@ -89,7 +89,7 @@ def test_fused_scale_mask_softmax(self):
mask = torch.randint(0, 2, mask_shape, device="cuda").bool()
expected = fused_fn(attention_scores_0, mask)
actual = torch_fn(attention_scores_1, mask)
-torch.testing.assert_close(actual, expected)
+self.assertEqual(actual, expected)

g0 = torch.rand_like(actual)
with torch.no_grad():
@@ -119,7 +119,7 @@ def test_autocast_fused_scale_mask_softmax(self):
with torch.cuda.amp.autocast(dtype=dtype):
actual = fused_fn(attention_scores_0, mask)
self.assertEqual(actual.dtype, dtype)
-torch.testing.assert_close(actual, expected)
+self.assertEqual(actual, expected)

g0 = torch.rand_like(actual)
with torch.no_grad():
@@ -174,7 +174,7 @@ def test_fused_upper_triangle_mask_softmax(self):
total_mask = total_mask.repeat((4, 1, 1, 1))
expected = fused_fn(attn_weights_0, total_mask)
actual = torch_fn(attn_weights_1, total_mask)
-torch.testing.assert_close(actual, expected)
+self.assertEqual(actual, expected)

g0 = torch.randn_like(actual)
with torch.no_grad():
@@ -208,10 +208,13 @@ def test_autocast_fused_upper_triangle_mask_softmax(self):
actual = fused_fn(attn_weights_0, total_mask)
self.assertEqual(actual.dtype, dtype)
expected = torch_fn(attn_weights_1, total_mask)
-torch.testing.assert_close(actual, expected)
+self.assertEqual(actual, expected)

g0 = torch.randn_like(actual)
with torch.no_grad():
g1 = g0.clone()
actual.backward(g0)
expected.backward(g1)

+if __name__ == "__main__":
+    common_utils.run_tests()
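
Note: the fp16 and autocast assertions in this file pass without hand-tuned thresholds because both torch.testing.assert_close and common_utils.TestCase.assertEqual pick default rtol/atol from the tensor dtype. A small sketch of that behavior, assuming current PyTorch defaults (the exact tolerance values are an implementation detail and may change between versions):

# Sketch: dtype-aware default tolerances in common_utils.TestCase.assertEqual.
import torch
from torch.testing._internal import common_utils


class ToleranceDemo(common_utils.TestCase):
    def test_fp16_default_tolerance(self):
        x = torch.randn(8, dtype=torch.float16)
        y = x * 1.0001  # perturbation well inside fp16 default tolerances
        self.assertEqual(x, y)

    def test_explicit_tolerance(self):
        x = torch.randn(8)
        # Explicit rtol/atol can be passed when the defaults are too strict.
        self.assertEqual(x, x + 0.05, rtol=0.0, atol=0.1)


if __name__ == "__main__":
    common_utils.run_tests()
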
42 changes: 21 additions & 21 deletions tests/L0/run_transformer/test_layers.py
@@ -82,7 +82,7 @@ def test_all_gather_parity(self) -> None:
group=parallel_state.get_tensor_model_parallel_group(),
)

-torch.testing.assert_close(gathered, gathered_for_base)
+self.assertEqual(gathered, gathered_for_base)
parallel_state.destroy_model_parallel()

@torch.no_grad()
@@ -130,8 +130,8 @@ def test_reduce_scatter_parity(self) -> None:
group=parallel_state.get_tensor_model_parallel_group(),
)

-torch.testing.assert_close(output, output_for_base)
-torch.testing.assert_close(input, torch.cat(input_list))
+self.assertEqual(output, output_for_base)
+self.assertEqual(input, torch.cat(input_list))
parallel_state.destroy_model_parallel()

def test_parallel_embedding(self) -> None:
@@ -376,25 +376,25 @@ def _row_parallel_linear_test_impl(

if not accumulation_in_fp16:
if sequence_parallel_enabled:
-torch.testing.assert_close(
-actual=output,
-expected=expected_output.chunk(
+self.assertEqual(
+x=output,
+y=expected_output.chunk(
chunks=tensor_model_parallel_world_size,
dim=0,
)[parallel_state.get_tensor_model_parallel_rank()],
)
else:
-torch.testing.assert_close(
-actual=output,
-expected=expected_output,
+self.assertEqual(
+x=output,
+y=expected_output,
)

grad_attr_name = "main_grad" if gradient_accumulation_fusion else "grad"
# NOTE(mkozuki): Numerical errors seem to be enlarged by tensor model parallel.
if tensor_model_parallel_world_size == 1:
-torch.testing.assert_close(
-actual=getattr(linear.weight, grad_attr_name),
-expected=ref_linear.weight.grad.chunk(
+self.assertEqual(
+x=getattr(linear.weight, grad_attr_name),
+y=ref_linear.weight.grad.chunk(
chunks=tensor_model_parallel_world_size,
dim=0,
)[parallel_state.get_tensor_model_parallel_rank()],
@@ -520,24 +520,24 @@ def _column_parallel_linear_test_impl(
tensor_model_parallel_world_size,
dim=2,
)[parallel_state.get_tensor_model_parallel_rank()]
-torch.testing.assert_close(
-actual=output,
-expected=chunk,
+self.assertEqual(
+x=output,
+y=chunk,
)
else:
-torch.testing.assert_close(
-actual=output,
-expected=expected_output,
+self.assertEqual(
+x=output,
+y=expected_output,
)

expected_loss = torch.mul(expected_output, dldy).sum()
expected_loss.backward()
grad_attr_name = "main_grad" if gradient_accumulation_fusion else "grad"
# NOTE(mkozuki): Numerical errors seem to be enlarged by tensor model parallel.
if tensor_model_parallel_world_size == 1:
-torch.testing.assert_close(
-actual=getattr(linear.weight, grad_attr_name),
-expected=ref_linear.weight.grad.chunk(
+self.assertEqual(
+x=getattr(linear.weight, grad_attr_name),
+y=ref_linear.weight.grad.chunk(
chunks=tensor_model_parallel_world_size,
dim=0,
)[parallel_state.get_tensor_model_parallel_rank()],
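
Note: the replacement in this file is not purely textual. torch.testing.assert_close names its keyword arguments actual and expected, while common_utils.TestCase.assertEqual names them x and y, so keyword call sites are renamed as part of the swap. A minimal sketch of the two call styles:

# Sketch: the two comparison APIs as used in this commit differ in keyword names.
import torch
from torch.testing._internal import common_utils

a = torch.ones(2, 3)
b = torch.ones(2, 3)

# Standalone helper: keywords are actual= / expected=.
torch.testing.assert_close(actual=a, expected=b)


class KeywordDemo(common_utils.TestCase):
    def test_keywords(self):
        # TestCase method: keywords are x= / y=.
        self.assertEqual(x=a, y=b)


if __name__ == "__main__":
    common_utils.run_tests()
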
10 changes: 5 additions & 5 deletions tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py
@@ -205,7 +205,7 @@ def _forward_backward_test_impl(

for loss_item in loss:
x = loss_item['avg']
-torch.testing.assert_close(x.item() / microbatch_size, target_loss.item())
+self.assertEqual(x.item() / microbatch_size, target_loss.item())

if not forward_only:
for vm_id, model_module in enumerate(model):
@@ -215,10 +215,10 @@
param_id = rank // data_parallel_size + vm_id * offset
target_params = target_model[param_id]

-torch.testing.assert_close(params[0].cpu(), target_params[0])
-torch.testing.assert_close(params[1].cpu(), target_params[1])
-torch.testing.assert_close(params[0].grad.cpu() / microbatch_size, target_params[0].grad)
-torch.testing.assert_close(params[1].grad.cpu() / microbatch_size, target_params[1].grad)
+self.assertEqual(params[0].cpu(), target_params[0])
+self.assertEqual(params[1].cpu(), target_params[1])
+self.assertEqual(params[0].grad.cpu() / microbatch_size, target_params[0].grad)
+self.assertEqual(params[1].grad.cpu() / microbatch_size, target_params[1].grad)

if not forward_only:
for m in model:
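
Note: common_utils.TestCase.assertEqual accepts plain Python numbers as well as tensors, which is why the loss check above can compare x.item() / microbatch_size directly against target_loss.item(). A minimal sketch of the scalar case:

# Sketch: assertEqual compares Python scalars with floating-point tolerance
# handling, so 0-d tensors are often unwrapped with .item() first.
import torch
from torch.testing._internal import common_utils


class ScalarDemo(common_utils.TestCase):
    def test_scalar_loss(self):
        loss = torch.tensor(2.5)
        target = 2.5
        self.assertEqual(loss.item(), target)


if __name__ == "__main__":
    common_utils.run_tests()
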
2 changes: 1 addition & 1 deletion tests/L0/run_transformer/test_random.py
@@ -52,7 +52,7 @@ def test_set_cuda_rng_state(self):
torch.randn(size, out=tensor)
result_2 = tensor.clone()

-torch.testing.assert_close(result_2, result_1)
+self.assertEqual(result_2, result_1)

self.assertEqual(rng_state.sub(rng_state_clone).max(), 0)

