diff --git a/tests/L0/run_transformer/test_batch_sampler.py b/tests/L0/run_transformer/test_batch_sampler.py
index ea9bd8c47..52175d53a 100644
--- a/tests/L0/run_transformer/test_batch_sampler.py
+++ b/tests/L0/run_transformer/test_batch_sampler.py
@@ -1,7 +1,7 @@
 from itertools import product
 
-import unittest
 import torch
+from torch.testing._internal import common_utils
 from torch.utils.data import Dataset
 from torch.utils.data import RandomSampler
 from torch.utils.data import BatchSampler
@@ -80,7 +80,7 @@ def __iter__(self):
 
 
 # Samples 8 tensors in total.
 # First sample 4 tensors twice, then sample 2 tensors fourth.
-class TestBatchSamplerBehavior(unittest.TestCase):
+class TestBatchSamplerBehavior(common_utils.TestCase):
     def test_batch_sampler_behavior(self):
         dataset = MyIterableDataset(0, 100)
@@ -101,7 +101,7 @@ def test_batch_sampler_behavior(self):
                 samples2.append(batch)
                 if i == 4 - 1:
                     break
-            torch.testing.assert_close(torch.cat(samples), torch.cat(samples2))
+            self.assertEqual(torch.cat(samples), torch.cat(samples2))
 
     def test_split_batch(self):
@@ -139,4 +139,4 @@ def __getitem__(self, index):
 
 
 if __name__ == "__main__":
-    unittest.main()
+    common_utils.run_tests()
diff --git a/tests/L0/run_transformer/test_cross_entropy.py b/tests/L0/run_transformer/test_cross_entropy.py
index bbeabe772..1f5162876 100644
--- a/tests/L0/run_transformer/test_cross_entropy.py
+++ b/tests/L0/run_transformer/test_cross_entropy.py
@@ -80,8 +80,8 @@ def test_cross_entropy(self):
                 batch_size, sequence_length, vocab_size, logits_scale, seed
             )
 
-            torch.testing.assert_close(loss_torch, loss_tensor_parallel)
-            torch.testing.assert_close(grad_torch, grad_tensor_parallel)
+            self.assertEqual(loss_torch, loss_tensor_parallel)
+            self.assertEqual(grad_torch, grad_tensor_parallel)
 
             parallel_state.destroy_model_parallel()
diff --git a/tests/L0/run_transformer/test_data.py b/tests/L0/run_transformer/test_data.py
index de15da3f6..38dc752e3 100644
--- a/tests/L0/run_transformer/test_data.py
+++ b/tests/L0/run_transformer/test_data.py
@@ -51,7 +51,7 @@ def test_broadcast_data(self):
             broadcasted_data = data_utils.broadcast_data(keys, data, torch.int64)
 
             for key in keys:
-                torch.testing.assert_close(broadcasted_data[key], data_t[key].cuda())
+                self.assertEqual(broadcasted_data[key], data_t[key].cuda())
 
             parallel_state.destroy_model_parallel()
diff --git a/tests/L0/run_transformer/test_fused_softmax.py b/tests/L0/run_transformer/test_fused_softmax.py
index 3e5d180bb..278df69f7 100644
--- a/tests/L0/run_transformer/test_fused_softmax.py
+++ b/tests/L0/run_transformer/test_fused_softmax.py
@@ -3,9 +3,9 @@
 Ref: https://github.com/NVIDIA/Megatron-LM/blob/40becfc96c4144985458ac0e0fae45dbb111fbd2/megatron/fused_kernels/tests/test_fused_kernels.py
 """  # NOQA
 import itertools
-import unittest
 
 import torch
+from torch.testing._internal import common_utils
 
 from apex.transformer import AttnMaskType
 from apex.transformer.functional import FusedScaleMaskSoftmax
@@ -20,7 +20,7 @@ def attention_mask_func(attention_scores, attention_mask):
     )
 
 
-class TestFusedScaleMaskSoftmax(unittest.TestCase):
+class TestFusedScaleMaskSoftmax(common_utils.TestCase):
     def _setup_fused_softmax(
         self,
         input_in_fp16,
@@ -89,7 +89,7 @@ def test_fused_scale_mask_softmax(self):
             mask = torch.randint(0, 2, mask_shape, device="cuda").bool()
             expected = fused_fn(attention_scores_0, mask)
             actual = torch_fn(attention_scores_1, mask)
-            torch.testing.assert_close(actual, expected)
+            self.assertEqual(actual, expected)
 
             g0 = torch.rand_like(actual)
             with torch.no_grad():
@@ -119,7 +119,7 @@ def test_autocast_fused_scale_mask_softmax(self):
             with torch.cuda.amp.autocast(dtype=dtype):
                 actual = fused_fn(attention_scores_0, mask)
                 self.assertEqual(actual.dtype, dtype)
-            torch.testing.assert_close(actual, expected)
+            self.assertEqual(actual, expected)
 
             g0 = torch.rand_like(actual)
             with torch.no_grad():
@@ -174,7 +174,7 @@ def test_fused_upper_triangle_mask_softmax(self):
             total_mask = total_mask.repeat((4, 1, 1, 1))
             expected = fused_fn(attn_weights_0, total_mask)
             actual = torch_fn(attn_weights_1, total_mask)
-            torch.testing.assert_close(actual, expected)
+            self.assertEqual(actual, expected)
 
             g0 = torch.randn_like(actual)
             with torch.no_grad():
@@ -208,10 +208,13 @@ def test_autocast_fused_upper_triangle_mask_softmax(self):
                 actual = fused_fn(attn_weights_0, total_mask)
                 self.assertEqual(actual.dtype, dtype)
             expected = torch_fn(attn_weights_1, total_mask)
-            torch.testing.assert_close(actual, expected)
+            self.assertEqual(actual, expected)
 
             g0 = torch.randn_like(actual)
             with torch.no_grad():
                 g1 = g0.clone()
             actual.backward(g0)
             expected.backward(g1)
+
+if __name__ == "__main__":
+    common_utils.run_tests()
diff --git a/tests/L0/run_transformer/test_layers.py b/tests/L0/run_transformer/test_layers.py
index da1520622..b3b2eb2fc 100644
--- a/tests/L0/run_transformer/test_layers.py
+++ b/tests/L0/run_transformer/test_layers.py
@@ -82,7 +82,7 @@ def test_all_gather_parity(self) -> None:
                 group=parallel_state.get_tensor_model_parallel_group(),
             )
 
-            torch.testing.assert_close(gathered, gathered_for_base)
+            self.assertEqual(gathered, gathered_for_base)
             parallel_state.destroy_model_parallel()
 
     @torch.no_grad()
@@ -130,8 +130,8 @@ def test_reduce_scatter_parity(self) -> None:
                 group=parallel_state.get_tensor_model_parallel_group(),
            )
 
-            torch.testing.assert_close(output, output_for_base)
-            torch.testing.assert_close(input, torch.cat(input_list))
+            self.assertEqual(output, output_for_base)
+            self.assertEqual(input, torch.cat(input_list))
             parallel_state.destroy_model_parallel()
 
     def test_parallel_embedding(self) -> None:
@@ -376,25 +376,25 @@ def _row_parallel_linear_test_impl(
 
         if not accumulation_in_fp16:
             if sequence_parallel_enabled:
-                torch.testing.assert_close(
-                    actual=output,
-                    expected=expected_output.chunk(
+                self.assertEqual(
+                    x=output,
+                    y=expected_output.chunk(
                         chunks=tensor_model_parallel_world_size,
                         dim=0,
                     )[parallel_state.get_tensor_model_parallel_rank()],
                 )
             else:
-                torch.testing.assert_close(
-                    actual=output,
-                    expected=expected_output,
+                self.assertEqual(
+                    x=output,
+                    y=expected_output,
                 )
 
         grad_attr_name = "main_grad" if gradient_accumulation_fusion else "grad"
         # NOTE(mkozuki): Numerical errors seems to be enlarged by tensor model parallel.
         if tensor_model_parallel_world_size == 1:
-            torch.testing.assert_close(
-                actual=getattr(linear.weight, grad_attr_name),
-                expected=ref_linear.weight.grad.chunk(
+            self.assertEqual(
+                x=getattr(linear.weight, grad_attr_name),
+                y=ref_linear.weight.grad.chunk(
                     chunks=tensor_model_parallel_world_size,
                     dim=0,
                 )[parallel_state.get_tensor_model_parallel_rank()],
@@ -520,14 +520,14 @@ def _column_parallel_linear_test_impl(
                     tensor_model_parallel_world_size,
                     dim=2,
                 )[parallel_state.get_tensor_model_parallel_rank()]
-                torch.testing.assert_close(
-                    actual=output,
-                    expected=chunk,
+                self.assertEqual(
+                    x=output,
+                    y=chunk,
                 )
             else:
-                torch.testing.assert_close(
-                    actual=output,
-                    expected=expected_output,
+                self.assertEqual(
+                    x=output,
+                    y=expected_output,
                 )
 
         expected_loss = torch.mul(expected_output, dldy).sum()
@@ -535,9 +535,9 @@ def _column_parallel_linear_test_impl(
         grad_attr_name = "main_grad" if gradient_accumulation_fusion else "grad"
         # NOTE(mkozuki): Numerical errors seems to be enlarged by tensor model parallel.
         if tensor_model_parallel_world_size == 1:
-            torch.testing.assert_close(
-                actual=getattr(linear.weight, grad_attr_name),
-                expected=ref_linear.weight.grad.chunk(
+            self.assertEqual(
+                x=getattr(linear.weight, grad_attr_name),
+                y=ref_linear.weight.grad.chunk(
                     chunks=tensor_model_parallel_world_size,
                     dim=0,
                 )[parallel_state.get_tensor_model_parallel_rank()],
diff --git a/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py b/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py
index ed449f8c8..a409c40f2 100644
--- a/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py
+++ b/tests/L0/run_transformer/test_pipeline_parallel_fwd_bwd.py
@@ -205,7 +205,7 @@ def _forward_backward_test_impl(
 
             for loss_item in loss:
                 x = loss_item['avg']
-                torch.testing.assert_close(x.item() / microbatch_size, target_loss.item())
+                self.assertEqual(x.item() / microbatch_size, target_loss.item())
 
         if not forward_only:
             for vm_id, model_module in enumerate(model):
@@ -215,10 +215,10 @@ def _forward_backward_test_impl(
                 param_id = rank // data_parallel_size + vm_id * offset
                 target_params = target_model[param_id]
 
-                torch.testing.assert_close(params[0].cpu(), target_params[0])
-                torch.testing.assert_close(params[1].cpu(), target_params[1])
-                torch.testing.assert_close(params[0].grad.cpu() / microbatch_size, target_params[0].grad)
-                torch.testing.assert_close(params[1].grad.cpu() / microbatch_size, target_params[1].grad)
+                self.assertEqual(params[0].cpu(), target_params[0])
+                self.assertEqual(params[1].cpu(), target_params[1])
+                self.assertEqual(params[0].grad.cpu() / microbatch_size, target_params[0].grad)
+                self.assertEqual(params[1].grad.cpu() / microbatch_size, target_params[1].grad)
 
         if not forward_only:
             for m in model:
diff --git a/tests/L0/run_transformer/test_random.py b/tests/L0/run_transformer/test_random.py
index 88d631228..6060f9ed9 100644
--- a/tests/L0/run_transformer/test_random.py
+++ b/tests/L0/run_transformer/test_random.py
@@ -52,7 +52,7 @@ def test_set_cuda_rng_state(self):
             torch.randn(size, out=tensor)
             result_2 = tensor.clone()
 
-            torch.testing.assert_close(result_2, result_1)
+            self.assertEqual(result_2, result_1)
 
             self.assertEqual(rng_state.sub(rng_state_clone).max(), 0)
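
For reference only (not part of the patch): a minimal, self-contained sketch of the test pattern these files migrate to, assuming a standard PyTorch installation where torch.testing._internal.common_utils ships with the package. common_utils.TestCase overrides assertEqual so that tensor arguments are compared elementwise with dtype-appropriate default tolerances, and common_utils.run_tests() takes the place of unittest.main(). The class and test names below are illustrative and do not exist in the apex test suite.

import torch
from torch.testing._internal import common_utils


class TestAssertEqualPattern(common_utils.TestCase):
    def test_tensor_comparison(self) -> None:
        expected = torch.randn(4, 8)
        actual = expected.clone()
        # common_utils.TestCase.assertEqual accepts tensors directly and compares
        # their values with default tolerances, mirroring the
        # torch.testing.assert_close calls replaced in the diff above.
        self.assertEqual(actual, expected)


if __name__ == "__main__":
    common_utils.run_tests()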