Skip to content

Commit

Permalink
Merge branch 'lmcafee/converter-tests' into 'main'
Browse files Browse the repository at this point in the history
Checkpoint model converter tests.

See merge request ADLR/megatron-lm!1889
  • Loading branch information
ericharper committed Oct 21, 2024
2 parents 775ed0d + 68ad185 commit 563d5d1
Show file tree
Hide file tree
Showing 6 changed files with 668 additions and 5 deletions.
10 changes: 10 additions & 0 deletions megatron/core/num_microbatches_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,16 @@ def update_num_microbatches(
_GLOBAL_NUM_MICROBATCHES_CALCULATOR.update(consumed_samples, consistency_check, verbose)


def unset_num_microbatches_calculator() -> None:
    """Unset microbatches calculator.

    Resets the module-level ``_GLOBAL_NUM_MICROBATCHES_CALCULATOR`` to ``None``.

    Useful for multiple runs. See
    `tests/unit_tests/ckpt_converter/test_ckpt_converter.py` for an example.
    """
    global _GLOBAL_NUM_MICROBATCHES_CALCULATOR
    _GLOBAL_NUM_MICROBATCHES_CALCULATOR = None


def init_num_microbatches_calculator(
rank: int,
rampup_batch_size: Optional[List[int]],
Expand Down
31 changes: 30 additions & 1 deletion megatron/training/global_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import torch

from megatron.core import Timers
from megatron.core.num_microbatches_calculator import init_num_microbatches_calculator
from megatron.core.num_microbatches_calculator import init_num_microbatches_calculator, unset_num_microbatches_calculator
from megatron.training import dist_signal_handler
from megatron.training.tokenizer import build_tokenizer

Expand Down Expand Up @@ -101,6 +101,35 @@ def set_global_variables(args, build_tokenizer=True):
_set_signal_handler()


def unset_global_variables():
    """Unset global vars.

    Rebinds each module-level ``_GLOBAL_*`` name listed below to ``None`` and
    then calls ``unset_num_microbatches_calculator()``.

    Useful for multiple runs. See
    `tests/unit_tests/ckpt_converter/test_ckpt_converter.py` for an example.
    """

    global _GLOBAL_ARGS
    global _GLOBAL_NUM_MICROBATCHES_CALCULATOR
    global _GLOBAL_TOKENIZER
    global _GLOBAL_TENSORBOARD_WRITER
    global _GLOBAL_WANDB_WRITER
    global _GLOBAL_ONE_LOGGER
    global _GLOBAL_ADLR_AUTORESUME
    global _GLOBAL_TIMERS
    global _GLOBAL_SIGNAL_HANDLER

    _GLOBAL_ARGS = None
    # NOTE(review): this only rebinds a name in *this* module. The calculator
    # helpers are imported from megatron.core.num_microbatches_calculator, so
    # the call at the end of this function is presumably what clears the real
    # instance -- confirm whether this local name is still needed.
    _GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
    _GLOBAL_TOKENIZER = None
    _GLOBAL_TENSORBOARD_WRITER = None
    _GLOBAL_WANDB_WRITER = None
    _GLOBAL_ONE_LOGGER = None
    _GLOBAL_ADLR_AUTORESUME = None
    _GLOBAL_TIMERS = None
    _GLOBAL_SIGNAL_HANDLER = None

    unset_num_microbatches_calculator()


def set_args(args):
    """Store ``args`` in the module-level ``_GLOBAL_ARGS``, replacing any previous value."""
    global _GLOBAL_ARGS
    _GLOBAL_ARGS = args
Expand Down
22 changes: 22 additions & 0 deletions tests/functional_tests/jet_recipes/common.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
type: basic
format_version: 1
maintainers: [mcore]
loggers: [stdout]
spec:
  # `{test_case}` and `{environment}` are placeholders; presumably they are
  # filled from the `products` entries below -- confirm against the recipe runner.
  name: "{test_case}"
  model: common
  build: mcore-pyt-{environment}
  nodes: 1
  gpus: 8
  # Runs the selected test module under torchrun with 8 processes per node.
  script: |-
    ls
    cd /opt/megatron-lm
    torchrun --nproc_per_node=8 -m tests.functional_tests.test_cases.common.{test_case}
products:
  - scope: [mr]
    environment: [lts, dev]
    platforms: [dgx_a100]
    time_limit: [12000]
    test_case:
      - ckpt_converter
Loading

0 comments on commit 563d5d1

Please sign in to comment.