From 37590988a54604a7051d2df371d964806eb56875 Mon Sep 17 00:00:00 2001
From: Huaiwei Sun
Date: Wed, 3 Jan 2024 10:04:42 -0800
Subject: [PATCH] Save checkpoints locally to /mnt/local_storage before
 reporting to train (#42157)

70B finetuning ran into out-of-disk (OOD) issues because it saved
checkpoints to `/tmp`. We should save them to `/mnt/local_storage` instead.

Signed-off-by: Huaiwei Sun
---
 .../04_finetuning_llms_with_deepspeed/finetune_hf_llm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/templates/04_finetuning_llms_with_deepspeed/finetune_hf_llm.py b/doc/source/templates/04_finetuning_llms_with_deepspeed/finetune_hf_llm.py
index 596b87acf34a..ff8bad3fe02b 100644
--- a/doc/source/templates/04_finetuning_llms_with_deepspeed/finetune_hf_llm.py
+++ b/doc/source/templates/04_finetuning_llms_with_deepspeed/finetune_hf_llm.py
@@ -487,7 +487,7 @@ def training_function(kwargs: dict):
             "learning_rate": lr_scheduler.get_lr()[0],
         }
 
-        with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
+        with tempfile.TemporaryDirectory(dir=args.output_dir) as temp_checkpoint_dir:
             accelerator.print(f"Saving the model locally at {temp_checkpoint_dir}")
             accelerator.wait_for_everyone()
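
For context on the one-line fix, here is a minimal sketch (not part of the patch) of the `tempfile` behavior it relies on: `tempfile.TemporaryDirectory` accepts a `dir=` argument that relocates the temporary directory from the default temp root (usually `/tmp`, often a small partition on cluster nodes) to a filesystem with more space. The `output_dir` value below is a stand-in for `args.output_dir` in the patched script, assumed to point at a writable mount such as `/mnt/local_storage`.

```python
import os
import tempfile

# Stand-in for args.output_dir; assumed to be a writable mount with
# enough disk for large checkpoints (e.g. /mnt/local_storage).
output_dir = "/mnt/local_storage"
os.makedirs(output_dir, exist_ok=True)

# Before the patch: the directory lands under the default temp root
# (tempfile.gettempdir(), typically /tmp), e.g. /tmp/tmpab12cd.
with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
    print(temp_checkpoint_dir)

# After the patch: the directory lands under output_dir,
# e.g. /mnt/local_storage/tmpab12cd, so large checkpoint files
# no longer fill up /tmp.
with tempfile.TemporaryDirectory(dir=output_dir) as temp_checkpoint_dir:
    print(temp_checkpoint_dir)
```

The temporary directory is still cleaned up on context exit either way; only its parent changes, which is why the rest of the checkpointing logic in `training_function` needs no other modification.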