From 1ea257f56394ea2711b313a4ec0f36593259096e Mon Sep 17 00:00:00 2001 From: tnixon Date: Thu, 18 May 2023 05:30:06 +0000 Subject: [PATCH] setting memory & fp options --- train_dolly.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/train_dolly.py b/train_dolly.py index b31b42a..02bd875 100644 --- a/train_dolly.py +++ b/train_dolly.py @@ -140,10 +140,20 @@ root_path = os.getcwd() deepspeed_config = os.path.join(root_path, f"config/{gpu_family}_config.json") +tensorboard_display_dir = f"{local_output_dir}/runs" + +print(f"Local Output Dir: {local_output_dir}") +print(f"DBFS Output Dir: {dbfs_output_dir}") +print(f"Tensorboard Display Dir: {tensorboard_display_dir}") + # configure the batch_size batch_size = 3 if gpu_family == "a100": - batch_size = 6 + batch_size = 6 + +# configure CUDA memory management +if gpu_family == "v100": + os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64" # configure num_gpus, if specified num_gpus_flag = "" @@ -152,11 +162,12 @@ num_gpus = int(num_gpus) num_gpus_flag = f"--num_gpus={num_gpus}" -tensorboard_display_dir = f"{local_output_dir}/runs" - -print(f"Local Output Dir: {local_output_dir}") -print(f"DBFS Output Dir: {dbfs_output_dir}") -print(f"Tensorboard Display Dir: {tensorboard_display_dir}") +# configure floating point arithmetic format +float_format_flag = "" +if gpu_family == "a100" or gpu_family == "a10": + float_format_flag = "--bf16=True" +elif gpu_family == "v100": + float_format_flag = "--fp16=True" os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -182,7 +193,8 @@ --eval-steps 50 \ --warmup-steps 50 \ --test-size 200 \ - --lr 5e-6 + --lr 5e-6 \ + {float_format_flag} # COMMAND ----------