Commit
working on several PR review comments
tnixon committed May 26, 2023
1 parent 6d14b0a commit 75db1eb
Showing 4 changed files with 15 additions and 24 deletions.
2 changes: 1 addition & 1 deletion config/a100_config.json → config/a10_a100_config.json
@@ -1,6 +1,6 @@
 {
   "bf16": {
-    "enabled": "auto"
+    "enabled": true
   },
   "optimizer": {
     "type": "AdamW",
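Note: with "enabled": "auto", the Hugging Face Trainer is expected to fill the value in from its own --bf16 command-line argument; hardcoding true makes the config self-contained, which is what lets this commit drop the precision flags from the launch command. A minimal sketch (not part of the commit, assumes the repo root as the working directory) of how the hardcoded value could be checked:

    import json

    # after this commit, bf16 is always on for A10/A100 runs
    with open("config/a10_a100_config.json") as f:
        ds_config = json.load(f)

    assert ds_config["bf16"]["enabled"] is True  # no longer "auto"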
2 changes: 1 addition & 1 deletion config/v100_config.json
@@ -1,6 +1,6 @@
 {
   "fp16": {
-    "enabled": "auto"
+    "enabled": true
   },
   "optimizer": {
     "type": "AdamW",
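Note: the V100 config enables fp16 rather than bf16 because bf16 requires Ampere-class hardware such as the A10 and A100; the Volta-generation V100 only accelerates fp16. An illustrative check (not part of the commit):

    import torch

    # returns False on a V100, which is why its DeepSpeed config uses fp16
    if torch.cuda.is_available():
        print("bf16 supported:", torch.cuda.is_bf16_supported())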
30 changes: 11 additions & 19 deletions train_dolly.py
@@ -134,41 +134,34 @@

 local_output_dir = os.path.join(local_training_root, checkpoint_dir_name)
 dbfs_output_dir = os.path.join(dbfs_output_root, checkpoint_dir_name)

-# pick an appropriate config file
-gpu_family = dbutils.widgets.get("gpu_family")
-root_path = os.getcwd()
-deepspeed_config = os.path.join(root_path, f"config/{gpu_family}_config.json")
-
 tensorboard_display_dir = f"{local_output_dir}/runs"

 print(f"Local Output Dir: {local_output_dir}")
 print(f"DBFS Output Dir: {dbfs_output_dir}")
 print(f"Tensorboard Display Dir: {tensorboard_display_dir}")

+# pick an appropriate config file
+gpu_family = dbutils.widgets.get("gpu_family")
+
+config_file_name = "a10_a100_config.json"
+if gpu_family == "v100":
+    config_file_name = "v100_config.json"
+
+deepspeed_config = os.path.join(os.getcwd(), "config", config_file_name)
+print(f"Deepspeed config file: {deepspeed_config}")
+
 # configure the batch_size
 batch_size = 3
 if gpu_family == "a100":
     batch_size = 6

 # configure CUDA memory management
 if gpu_family == "v100":
     os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"

 # configure num_gpus, if specified
 num_gpus_flag = ""
 num_gpus = dbutils.widgets.get("num_gpus")
 if num_gpus:
     num_gpus = int(num_gpus)
     num_gpus_flag = f"--num_gpus={num_gpus}"

-# configure floating point arithmetic format
-float_format_flag = ""
-if gpu_family == "a100":
-    float_format_flag = "--bf16=True"
-elif gpu_family == "v100":
-    float_format_flag = "--fp16=True"
-
 os.environ["TOKENIZERS_PARALLELISM"] = "false"

 # COMMAND ----------
@@ -193,8 +186,7 @@
     --eval-steps 50 \
     --warmup-steps 50 \
     --test-size 200 \
-    --lr 5e-6 \
-    {float_format_flag}
+    --lr 5e-6

 # COMMAND ----------

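Net effect in train_dolly.py: the gpu_family widget now selects both the DeepSpeed config file and the per-device batch size, the float_format_flag is gone from the deepspeed command line (precision now comes from the config files), and V100 runs cap max_split_size_mb at 64 via PYTORCH_CUDA_ALLOC_CONF to limit fragmentation on the smaller-memory cards. A standalone sketch of the selection logic, using a hypothetical helper name for illustration:

    import os

    def select_training_settings(gpu_family: str, root_path: str = "."):
        # v100 gets its own fp16 config; a10 and a100 share the bf16 config
        config_file_name = "v100_config.json" if gpu_family == "v100" else "a10_a100_config.json"
        deepspeed_config = os.path.join(root_path, "config", config_file_name)
        # the a100 has enough memory for a larger per-device batch
        batch_size = 6 if gpu_family == "a100" else 3
        return deepspeed_config, batch_size

    print(select_training_settings("v100"))  # ('./config/v100_config.json', 3)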
5 changes: 2 additions & 3 deletions training/trainer.py
@@ -192,7 +192,6 @@ def train(
     deepspeed: str,
     gradient_checkpointing: bool,
     local_rank: str,
-    fp16: bool,
     bf16: bool,
     logging_steps: int,
     save_steps: int,
@@ -233,7 +232,8 @@ def train(
         tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8
     )

-    assert not(fp16 and bf16), "You cannot specify both fp16 and bf16 optimizations at the same time!"
+    # enable fp16 if not bf16
+    fp16 = not bf16

     if not dbfs_output_dir:
         logger.warn("Will NOT save to DBFS")
@@ -320,7 +320,6 @@ def train(
     help="Provided by deepspeed to identify which instance this process is when performing multi-GPU training.",
 )
 @click.option("--bf16", type=bool, default=None, help="Whether to use bf16 (preferred on A100's).")
-@click.option("--fp16", type=bool, default=None, help="Whether to use fp16 (preferred on V100's).")
 def main(**kwargs):
     train(**kwargs)

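In trainer.py the mutually exclusive --fp16/--bf16 pair collapses into a single --bf16 switch: fp16 is derived as the complement of bf16, so the removed assertion can no longer fire. A sketch of the resulting invariant (hypothetical helper, not in the repo):

    def resolve_precision(bf16: bool) -> dict:
        # fp16 is enabled exactly when bf16 is not; both can never be set
        fp16 = not bf16
        return {"bf16": bf16, "fp16": fp16}

    assert resolve_precision(True) == {"bf16": True, "fp16": False}
    assert resolve_precision(False) == {"bf16": False, "fp16": True}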
