Commit
working on several PR review comments
tnixon committed May 26, 2023
1 parent 6d14b0a commit 75db1eb
Showing 4 changed files with 15 additions and 24 deletions.
2 changes: 1 addition & 1 deletion config/a100_config.json → config/a10_a100_config.json
@@ -1,6 +1,6 @@
 {
   "bf16": {
-    "enabled": "auto"
+    "enabled": true
   },
   "optimizer": {
     "type": "AdamW",
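Note: with "enabled": "auto", the Hugging Face Trainer is expected to fill the value in from its own --bf16 command-line argument; hardcoding true makes the config self-contained, which is what lets this commit drop the precision flags from the launch command. A minimal sketch (not part of the commit, assumes the repo root as the working directory) of how the hardcoded value could be checked:

    import json

    # after this commit, bf16 is always on for A10/A100 runs
    with open("config/a10_a100_config.json") as f:
        ds_config = json.load(f)

    assert ds_config["bf16"]["enabled"] is True  # no longer "auto"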
2 changes: 1 addition & 1 deletion config/v100_config.json
@@ -1,6 +1,6 @@
 {
   "fp16": {
-    "enabled": "auto"
+    "enabled": true
   },
   "optimizer": {
     "type": "AdamW",
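Note: the V100 config enables fp16 rather than bf16 because bf16 requires Ampere-class hardware such as the A10 and A100; the Volta-generation V100 only accelerates fp16. An illustrative check (not part of the commit):

    import torch

    # returns False on a V100, which is why its DeepSpeed config uses fp16
    if torch.cuda.is_available():
        print("bf16 supported:", torch.cuda.is_bf16_supported())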
30 changes: 11 additions & 19 deletions train_dolly.py
@@ -134,41 +134,34 @@

 local_output_dir = os.path.join(local_training_root, checkpoint_dir_name)
 dbfs_output_dir = os.path.join(dbfs_output_root, checkpoint_dir_name)

-# pick an appropriate config file
-gpu_family = dbutils.widgets.get("gpu_family")
-root_path = os.getcwd()
-deepspeed_config = os.path.join(root_path, f"config/{gpu_family}_config.json")
-
 tensorboard_display_dir = f"{local_output_dir}/runs"

 print(f"Local Output Dir: {local_output_dir}")
 print(f"DBFS Output Dir: {dbfs_output_dir}")
 print(f"Tensorboard Display Dir: {tensorboard_display_dir}")

+# pick an appropriate config file
+gpu_family = dbutils.widgets.get("gpu_family")
+
+config_file_name = "a10_a100_config.json"
+if gpu_family == "v100":
+    config_file_name = "v100_config.json"
+
+deepspeed_config = os.path.join(os.getcwd(), "config", config_file_name)
+print(f"Deepspeed config file: {deepspeed_config}")
+
 # configure the batch_size
 batch_size = 3
 if gpu_family == "a100":
     batch_size = 6

 # configure CUDA memory management
 if gpu_family == "v100":
     os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"

 # configure num_gpus, if specified
 num_gpus_flag = ""
 num_gpus = dbutils.widgets.get("num_gpus")
 if num_gpus:
     num_gpus = int(num_gpus)
     num_gpus_flag = f"--num_gpus={num_gpus}"

-# configure floating point arithmetic format
-float_format_flag = ""
-if gpu_family == "a100":
-    float_format_flag = "--bf16=True"
-elif gpu_family == "v100":
-    float_format_flag = "--fp16=True"
-
 os.environ["TOKENIZERS_PARALLELISM"] = "false"

 # COMMAND ----------
@@ -193,8 +186,7 @@
     --eval-steps 50 \
     --warmup-steps 50 \
     --test-size 200 \
-    --lr 5e-6 \
-    {float_format_flag}
+    --lr 5e-6

 # COMMAND ----------

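Net effect in train_dolly.py: the gpu_family widget now selects both the DeepSpeed config file and the per-device batch size, the float_format_flag is gone from the deepspeed command line (precision now comes from the config files), and V100 runs cap max_split_size_mb at 64 via PYTORCH_CUDA_ALLOC_CONF to limit fragmentation on the smaller-memory cards. A standalone sketch of the selection logic, using a hypothetical helper name for illustration:

    import os

    def select_training_settings(gpu_family: str, root_path: str = "."):
        # v100 gets its own fp16 config; a10 and a100 share the bf16 config
        config_file_name = "v100_config.json" if gpu_family == "v100" else "a10_a100_config.json"
        deepspeed_config = os.path.join(root_path, "config", config_file_name)
        # the a100 has enough memory for a larger per-device batch
        batch_size = 6 if gpu_family == "a100" else 3
        return deepspeed_config, batch_size

    print(select_training_settings("v100"))  # ('./config/v100_config.json', 3)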
5 changes: 2 additions & 3 deletions training/trainer.py
@@ -192,7 +192,6 @@ def train(
     deepspeed: str,
     gradient_checkpointing: bool,
     local_rank: str,
-    fp16: bool,
     bf16: bool,
     logging_steps: int,
     save_steps: int,
@@ -233,7 +232,8 @@ def train(
         tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8
     )

-    assert not(fp16 and bf16), "You cannot specify both fp16 and bf16 optimizations at the same time!"
+    # enable fp16 if not bf16
+    fp16 = not bf16

     if not dbfs_output_dir:
         logger.warn("Will NOT save to DBFS")
@@ -320,7 +320,6 @@ def train(
     help="Provided by deepspeed to identify which instance this process is when performing multi-GPU training.",
 )
 @click.option("--bf16", type=bool, default=None, help="Whether to use bf16 (preferred on A100's).")
-@click.option("--fp16", type=bool, default=None, help="Whether to use fp16 (preferred on V100's).")
 def main(**kwargs):
     train(**kwargs)

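In trainer.py the mutually exclusive --fp16/--bf16 pair collapses into a single --bf16 switch: fp16 is derived as the complement of bf16, so the removed assertion can no longer fire. A sketch of the resulting invariant (hypothetical helper, not in the repo):

    def resolve_precision(bf16: bool) -> dict:
        # fp16 is enabled exactly when bf16 is not; both can never be set
        fp16 = not bf16
        return {"bf16": bf16, "fp16": fp16}

    assert resolve_precision(True) == {"bf16": True, "fp16": False}
    assert resolve_precision(False) == {"bf16": False, "fp16": True}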
