Skip to content

Commit

Permalink
setting memory & fp options
Browse files: browse the repository at this point in the history
  • Loading branch information
tnixon committed May 18, 2023
1 parent e021eb4 commit 1ea257f
Showing 1 changed file with 19 additions and 7 deletions.
26 changes: 19 additions & 7 deletions train_dolly.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,20 @@
root_path = os.getcwd()
deepspeed_config = os.path.join(root_path, f"config/{gpu_family}_config.json")

tensorboard_display_dir = f"{local_output_dir}/runs"

print(f"Local Output Dir: {local_output_dir}")
print(f"DBFS Output Dir: {dbfs_output_dir}")
print(f"Tensorboard Display Dir: {tensorboard_display_dir}")

# configure the batch_size
batch_size = 3
if gpu_family == "a100":
batch_size = 6
batch_size = 6

# configure CUDA memory management
if gpu_family == "v100":
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"

# configure num_gpus, if specified
num_gpus_flag = ""
Expand All @@ -152,11 +162,12 @@
num_gpus = int(num_gpus)
num_gpus_flag = f"--num_gpus={num_gpus}"

tensorboard_display_dir = f"{local_output_dir}/runs"

print(f"Local Output Dir: {local_output_dir}")
print(f"DBFS Output Dir: {dbfs_output_dir}")
print(f"Tensorboard Display Dir: {tensorboard_display_dir}")
# configure floating point arithmetic format
float_format_flag = ""
if gpu_family == "a100" or gpu_family == "a10":
float_format_flag = "--bf16=True"
elif gpu_family == "v100":
float_format_flag = "--fp16=True"

os.environ["TOKENIZERS_PARALLELISM"] = "false"

Expand All @@ -182,7 +193,8 @@
--eval-steps 50 \
--warmup-steps 50 \
--test-size 200 \
--lr 5e-6
--lr 5e-6 \
{float_format_flag}

# COMMAND ----------

Expand Down

0 comments on commit 1ea257f

Please sign in to comment.