fix optimizer choice, use same default params for train_gpu
kimiyoung committed Jun 29, 2019
1 parent 8f17cfd · commit 70cae32
Showing 2 changed files with 22 additions and 20 deletions.
model_utils.py: 10 changes (6 additions, 4 deletions)
@@ -122,19 +122,21 @@ def get_train_op(FLAGS, total_loss, grads_and_vars=None):
   learning_rate = tf.where(global_step < FLAGS.warmup_steps,
                            warmup_lr, decay_lr)
 
+  if (FLAGS.weight_decay > 0 and not FLAGS.use_tpu and
+      FLAGS.num_core_per_host > 1):
+    raise ValueError("Do not support `weight_decay > 0` with multi-gpu "
+                     "training so far.")
+
   if FLAGS.weight_decay == 0:
     optimizer = tf.train.AdamOptimizer(
         learning_rate=learning_rate,
         epsilon=FLAGS.adam_epsilon)
-  elif FLAGS.weight_decay > 0 and FLAGS.num_core_per_host == 1:
+  else:
     optimizer = AdamWeightDecayOptimizer(
         learning_rate=learning_rate,
         epsilon=FLAGS.adam_epsilon,
         exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
         weight_decay_rate=FLAGS.weight_decay)
-  else:
-    raise ValueError("Do not support `weight_decay > 0` with multi-gpu "
-                     "training so far.")
 
   if FLAGS.use_tpu:
     optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
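The behavioral change in model_utils.py: previously, any run with `weight_decay > 0` and `num_core_per_host > 1` fell into the error branch, even on TPU; the patched code rejects only the non-TPU multi-GPU combination up front and otherwise uses AdamWeightDecayOptimizer whenever weight decay is non-zero. Below is a minimal sketch of the new selection logic, using a hypothetical helper `choose_optimizer` written only for illustration (the real function builds TensorFlow optimizer objects from FLAGS).

# Illustrative sketch of the optimizer choice after this commit; it returns
# optimizer names instead of constructing TensorFlow objects.
def choose_optimizer(weight_decay, use_tpu, num_core_per_host):
  # The unsupported combination is now rejected up front.
  if weight_decay > 0 and not use_tpu and num_core_per_host > 1:
    raise ValueError("Do not support `weight_decay > 0` with multi-gpu "
                     "training so far.")
  if weight_decay == 0:
    return "AdamOptimizer"
  return "AdamWeightDecayOptimizer"

# Before the fix, a TPU run with weight decay and >1 core hit the error branch;
# now it selects the weight-decay optimizer.
assert choose_optimizer(0.01, use_tpu=True, num_core_per_host=8) == "AdamWeightDecayOptimizer"
assert choose_optimizer(0.0, use_tpu=False, num_core_per_host=1) == "AdamOptimizer"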
train_gpu.py: 32 changes (16 additions, 16 deletions)
@@ -39,12 +39,12 @@
       help="checkpoint path for initializing the model.")
 
 # Optimization config
-flags.DEFINE_float("learning_rate", default=2.5e-4,
+flags.DEFINE_float("learning_rate", default=1e-4,
       help="Maximum learning rate.")
-flags.DEFINE_float("clip", default=0.25,
+flags.DEFINE_float("clip", default=1.0,
       help="Gradient clipping value.")
 # for cosine decay
-flags.DEFINE_float("min_lr_ratio", default=0.004,
+flags.DEFINE_float("min_lr_ratio", default=0.001,
       help="Minimum ratio learning rate.")
 flags.DEFINE_integer("warmup_steps", default=0,
       help="Number of steps for linear lr warmup.")
@@ -56,13 +56,13 @@
       help="weight decay")
 
 # Training config
-flags.DEFINE_integer("train_batch_size", default=60,
+flags.DEFINE_integer("train_batch_size", default=16,
       help="Size of train batch.")
 flags.DEFINE_integer("train_steps", default=100000,
       help="Total number of training steps.")
-flags.DEFINE_integer("iterations", default=500,
+flags.DEFINE_integer("iterations", default=1000,
       help="Number of iterations per repeat loop.")
-flags.DEFINE_integer("save_steps", default=10000,
+flags.DEFINE_integer("save_steps", default=None,
       help="number of steps for model checkpointing.")
 
 # Data config
@@ -73,7 +73,7 @@
       "Could be half of seq_len")
 flags.DEFINE_bool("bi_data", default=True,
       help="Use bidirectional data streams, i.e., forward & backward.")
-flags.DEFINE_integer("mask_alpha", default=2,
+flags.DEFINE_integer("mask_alpha", default=6,
       help="How many tokens to form a group.")
 flags.DEFINE_integer("mask_beta", default=1,
       help="How many tokens to mask within each group.")
@@ -86,7 +86,7 @@
 flags.DEFINE_integer("n_token", 32000, help="Vocab size")
 
 # Model config
-flags.DEFINE_integer("mem_len", default=70,
+flags.DEFINE_integer("mem_len", default=0,
       help="Number of steps to cache")
 flags.DEFINE_bool("same_length", default=False,
       help="Same length attention")
@@ -95,23 +95,23 @@
 
 flags.DEFINE_integer("n_layer", default=6,
       help="Number of layers.")
-flags.DEFINE_integer("d_model", default=500,
+flags.DEFINE_integer("d_model", default=32,
      help="Dimension of the model.")
-flags.DEFINE_integer("d_embed", default=500,
+flags.DEFINE_integer("d_embed", default=32,
      help="Dimension of the embeddings.")
-flags.DEFINE_integer("n_head", default=10,
+flags.DEFINE_integer("n_head", default=4,
      help="Number of attention heads.")
-flags.DEFINE_integer("d_head", default=50,
+flags.DEFINE_integer("d_head", default=8,
      help="Dimension of each attention head.")
-flags.DEFINE_integer("d_inner", default=1000,
+flags.DEFINE_integer("d_inner", default=32,
      help="Dimension of inner hidden size in positionwise feed-forward.")
-flags.DEFINE_float("dropout", default=0.1,
+flags.DEFINE_float("dropout", default=0.0,
      help="Dropout rate.")
-flags.DEFINE_float("dropatt", default=0.1,
+flags.DEFINE_float("dropatt", default=0.0,
      help="Attention dropout rate.")
 flags.DEFINE_bool("untie_r", default=False,
      help="Untie r_w_bias and r_r_bias")
-flags.DEFINE_string("summary_type", default="attn",
+flags.DEFINE_string("summary_type", default="last",
      help="Method used to summarize a sequence into a compact vector.")
 flags.DEFINE_string("ff_activation", default="relu",
      help="Activation type used in position-wise feed-forward.")
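Taken together, the new train_gpu.py defaults look like a small test-scale configuration (d_model=32, 4 heads of size 8, no dropout, no memory), presumably aligned with the defaults used elsewhere in the repository, per the commit message. A quick sanity check, written only for this note, that both the old and new defaults keep d_model equal to n_head * d_head:

# Illustrative check of the flag defaults before and after this commit.
old_defaults = dict(d_model=500, d_embed=500, n_head=10, d_head=50,
                    d_inner=1000, dropout=0.1, dropatt=0.1, mem_len=70)
new_defaults = dict(d_model=32, d_embed=32, n_head=4, d_head=8,
                    d_inner=32, dropout=0.0, dropatt=0.0, mem_len=0)

for name, cfg in [("old", old_defaults), ("new", new_defaults)]:
  # The model width factors into the attention heads in both configurations.
  assert cfg["d_model"] == cfg["n_head"] * cfg["d_head"], name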
