# BIOHAZARD -- DO NOT EDIT THIS FILE
label: "default"
description: |
  default configuration
  next line of description
  last line
# SGD parameters
learning_rate: 0.001 # initial learning rate
sgd_learning_rate: 1.0 # SGD can start at a different learning rate (useful for switching between Adam and SGD)
learning_rate_decay_factor: 1.0 # decay the learning rate by this factor at a given frequency
decay_every_n_epoch: null # decay the learning rate every this many epochs; can be lower than one (e.g. 0.5 for every half epoch)
decay_after_n_epoch: 0 # start decaying learning rate after this many epochs
decay_if_no_progress: null # measure average loss over this many checkpoints, and decay if no progress
sgd_after_n_epoch: null # start using SGD optimizer after this many epochs (instead of Adam/AdaDelta)
min_learning_rate: 0.000001 # stop training when learning rate is smaller than this
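# Example (illustrative values, not the defaults): halve the learning rate every epoch once
# training reaches epoch 4, and switch from Adam to plain SGD after epoch 10:
#   learning_rate_decay_factor: 0.5
#   decay_every_n_epoch: 1
#   decay_after_n_epoch: 4
#   sgd_after_n_epoch: 10
#   sgd_learning_rate: 0.5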
# training parameters
max_gradient_norm: 5.0 # clip gradients to this norm (prevents exploding gradient)
steps_per_checkpoint: 10000 # number of SGD updates between each checkpoint
steps_per_eval: 10000 # number of SGD updates between each BLEU eval (on dev set)
eval_burn_in: 0 # minimum number of updates before starting BLEU eval
max_steps: 0 # maximum number of updates before stopping
max_epochs: 0 # maximum number of epochs before stopping
keep_best: 4 # number of best checkpoints to keep (based on BLEU score on dev set)
feed_previous: 0.0 # randomly feed prev output instead of ground truth to decoder during training ([0,1] proba)
optimizer: adam # which training algorithm to use ('sgd', 'adadelta', or 'adam')
moving_average: null # TODO
# dropout
pervasive_dropout: False # same dropout mask for all elements in same batch/same sequence (see Gal, 2015)
rnn_input_dropout: 0.0
rnn_output_dropout: 0.0
rnn_state_dropout: 0.0
initial_state_dropout: 0.0
word_dropout: 0.0
input_layer_dropout: 0.0
output_dropout: 0.0 # TODO
use_dropout: False
layer_norm: False
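# Example (illustrative values; assumption: use_dropout must be True for the individual rates above to take effect):
#   use_dropout: True
#   rnn_input_dropout: 0.2
#   rnn_output_dropout: 0.2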
# initialization parameters
weight_scale: null # if null, initialize weights to TF defaults, otherwise to normal distribution with this stdev
initializer: null # if 'uniform' initialize uniformly between [-weight_scale, +weight_scale] instead
orthogonal_init: False # initialize recurrent connections with an orthogonal matrix
# batch iteration parameters
batch_size: 80 # batch size (during training and greedy decoding)
batch_mode: standard # standard (cycle through train set) or random (sample from train set)
shuffle: True # shuffle dataset at each new epoch
read_ahead: 10 # number of batches to read ahead and sort by sequence length (can speed up training)
reverse_input: False # reverse input sequences
# loss function
loss_function: xent # 'xent'
# model (each one of these settings can be defined specifically in 'encoders' and 'decoders', or generally here)
cell_size: 1000 # size of the RNN cells
embedding_size: 620 # size of the embeddings
attn_size: 1000 # size of the attention layer
layers: 1 # number of RNN layers per encoder and decoder
cell_type: LSTM # LSTM, GRU, DropoutGRU
character_level: False # character-level sequences
max_len: 50 # max length of the input and output sequences (strongly affects speed and memory usage)
truncate_lines: True # if True, truncate lines that are too long; otherwise drop them
# encoder settings
bidir: True # use bidirectional encoders
attention_type: global # global, local, none, last_state, average
attn_window_size: 0 # window size for local attention mechanism
convolutions: null # list of convolutions to perform on the input sequence
maxout_stride: null # maxout layer with this stride on the input sequence (after convolutions)
train_initial_states: True # whether the initial states of the encoder should be trainable parameters
bidir_projection: False # project bidirectional encoder states to cell_size (or just keep the concatenation)
time_pooling: null # perform time pooling (skip states) between the layers of the encoder (list of layers - 1 ratios)
pooling_avg: True # average or skip consecutive states
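# Example (illustrative): with 'layers: 3', 'time_pooling: [2, 2]' halves the sequence length between
# consecutive encoder layers, either averaging (pooling_avg: True) or skipping (False) consecutive states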
binary: False # use binary input for the encoder (no vocab and no embeddings, see utils.read_binary_features)
attn_filters: 0
attn_filter_length: 0
input_layers: null # list of fully connected layer sizes, applied before the encoder
attn_temperature: 1.0 # 1.0: true softmax (low values: uniform distribution, high values: argmax)
final_state: last # last (default), concat_last, average
highway_layers: 0 # number of highway layers before the encoder (after convolutions and maxout)
# decoder settings
tie_embeddings: False # use transpose of the embedding matrix for output projection (requires 'output_extra_proj')
use_previous_word: True # use previous word when predicting a new word
attn_prev_word: False # use the previous word in the attention model
softmax_temperature: 1.0 # TODO: temperature of the output softmax
pred_edits: False # output is a sequence of edits, apply those edits before decoding/evaluating
conditional_rnn: False # two-layer decoder, where the 1st layer is used for attention, and the 2nd layer for prediction
generate_first: True # generate next word before updating state (look->generate->update)
update_first: False # update state before looking and generating next word
rnn_feed_attn: True # feed attention context to the RNN's transition function
use_lstm_full_state: False # use LSTM's full state for attention and next word prediction
pred_embed_proj: True # project decoder output to embedding size before projecting to vocab size
pred_deep_layer: False # add a non-linear transformation just before softmax
pred_maxout_layer: True # use a maxout layer just before the vocabulary projection and softmax
aggregation_method: concat # how to combine the attention contexts of multiple encoders (concat, sum)
# data
max_train_size: 0 # maximum size of the training data (0 for unlimited)
max_dev_size: 0 # maximum size of the dev data
max_test_size: 0 # maximum size of the test data
data_dir: data # directory containing the training data
model_dir: model # directory where the model will be saved (checkpoints and eval outputs)
train_prefix: train # name of the training corpus
script_dir: scripts # directory where the scripts are kept (in particular the scoring scripts)
dev_prefix: [dev] # names of the development corpora
vocab_prefix: vocab # name of the vocabulary files
checkpoints: [] # list of checkpoints to load (in this specific order) after main checkpoint
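# Example (illustrative; corpus names and checkpoint path are made up): evaluate on two dev
# corpora and load an extra checkpoint after the main one:
#   dev_prefix: [dev, dev2]
#   checkpoints: [other_model/best]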
# decoding
score_function: corpus_scores # name of the main scoring function, inside 'evaluation.py' (used for selecting models)
post_process_script: null # path to post-processing script (called before evaluating)
remove_unk: False # remove UNK symbols from the decoder output
beam_size: 1 # beam size for decoding (decoder is greedy by default)
ensemble: False # use an ensemble of models while decoding (specified by the --checkpoints parameter)
output: null # output file for decoding (writes to standard output by default)
len_normalization: 1.0 # length normalization coefficient used in beam-search decoder
early_stopping: True # reduce beam-size each time a finished hypothesis is encountered (affects decoding speed)
raw_output: False # output translation hypotheses without any post-processing
average: False # like ensemble, but instead of averaging the log-probs, average all parameters
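# Example (illustrative values): beam-search decoding with length normalization, dropping UNK symbols:
#   beam_size: 8
#   len_normalization: 0.6
#   remove_unk: True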
# general
gpu_id: 0 # index of the GPU to use (starts at zero)
no_gpu: False # run on CPU only
allow_growth: True # allow GPU memory allocation to change during runtime
mem_fraction: 1.0 # maximum fraction of GPU memory to use
freeze_variables: [] # list of variables to freeze during training
log_file: log.txt # log to this file in addition to standard output
parallel_iterations: 16 # parameter of the decoder's while loop (trade-off speed / memory usage)
swap_memory: True # parameter of the decoder's while loop (more flexible memory management)
max_to_keep: 1 # keep this many of the latest checkpoints (null for unlimited)
keep_every_n_hours: 0 # keep checkpoints every n hours
embeddings_on_cpu: True # keep embeddings in main memory instead of GPU memory
encoders: # this is a list (you can specify several encoders)
  - name: code
    max_len: 200
decoders: # Each encoder or decoder can redefine its own values for a number of parameters,
  - name: nl # including `cell_size`, `embedding_size` and `attn_size`
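# Example (illustrative, not part of the defaults; the second encoder name is made up):
# two encoders and one decoder, each overriding some of the global model settings above:
#   encoders:
#     - name: code
#       max_len: 200
#       cell_size: 500
#     - name: doc
#       embedding_size: 300
#   decoders:
#     - name: nl
#       attn_size: 500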