# BIOHAZARD -- DO NOT EDIT THIS FILE
label: "default"
description: |
  default configuration
  next line of description
  last line
# SGD parameters
learning_rate: 0.001 # initial learning rate
sgd_learning_rate: 1.0 # SGD can start at a different learning rate (useful for switching between Adam and SGD)
learning_rate_decay_factor: 1.0 # decay the learning rate by this factor at a given frequency
decay_every_n_epoch: null # decay the learning rate every this many epochs; can be lower than one (e.g. 0.5 for every half epoch)
decay_after_n_epoch: 0 # start decaying learning rate after this many epochs
decay_if_no_progress: null # measure average loss over this many checkpoints, and decay if no progress
sgd_after_n_epoch: null # start using SGD optimizer after this many epochs (instead of Adam/AdaDelta)
min_learning_rate: 0.000001 # stop training when learning rate is smaller than this
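# Example (illustrative values, not the defaults): halve the learning rate every epoch once
# training reaches epoch 4, and switch from Adam to plain SGD after epoch 10:
#   learning_rate_decay_factor: 0.5
#   decay_every_n_epoch: 1
#   decay_after_n_epoch: 4
#   sgd_after_n_epoch: 10
#   sgd_learning_rate: 0.5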
# training parameters
max_gradient_norm: 5.0 # clip gradients to this norm (prevents exploding gradient)
steps_per_checkpoint: 10000 # number of SGD updates between each checkpoint
steps_per_eval: 10000 # number of SGD updates between each BLEU eval (on dev set)
eval_burn_in: 0 # minimum number of updates before starting BLEU eval
max_steps: 0 # maximum number of updates before stopping
max_epochs: 0 # maximum number of epochs before stopping
keep_best: 4 # number of best checkpoints to keep (based on BLEU score on dev set)
feed_previous: 0.0 # randomly feed prev output instead of ground truth to decoder during training ([0,1] proba)
optimizer: adam # which training algorithm to use ('sgd', 'adadelta', or 'adam')
moving_average: null # TODO
# dropout
pervasive_dropout: False # same dropout mask for all elements in same batch/same sequence (see Gal, 2015)
rnn_input_dropout: 0.0
rnn_output_dropout: 0.0
rnn_state_dropout: 0.0
initial_state_dropout: 0.0
word_dropout: 0.0
input_layer_dropout: 0.0
output_dropout: 0.0 # TODO
use_dropout: False
layer_norm: False
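# Example (illustrative values; assumption: use_dropout must be True for the individual rates above to take effect):
#   use_dropout: True
#   rnn_input_dropout: 0.2
#   rnn_output_dropout: 0.2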
# initialization parameters
weight_scale: null # if null, initialize weights to TF defaults, otherwise to normal distribution with this stdev
initializer: null # if 'uniform' initialize uniformly between [-weight_scale, +weight_scale] instead
orthogonal_init: False # initialize recurrent connections with an orthogonal matrix
# batch iteration parameters
batch_size: 80 # batch size (during training and greedy decoding)
batch_mode: standard # standard (cycle through train set) or random (sample from train set)
shuffle: True # shuffle dataset at each new epoch
read_ahead: 10 # number of batches to read ahead and sort by sequence length (can speed up training)
reverse_input: False # reverse input sequences
# loss function
loss_function: xent # 'xent'
# model (each one of these settings can be defined specifically in 'encoders' and 'decoders', or generally here)
cell_size: 1000 # size of the RNN cells
embedding_size: 620 # size of the embeddings
attn_size: 1000 # size of the attention layer
layers: 1 # number of RNN layers per encoder and decoder
cell_type: LSTM # LSTM, GRU, DropoutGRU
character_level: False # character-level sequences
max_len: 50 # max length of the input and output sequences (strongly affects speed and memory usage)
truncate_lines: True # if True, truncate lines that are too long; otherwise drop them
# encoder settings
bidir: True # use bidirectional encoders
attention_type: global # global, local, none, last_state, average
attn_window_size: 0 # window size for local attention mechanism
convolutions: null # list of convolutions to perform on the input sequence
maxout_stride: null # maxout layer with this stride on the input sequence (after convolutions)
train_initial_states: True # whether the initial states of the encoder should be trainable parameters
bidir_projection: False # project bidirectional encoder states to cell_size (or just keep the concatenation)
time_pooling: null # perform time pooling (skip states) between the layers of the encoder (list of layers - 1 ratios)
pooling_avg: True # average or skip consecutive states
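# Example (illustrative): with 'layers: 3', 'time_pooling: [2, 2]' halves the sequence length between
# consecutive encoder layers, either averaging (pooling_avg: True) or skipping (False) consecutive states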
binary: False # use binary input for the encoder (no vocab and no embeddings, see utils.read_binary_features)
attn_filters: 0
attn_filter_length: 0
input_layers: null # list of fully connected layer sizes, applied before the encoder
attn_temperature: 1.0 # 1.0: true softmax (low values: uniform distribution, high values: argmax)
final_state: last # last (default), concat_last, average
highway_layers: 0 # number of highway layers before the encoder (after convolutions and maxout)
# decoder settings
tie_embeddings: False # use transpose of the embedding matrix for output projection (requires 'output_extra_proj')
use_previous_word: True # use previous word when predicting a new word
attn_prev_word: False # use the previous word in the attention model
softmax_temperature: 1.0 # TODO: temperature of the output softmax
pred_edits: False # output is a sequence of edits, apply those edits before decoding/evaluating
conditional_rnn: False # two-layer decoder, where the 1st layer is used for attention, and the 2nd layer for prediction
generate_first: True # generate next word before updating state (look->generate->update)
update_first: False # update state before looking and generating next word
rnn_feed_attn: True # feed attention context to the RNN's transition function
use_lstm_full_state: False # use LSTM's full state for attention and next word prediction
pred_embed_proj: True # project decoder output to embedding size before projecting to vocab size
pred_deep_layer: False # add a non-linear transformation just before softmax
pred_maxout_layer: True # use a maxout layer just before the vocabulary projection and softmax
aggregation_method: concat # how to combine the attention contexts of multiple encoders (concat, sum)
# data
max_train_size: 0 # maximum size of the training data (0 for unlimited)
max_dev_size: 0 # maximum size of the dev data
max_test_size: 0 # maximum size of the test data
data_dir: data # directory containing the training data
model_dir: model # directory where the model will be saved (checkpoints and eval outputs)
train_prefix: train # name of the training corpus
script_dir: scripts # directory where the scripts are kept (in particular the scoring scripts)
dev_prefix: [dev] # names of the development corpora
vocab_prefix: vocab # name of the vocabulary files
checkpoints: [] # list of checkpoints to load (in this specific order) after main checkpoint
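# Example (illustrative; corpus names and checkpoint path are made up): evaluate on two dev
# corpora and load an extra checkpoint after the main one:
#   dev_prefix: [dev, dev2]
#   checkpoints: [other_model/best]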
# decoding
score_function: corpus_scores # name of the main scoring function, inside 'evaluation.py' (used for selecting models)
post_process_script: null # path to post-processing script (called before evaluating)
remove_unk: False # remove UNK symbols from the decoder output
beam_size: 1 # beam size for decoding (decoder is greedy by default)
ensemble: False # use an ensemble of models while decoding (specified by the --checkpoints parameter)
output: null # output file for decoding (writes to standard output by default)
len_normalization: 1.0 # length normalization coefficient used in beam-search decoder
early_stopping: True # reduce beam-size each time a finished hypothesis is encountered (affects decoding speed)
raw_output: False # output translation hypotheses without any post-processing
average: False # like ensemble, but instead of averaging the log-probs, average all parameters
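# Example (illustrative values): beam-search decoding with length normalization, dropping UNK symbols:
#   beam_size: 8
#   len_normalization: 0.6
#   remove_unk: True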
# general
gpu_id: 0 # index of the GPU to use (starts at zero)
no_gpu: False # run on CPU only
allow_growth: True # allow GPU memory allocation to change during runtime
mem_fraction: 1.0 # maximum fraction of GPU memory to use
freeze_variables: [] # list of variables to freeze during training
log_file: log.txt # log to this file in addition to standard output
parallel_iterations: 16 # parameter of the decoder's while loop (trade-off speed / memory usage)
swap_memory: True # parameter of the decoder's while loop (more flexible memory management)
max_to_keep: 1 # keep this many of the latest checkpoints (null for unlimited)
keep_every_n_hours: 0 # keep checkpoints every n hours
embeddings_on_cpu: True # keep embeddings in main memory instead of GPU memory
encoders: # this is a list (you can specify several encoders)
  - name: code
    max_len: 200
decoders: # Each encoder or decoder can redefine its own values for a number of parameters,
  - name: nl # including `cell_size`, `embedding_size` and `attn_size`
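# Example (illustrative, not part of the defaults; the second encoder name is made up):
# two encoders and one decoder, each overriding some of the global model settings above:
#   encoders:
#     - name: code
#       max_len: 200
#       cell_size: 500
#     - name: doc
#       embedding_size: 300
#   decoders:
#     - name: nl
#       attn_size: 500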