#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Authors: Andrea Azzarone <[email protected]>
#
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import codecs
import numpy as np
import os
from progress.bar import Bar
from six.moves import xrange # pylint: disable=redefined-builtin
from six.moves import zip # pylint: disable=redefined-builtin
import tensorflow as tf
import tensorflow_fold.public.blocks as td
flags = tf.app.flags
FLAGS = flags.FLAGS
td.define_plan_flags(default_plan_name='w2v-lstm')
# Generic options
flags.DEFINE_string('train_set', 'data/glove.6B/glove.6B.50d.train.txt', 'Filepath to trainset.')
flags.DEFINE_string('dev_set', 'data/glove.6B/glove.6B.50d.dev.txt', 'Filepath to devset.')
flags.DEFINE_string('test_set', '', 'Filepath to testset.')
flags.DEFINE_string('test_output', '', 'Filepath to output file.')
flags.DEFINE_integer('char_embedding_size', 8, 'Size of the char embeddings.')
flags.DEFINE_integer('word_embedding_size', 50, 'Size of the word embeddings.')
# LSTM specific flags
flags.DEFINE_integer('num_units', 150, 'Size of LSTM memory.')
flags.DEFINE_float('forget_bias', 1.0, 'The bias added to forget gates.')
flags.DEFINE_boolean('layer_norm', True, 'If True, layer normalization will be applied.')
flags.DEFINE_float('norm_gain', 1.0, 'The layer normalization gain initial value. If layer_norm has been set to False, this argument will be ignored.')
flags.DEFINE_float('norm_shift', 0.0, 'The layer normalization shift initial value. If layer_norm has been set to False, this argument will be ignored.')
flags.DEFINE_float('dropout_keep_prob', 1.0, 'Float between 0 and 1 representing the recurrent dropout probability value. If 1.0, no dropout will be applied.')
flags.DEFINE_integer('dropout_prob_seed', None, 'The randomness seed.')
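
# Note: td.define_plan_flags above also registers the standard Fold Plan
# flags; the code below relies on --logdir_base and --run_id, and --mode
# selects between train/eval/infer. A typical run, assuming those default
# flag names (paths illustrative), looks like:
#   python model.py --mode=train --logdir_base=/tmp/plans --run_id=0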

def load_char2index():
  filepath = os.path.join(os.path.dirname(FLAGS.train_set), 'chars.txt')
  assert os.path.exists(filepath), "%s does not exist." % filepath
  with codecs.open(filepath, encoding='utf-8') as f:
    lines = f.read().splitlines()
  assert len(lines) > 0, "Invalid char vocabulary file: %s is empty." % filepath
  char2index = dict()
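  # The first line of chars.txt is skipped below; presumably it holds a
  # header (e.g. the vocabulary size) rather than an actual character.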
  for i, char in enumerate(lines[1:]):
    char2index[char] = i
  return char2index
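
# Unknown characters map to index len(vocab); this matches the extra (+1)
# row allocated for them in the char embedding table in setup_plan.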
def char2index(vocab, char):
  if char in vocab:
    return vocab[char]
  else:
    return len(vocab)

def load_trainset():
  return load_set(FLAGS.train_set)

def load_devset():
  return load_set(FLAGS.dev_set)
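
# The test set is expected to contain one bare word per line (no embedding
# vector), unlike the GloVe-style lines consumed by load_set below.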
def load_testset():
  filename = FLAGS.test_set
  words = []
  with codecs.open(filename, encoding='utf-8') as f:
    bar = Bar('Loading dataset %s' % filename, max=get_size_set(filename))
    for word in f:
      bar.next()
      word = word.strip()
      if not word: continue
      words.append(word)
    bar.finish()
  return words

def get_size_set(filename):
  with codecs.open(filename, 'r', encoding='utf-8') as f:
    lines = f.readlines()
  return len(lines)
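
# Train/dev files follow the GloVe text format: one entry per line, a word
# followed by word_embedding_size floats, e.g. (values illustrative):
#   the 0.418 0.24968 -0.41242 0.1217 ...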
def load_set(filename):
  words = []
  embeddings = []
  with codecs.open(filename, encoding='utf-8') as f:
    bar = Bar('Loading dataset %s' % filename, max=get_size_set(filename))
    for line in f:
      bar.next()
      line = line.strip()
      if not line: continue
      word, vec = line.split(u' ', 1)
      emb = np.array(vec.split(), dtype=np.float32)
      if len(emb) != FLAGS.word_embedding_size: continue
      words.append(word)
      embeddings.append(emb)
    bar.finish()
  return words, embeddings

def setup_plan(plan):
  # Save the parameters used for this run.
  with open(os.path.join(FLAGS.logdir_base, str(FLAGS.run_id) + '-params.txt'), 'w') as f:
    f.write(str(FLAGS.__dict__['__flags']))
  # Convert a word into a list of integers using chars.txt.
  vocab = load_char2index()
  word2integers = td.InputTransform(lambda s: [char2index(vocab, c) for c in s])
  # Create a placeholder for dropout; it defaults to 1.0 (no dropout) unless
  # fed during training.
  keep_prob = tf.placeholder_with_default(1.0, [], name='keep_prob')
  # The forward and backward LSTM cells.
  fw_char_cell = td.ScopedLayer(
      tf.contrib.rnn.LayerNormBasicLSTMCell(num_units=FLAGS.num_units,
                                            forget_bias=FLAGS.forget_bias,
                                            layer_norm=FLAGS.layer_norm,
                                            norm_gain=FLAGS.norm_gain,
                                            norm_shift=FLAGS.norm_shift,
                                            dropout_keep_prob=keep_prob,
                                            dropout_prob_seed=FLAGS.dropout_prob_seed),
      'fw_char_cell')
  bw_char_cell = td.ScopedLayer(
      tf.contrib.rnn.LayerNormBasicLSTMCell(num_units=FLAGS.num_units,
                                            forget_bias=FLAGS.forget_bias,
                                            layer_norm=FLAGS.layer_norm,
                                            norm_gain=FLAGS.norm_gain,
                                            norm_shift=FLAGS.norm_shift,
                                            dropout_keep_prob=keep_prob,
                                            dropout_prob_seed=FLAGS.dropout_prob_seed),
      'bw_char_cell')
  # int -> char embedding (+1 for unk values)
  char_embedding = (td.Scalar('int32') >>
                    td.Function(td.Embedding(len(vocab) + 1, FLAGS.char_embedding_size)))
  # word -> matrix of char embeddings
  word2matrix = word2integers >> td.Map(char_embedding)
  # word -> word embedding
  fw_pass = (td.RNN(fw_char_cell) >>
             td.GetItem(1) >> td.GetItem(1))
  reverse_word = td.Slice(step=-1)
  bw_pass = (reverse_word >>
             td.RNN(bw_char_cell) >>
             td.GetItem(1) >> td.GetItem(1))
  # Bidirectional lstm
  word_embedding = (word2matrix >>
                    td.AllOf(fw_pass, bw_pass) >>
                    td.Concat() >>
                    td.Function(td.FC(num_units_out=FLAGS.word_embedding_size,
                                      activation=tf.tanh)))
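  # The blocks above compose to: word string -> char indices -> char
  # embeddings -> forward and reversed LSTM passes (td.RNN yields
  # (outputs, state); GetItem(1) twice selects the final hidden state h of
  # the LSTM's (c, h) state) -> Concat -> FC with tanh, producing a vector
  # of size word_embedding_size.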
  sess = tf.InteractiveSession()
  if plan.mode == plan.mode_keys.INFER:
    # In inference mode, we run the model directly on words.
    plan.compiler = td.Compiler.create(word_embedding)
    embedding_pred, = plan.compiler.output_tensors
  else:
    # In training/eval mode, we run the model on (word, embedding) pairs.
    plan.compiler = td.Compiler.create(
        td.Record((word_embedding, td.Vector(FLAGS.word_embedding_size))))
    embedding_pred, embedding_true = plan.compiler.output_tensors
  if plan.mode == plan.mode_keys.INFER:
    def key_fn(sample):
      return sample

    def results_fn(results):
      with codecs.open(FLAGS.test_output, 'w', encoding='utf-8-sig') as f:
        for result in results:
          word, embedding = result
          f.write(word)
          f.write(u' ')
          f.write(u' '.join(str(x) for x in embedding[0].tolist()))
          f.write(u'\n')

    plan.examples = load_testset()
    plan.outputs = [embedding_pred]
    plan.key_fn = key_fn
    plan.results_fn = results_fn
  else:
    trainset_words, trainset_embeddings = load_trainset()
    devset_words, devset_embeddings = load_devset()
    # Create loss tensor, and add it to the plan.
    loss_x = tf.losses.mean_squared_error(embedding_true, embedding_pred)
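    # SGD with an exponentially decayed learning rate:
    #   lr = 0.1 * 0.96 ** floor(global_step / 100)   (staircase=True)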
    starter_learning_rate = 0.1
    learning_rate = tf.train.exponential_decay(starter_learning_rate, plan.global_step,
                                               100, 0.96, staircase=True)
    loss = tf.Print(loss_x, [learning_rate, loss_x])
    plan.losses['mse'] = loss
    optr = tf.train.GradientDescentOptimizer(learning_rate)
    plan.train_op = optr.minimize(plan.losses['mse'], plan.global_step)
    # Alternative training op: Adagrad with global-norm gradient clipping
    # over all trainable variables.
    # tvars = tf.trainable_variables()
    # grads, global_norm = tf.clip_by_global_norm(tf.gradients(loss, tvars), 5.0)
    # optimizer = tf.train.AdagradOptimizer(0.09)
    # plan.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=plan.global_step)
    if plan.mode == plan.mode_keys.TRAIN:
      plan.examples = zip(trainset_words, trainset_embeddings)
      plan.dev_examples = zip(devset_words, devset_embeddings)
      # Turn dropout on for training, off for validation.
      plan.train_feeds[keep_prob] = FLAGS.dropout_keep_prob
      plan.report_loss = lambda step, loss: print(step, loss)
    else:
      assert plan.mode == plan.mode_keys.EVAL
      # We evaluate on the devset because we don't have a true testset.
      plan.examples = zip(devset_words, devset_embeddings)

def main(_):
  assert 0 < FLAGS.dropout_keep_prob <= 1, '--dropout_keep_prob must be in (0, 1]'
  td.Plan.create_from_flags(setup_plan).run()

if __name__ == '__main__':
  tf.app.run()