Commit

fixes
adamlerer committed Jan 1, 2017
1 parent 8a70c27 commit c9e2072
Showing 5 changed files with 31 additions and 27 deletions.
8 changes: 4 additions & 4 deletions OpenNMT/onmt/Constants.py
@@ -1,8 +1,8 @@
 
-PAD = 1
-UNK = 2
-BOS = 3
-EOS = 4
+PAD = 0
+UNK = 1
+BOS = 2
+EOS = 3
 
 PAD_WORD = '<blank>'
 UNK_WORD = '<unk>'
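A note on the re-numbering above: starting the special tokens at 0 follows the common PyTorch convention of reserving index 0 for padding. A minimal sketch of how such constants are typically consumed downstream; the nn.Embedding / nn.NLLLoss wiring is illustrative only and not part of this commit:

import torch
import torch.nn as nn

import onmt

vocab_size = 10000  # hypothetical vocabulary size, for illustration

# With PAD = 0, the constant can double as nn.Embedding's padding_idx:
# the PAD row stays zero and receives no gradient updates.
emb = nn.Embedding(vocab_size, 500, padding_idx=onmt.Constants.PAD)

# Zero out PAD's class weight so padded positions contribute nothing to the loss.
weight = torch.ones(vocab_size)
weight[onmt.Constants.PAD] = 0
crit = nn.NLLLoss(weight)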
6 changes: 4 additions & 2 deletions OpenNMT/onmt/Models.py
@@ -139,6 +139,8 @@ def __init__(self, enc, dec):
         )
 
     def forward(self, input):
-        context = self.enc(input[0])
-        out = self.dec(input[1], context)
+        src = input[0]
+        tgt = input[1][:-1] # exclude </s> from target inputs
+        context = self.enc(src)
+        out = self.dec(tgt, context)
         return out
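The decoder now consumes the target sequence without its final </s>; together with the criterion change in train.py below (targets = targets[1:]), each decoder position is trained to predict the next token. A toy illustration of the alignment, using the new constant values (the word indices are made up):

BOS, EOS = 2, 3

tgt = [BOS, 57, 912, EOS]   # <s> the cat </s>

decoder_input = tgt[:-1]    # [<s>, the, cat]   -> fed to self.dec
gold_targets = tgt[1:]      # [the, cat, </s>]  -> scored by the criterion

assert len(decoder_input) == len(gold_targets)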
17 changes: 9 additions & 8 deletions OpenNMT/onmt/Optim.py
@@ -4,7 +4,6 @@
 class Optim(object):
 
     def _makeOptimizer(self):
-        #print(list(self.params))
         if self.method == 'sgd':
             self.optimizer = optim.SGD(self.params, lr=self.lr)
         elif self.method == 'adagrad':
@@ -16,32 +15,33 @@ def _makeOptimizer(self):
         else:
             raise RuntimeError("Invalid optim method: " + self.method)
 
-    def __init__(self, params, method, lr, lr_decay=1, start_decay_at=None):
-        self.params = list(params)
+    def __init__(self, params, method, lr, max_grad_norm, lr_decay=1, start_decay_at=None):
+        self.params = list(params)  # careful: params may be a generator
         self.last_ppl = None
         self.lr = lr
+        self.max_grad_norm = max_grad_norm
         self.method = method
         self.lr_decay = lr_decay
         self.start_decay_at = start_decay_at
         self.start_decay = False
 
         self._makeOptimizer()
 
-
-    def step(self, params, max_grad_norm):
+    def step(self):
         # Compute gradients norm.
         grad_norm = 0
-        for param in params:
+        for param in self.params:
             grad_norm = grad_norm + math.pow(param.grad.norm(), 2)
 
         grad_norm = math.sqrt(grad_norm)
-        shrinkage = max_grad_norm / grad_norm
+        shrinkage = self.max_grad_norm / grad_norm
 
-        for param in params:
+        for param in self.params:
             if shrinkage < 1:
                 param.grad.mul_(shrinkage)
 
         self.optimizer.step()
+        return grad_norm
 
     # decay learning rate if val perf does not improve or we hit the start_decay_at limit
     def updateLearningRate(self, ppl, epoch):
@@ -52,6 +52,7 @@ def updateLearningRate(self, ppl, epoch):
 
         if self.start_decay:
             self.lr = self.lr * self.lr_decay
+            print("Decaying learning rate to %g" % self.lr)
 
         self.last_ppl = ppl
 
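With max_grad_norm stored on the optimizer and the parameter list captured at construction time, callers no longer pass them into every step(); step() also returns the pre-clipping gradient norm, which train.py now logs. A rough usage sketch of the new interface (the stand-in nn.Linear model and the hyperparameter values are invented for the example):

import torch
import torch.nn as nn

import onmt

model = nn.Linear(8, 4)  # tiny stand-in model so the sketch is self-contained

optim = onmt.Optim(
    model.parameters(), 'sgd', 1.0, max_grad_norm=5,
    lr_decay=0.5, start_decay_at=8
)

for _ in range(3):
    model.zero_grad()
    loss = model(torch.randn(16, 8)).sum()
    loss.backward()
    grad_norm = optim.step()  # rescales gradients to at most max_grad_norm, then updates

# After each validation pass: decay the learning rate once perplexity stops
# improving or the epoch passes start_decay_at.
optim.updateLearningRate(ppl=123.4, epoch=9)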
2 changes: 1 addition & 1 deletion OpenNMT/onmt/__init__.py
@@ -12,7 +12,7 @@
 
 import onmt.Constants
 import onmt.Models
-from onmt.Dataset import Dataset
+from onmt.Dataset import Dataset, collate_data
 from onmt.Optim import Optim
 
 # return onmt
25 changes: 13 additions & 12 deletions OpenNMT/train.py
@@ -20,7 +20,7 @@
 parser.add_argument('-save_model', help="""Model filename (the model will be saved as
 <save_model>_epochN_PPL.pt where PPL is the validation perplexity""")
 parser.add_argument('-train_from', help="If training from a checkpoint then this is the path to the pretrained model.")
-parser.add_argument('-cont', action="store_true", help="If training from a checkpoint, whether to continue the training in the same configuration or not.")
+# parser.add_argument('-cont', action="store_true", help="If training from a checkpoint, whether to continue the training in the same configuration or not.")
 
 ##
 ## **Model options**
@@ -45,7 +45,7 @@
 parser.add_argument('-max_batch_size', type=int, default=64, help="Maximum batch size")
 parser.add_argument('-epochs', type=int, default=13, help="Number of training epochs")
 parser.add_argument('-start_epoch', type=int, default=0, help="If loading from a checkpoint, the epoch from which to start")
-parser.add_argument('-start_iteration', type=int, default=0, help="If loading from a checkpoint, the iteration from which to start")
+# parser.add_argument('-start_iteration', type=int, default=0, help="If loading from a checkpoint, the iteration from which to start")
 # this gives really bad initialization; Xavier better
 # parser.add_argument('-param_init', type=int, default=0.1, help="Parameters are initialized over uniform distribution with support (-param_init, param_init)")
 parser.add_argument('-optim', default='sgd', help="Optimization method. Possible options are: sgd, adagrad, adadelta, adam")
@@ -57,7 +57,7 @@
 parser.add_argument('-learning_rate_decay', type=int, default=0.5, help="""Decay learning rate by this much if (i) perplexity does not decrease
 on the validation set or (ii) epoch has gone past the start_decay_at_limit""")
 parser.add_argument('-start_decay_at', default=8, help="Start decay after this epoch")
-parser.add_argument('-curriculum', type=int, default=0, help="""For this many epochs, order the minibatches based on source
+parser.add_argument('-curriculum', action="store_true", help="""For this many epochs, order the minibatches based on source
 sequence length. Sometimes setting this to 1 will increase convergence speed.""")
 parser.add_argument('-pre_word_vecs_enc', help="""If a valid path is specified, then this will load
 pretrained word embeddings on the encoder side.
@@ -107,11 +107,12 @@ def makeOne(size):
         self.sub += [makeOne(features.size())]
 
     def forward(self, inputs, targets):
+        targets = targets[1:] # don't predict BOS
         if len(self.sub) == 1:
             batch_size = targets.nelement()
             return self.sub[0](inputs.view(batch_size, -1), targets.view(batch_size))
         else:
-            assert(False)
+            assert False, "FIXME: features"
             loss = Variable(inputs.new(1).zero_())
             for sub, input, target in zip(self.sub, inputs, targets):
                 loss += sub(input, target)
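The new targets = targets[1:] line is the other half of the shift made in Models.py: the decoder input drops </s>, the criterion drops <s>. The single-output branch then flattens the time and batch dimensions before handing everything to the underlying loss. A shape sketch, where the (seq_len, batch, vocab) layout and the sizes are assumptions for illustration:

import torch
import torch.nn as nn

seq_len, batch, vocab = 5, 2, 10

# Decoder log-probabilities: one distribution per (position, sentence).
inputs = torch.randn(seq_len, batch, vocab).log_softmax(-1)
# Full target batch, including the leading <s> row.
targets = torch.randint(4, vocab, (seq_len + 1, batch))

targets = targets[1:]            # don't predict BOS
batch_size = targets.nelement()  # seq_len * batch

crit = nn.NLLLoss()
loss = crit(inputs.view(batch_size, -1), targets.view(batch_size))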
@@ -145,7 +146,7 @@ def trainModel(model, trainData, validData, dataset):
                              dataset['dicts']['tgt']['features'])
 
     optim = onmt.Optim(
-        model.parameters(), opt.optim, opt.learning_rate,
+        model.parameters(), opt.optim, opt.learning_rate, opt.max_grad_norm,
         lr_decay=opt.learning_rate_decay,
         start_decay_at=opt.start_decay_at
     )
@@ -155,12 +156,11 @@ def trainModel(model, trainData, validData, dataset):
     def trainEpoch(epoch):
 
-        startI = opt.start_iteration
-        opt.start_iteration = 1
 
-        # shuffle mini batch order
+        #shuffle mini batch order
         batchOrder = torch.randperm(len(trainData))
 
+        opt.start_iteration = 1
 
         total_loss, report_loss = 0, 0
         total_words, report_words = 0, 0
         start = time.time()
@@ -176,17 +176,18 @@ def trainEpoch(epoch):
             loss.backward()
 
             # update the parameters
-            optim.step(model.parameters(), opt.max_grad_norm)
+            grad_norm = optim.step()
 
             report_loss += loss.data[0]
             total_loss += loss.data[0]
             num_words = batch[1].data.gt(onmt.Constants.EOS).sum()
             total_words += num_words
             report_words += num_words
             if i % opt.report_every == 0 and i > 0:
-                print("Done %d/%d batches; %d words; avg loss: %g; %.2g s/batch" %
-                      (i, len(trainData), report_words, report_loss / report_words, (time.time()-start)/i))
+                print("Epoch %2d, %5d/%5d batches; grad norm: %4.4g; perplexity: %6.4g; %3.0f tokens/s" %
+                      (epoch, i, len(trainData), grad_norm / opt.report_every, math.exp(report_loss / report_words), report_words/(time.time()-start)))
                 report_loss = report_words = 0
+                start = time.time()
 
             # if opt.save_every > 0 and ii % opt.save_every == 0:
             #     checkpoint.saveIteration(ii, epochState, batchOrder, not opt.json_log)
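A note on the new log line: report_loss accumulates the per-batch loss and report_words counts real target tokens, so math.exp(report_loss / report_words) reads as a running per-word training perplexity, and start is now reset after each report so the tokens/s figure covers only the last reporting window. The word count, batch[1].data.gt(onmt.Constants.EOS).sum(), relies on the re-numbered constants above: every real word index is now strictly greater than EOS = 3. A tiny illustration with a made-up batch:

import torch

PAD, UNK, BOS, EOS = 0, 1, 2, 3

# Two padded target sentences: <s> w w </s> <pad> and <s> w </s> <pad> <pad>
tgt = torch.LongTensor([[BOS, 7, 42, EOS, PAD],
                        [BOS, 9, EOS, PAD, PAD]])

num_words = int(tgt.gt(EOS).sum())  # indices > EOS are real word tokens
print(num_words)  # 3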
@@ -243,9 +244,9 @@ def main():
 
     dataset = torch.load(opt.data)
 
-
     trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'], opt.max_batch_size, opt.cuda)
     validData = onmt.Dataset(dataset['valid']['src'], dataset['valid']['tgt'], opt.max_batch_size, opt.cuda)
+
     print(' * vocabulary size. source = %d; target = %d' %
           (dataset['dicts']['src']['words'].size(), dataset['dicts']['tgt']['words'].size()))
     print(' * additional features. source = %d; target = %d' %