diff --git a/OpenNMT/.preprocess.lua.swp b/OpenNMT/.preprocess.lua.swp deleted file mode 100644 index 037acd172e..0000000000 Binary files a/OpenNMT/.preprocess.lua.swp and /dev/null differ diff --git a/OpenNMT/onmt/Models.py b/OpenNMT/onmt/Models.py index 3b9a5da3b0..aa1a351ad4 100644 --- a/OpenNMT/onmt/Models.py +++ b/OpenNMT/onmt/Models.py @@ -1,136 +1,87 @@ -local function buildEncoder(opt, dicts) - local inputNetwork = onmt.WordEmbedding.new(dicts.words:size(), -- vocab size - opt.word_vec_size, - opt.pre_word_vecs_enc, - opt.fix_word_vecs_enc) - - local inputSize = opt.word_vec_size - - -- Sequences with features. - if #dicts.features > 0 then - local srcFeatEmbedding = onmt.FeaturesEmbedding.new(dicts.features, - opt.feat_vec_exponent, - opt.feat_vec_size, - opt.feat_merge) - - inputNetwork = nn.Sequential() - :add(nn.ParallelTable() - :add(inputNetwork) - :add(srcFeatEmbedding)) - :add(nn.JoinTable(2)) - - inputSize = inputSize + srcFeatEmbedding.outputSize - end - - if opt.brnn then - -- Compute rnn hidden size depending on hidden states merge action. - local rnnSize = opt.rnn_size - if opt.brnn_merge == 'concat' then - if opt.rnn_size % 2 ~= 0 then - error('in concat mode, rnn_size must be divisible by 2') - end - rnnSize = rnnSize / 2 - elseif opt.brnn_merge == 'sum' then - rnnSize = rnnSize - else - error('invalid merge action ' .. opt.brnn_merge) - end - - local rnn = onmt.LSTM.new(opt.layers, inputSize, rnnSize, opt.dropout, opt.residual) - - return onmt.BiEncoder.new(inputNetwork, rnn, opt.brnn_merge) - else - local rnn = onmt.LSTM.new(opt.layers, inputSize, opt.rnn_size, opt.dropout, opt.residual) - - return onmt.Encoder.new(inputNetwork, rnn) - end -end - -local function buildDecoder(opt, dicts, verbose) - local inputNetwork = onmt.WordEmbedding.new(dicts.words:size(), -- vocab size - opt.word_vec_size, - opt.pre_word_vecs_dec, - opt.fix_word_vecs_dec) - - local inputSize = opt.word_vec_size - - local generator - - -- Sequences with features. - if #dicts.features > 0 then - local tgtFeatEmbedding = onmt.FeaturesEmbedding.new(dicts.features, - opt.feat_vec_exponent, - opt.feat_vec_size, - opt.feat_merge) - - inputNetwork = nn.Sequential() - :add(nn.ParallelTable() - :add(inputNetwork) - :add(tgtFeatEmbedding)) - :add(nn.JoinTable(2)) - - inputSize = inputSize + tgtFeatEmbedding.outputSize - - generator = onmt.FeaturesGenerator.new(opt.rnn_size, dicts.words:size(), dicts.features) - else - generator = onmt.Generator.new(opt.rnn_size, dicts.words:size()) - end - - if opt.input_feed == 1 then - if verbose then - print(" * using input feeding") - end - inputSize = inputSize + opt.rnn_size - end - - local rnn = onmt.LSTM.new(opt.layers, inputSize, opt.rnn_size, opt.dropout, opt.residual) - - return onmt.Decoder.new(inputNetwork, rnn, generator, opt.input_feed == 1) -end - ---[[ This is useful when training from a model in parallel mode: each thread must own its model. 
]]
-local function clonePretrained(model)
-  local clone = {}
-
-  for k, v in pairs(model) do
-    if k == 'modules' then
-      clone.modules = {}
-      for i = 1, #v do
-        table.insert(clone.modules, onmt.utils.Tensor.deepClone(v[i]))
-      end
-    else
-      clone[k] = v
-    end
-  end
-
-  return clone
-end
-
-local function loadEncoder(pretrained, clone)
-  local brnn = #pretrained.modules == 2
-
-  if clone then
-    pretrained = clonePretrained(pretrained)
-  end
-
-  if brnn then
-    return onmt.BiEncoder.load(pretrained)
-  else
-    return onmt.Encoder.load(pretrained)
-  end
-end
-
-local function loadDecoder(pretrained, clone)
-  if clone then
-    pretrained = clonePretrained(pretrained)
-  end
-
-  return onmt.Decoder.load(pretrained)
-end
-
-return {
-  buildEncoder = buildEncoder,
-  buildDecoder = buildDecoder,
-  loadEncoder = loadEncoder,
-  loadDecoder = loadDecoder
-}
+def _makeFeatEmbedder(opt, dicts):
+    return onmt.FeaturesEmbedding(dicts.features,
+                                  opt.feat_vec_exponent,
+                                  opt.feat_vec_size,
+                                  opt.feat_merge)
+
+
+class Encoder(nn.Container):
+
+    def __init__(self, opt, dicts):
+        inputSize = opt.word_vec_size
+        feat_lut = None
+        # Sequences with features.
+        if len(dicts.features) > 0:
+            feat_lut = _makeFeatEmbedder(opt, dicts)
+            inputSize = inputSize + feat_lut.outputSize
+
+        super(Encoder, self).__init__(
+            word_lut=nn.LookupTable(dicts.words.size(), opt.word_vec_size),
+            rnn=nn.LSTM(inputSize, opt.rnnSize,
+                        num_layers=opt.layers,
+                        dropout=opt.dropout,
+                        bidirectional=opt.brnn)
+        )
+
+        if opt.pre_word_vecs_enc is not None:
+            pretrained = torch.load(opt.pre_word_vecs_enc)
+            self.word_lut.weight.copy_(pretrained)
+
+        self.has_features = feat_lut is not None
+        if self.has_features:
+            self.add_module('feat_lut', feat_lut)
+
+    def forward(self, input, hidden):
+        if self.has_features:
+            word_emb = self.word_lut(input[0])
+            feat_emb = self.feat_lut(input[1])
+            emb = torch.cat([word_emb, feat_emb], 1)
+        else:
+            emb = self.word_lut(input)
+
+        outputs, next_hidden = self.rnn(emb, hidden)
+        return outputs, next_hidden
+
+class Decoder(nn.Container):
+
+    def __init__(self, opt, dicts):
+        inputSize = opt.word_vec_size
+        feat_lut = None
+        # Sequences with features.
+        if len(dicts.features) > 0:
+            feat_lut = _makeFeatEmbedder(opt, dicts)
+            inputSize = inputSize + feat_lut.outputSize
+
+        super(Decoder, self).__init__(
+            word_lut=nn.LookupTable(dicts.words.size(), opt.word_vec_size),
+            rnn=nn.LSTM(inputSize, opt.rnnSize,
+                        num_layers=opt.layers,
+                        dropout=opt.dropout),
+            attn=GlobalAttention(opt.rnnSize),
+            dropout=nn.Dropout(opt.dropout)
+        )
+
+        if opt.pre_word_vecs_dec is not None:
+            pretrained = torch.load(opt.pre_word_vecs_dec)
+            self.word_lut.weight.copy_(pretrained)
+
+        self.has_features = feat_lut is not None
+        if self.has_features:
+            self.add_module('feat_lut', feat_lut)
+
+    def forward(self, input, hidden):
+        if self.has_features:
+            word_emb = self.word_lut(input[0])
+            feat_emb = self.feat_lut(input[1])
+            emb = torch.cat([word_emb, feat_emb], 1)
+        else:
+            emb = self.word_lut(input)
+
+        # FIXME: input feeding and the attention context are not wired up yet;
+        # `input_feed` and `context` still need to be passed in by the caller.
+        if self.input_feed:
+            emb = torch.cat([emb, input_feed], 1)  # 1 step
+
+        outputs, next_hidden = self.rnn(emb, hidden)
+
+        attn = self.attn(outputs, context)  # FIXME: per timestep?
+        attn = self.dropout(attn)
+        return attn, next_hidden
diff --git a/OpenNMT/onmt/modules/Decoder.py b/OpenNMT/onmt/modules/Decoder.py
index ff73b826c7..7456e3fe9b 100644
--- a/OpenNMT/onmt/modules/Decoder.py
+++ b/OpenNMT/onmt/modules/Decoder.py
@@ -1,422 +1,422 @@
---[[ Unit to decode a sequence of output tokens.
-
-      .      .      .      .
- | | | | - h_1 => h_2 => h_3 => ... => h_n - | | | | - . . . . - | | | | - h_1 => h_2 => h_3 => ... => h_n - | | | | - | | | | - x_1 x_2 x_3 x_n +#[[ Unit to decode a sequence of output tokens. + + . . . . + | | | | + h_1 => h_2 => h_3 => ... => h_n + | | | | + . . . . + | | | | + h_1 => h_2 => h_3 => ... => h_n + | | | | + | | | | + x_1 x_2 x_3 x_n Inherits from [onmt.Sequencer](onmt+modules+Sequencer). ---]] -local Decoder, parent = torch.class('onmt.Decoder', 'onmt.Sequencer') +#]] +Decoder, parent = torch.class('onmt.Decoder', 'onmt.Sequencer') ---[[ Construct a decoder layer. +#[[ Construct a decoder layer. -Parameters: +Parameters. - * `inputNetwork` - input nn module. - * `rnn` - recurrent module, such as [onmt.LSTM](onmt+modules+LSTM). - * `generator` - optional, an output [onmt.Generator](onmt+modules+Generator). - * `inputFeed` - bool, enable input feeding. ---]] -function Decoder:__init(inputNetwork, rnn, generator, inputFeed) - self.rnn = rnn - self.inputNet = inputNetwork + * `inputNetwork` - input nn module. + * `rnn` - recurrent module, such as [onmt.LSTM](onmt+modules+LSTM). + * `generator` - optional, an output [onmt.Generator](onmt+modules+Generator). + * `inputFeed` - bool, enable input feeding. +#]] +def Decoder.__init(inputNetwork, rnn, generator, inputFeed): + self.rnn = rnn + self.inputNet = inputNetwork - self.args = {} - self.args.rnnSize = self.rnn.outputSize - self.args.numEffectiveLayers = self.rnn.numEffectiveLayers + self.args = {} + self.args.rnnSize = self.rnn.outputSize + self.args.numEffectiveLayers = self.rnn.numEffectiveLayers - self.args.inputIndex = {} - self.args.outputIndex = {} + self.args.inputIndex = {} + self.args.outputIndex = {} - -- Input feeding means the decoder takes an extra - -- vector each time representing the attention at the - -- previous step. - self.args.inputFeed = inputFeed + # Input feeding means the decoder takes an extra + # vector each time representing the attention at the + # previous step. + self.args.inputFeed = inputFeed - parent.__init(self, self:_buildModel()) + parent.__init(self, self._buildModel()) - -- The generator use the output of the decoder sequencer to generate the - -- likelihoods over the target vocabulary. - self.generator = generator - self:add(self.generator) + # The generator use the output of the decoder sequencer to generate the + # likelihoods over the target vocabulary. + self.generator = generator + self.add(self.generator) - self:resetPreallocation() -end - ---[[ Return a new Decoder using the serialized data `pretrained`. ]] -function Decoder.load(pretrained) - local self = torch.factory('onmt.Decoder')() - - self.args = pretrained.args - - parent.__init(self, pretrained.modules[1]) - self.generator = pretrained.modules[2] - self:add(self.generator) + self.resetPreallocation() - self:resetPreallocation() - - return self -end - ---[[ Return data to serialize. ]] -function Decoder:serialize() - return { - modules = self.modules, - args = self.args - } -end - -function Decoder:resetPreallocation() - if self.args.inputFeed then - self.inputFeedProto = torch.Tensor() - end - - -- Prototype for preallocated hidden and cell states. - self.stateProto = torch.Tensor() - - -- Prototype for preallocated output gradients. - self.gradOutputProto = torch.Tensor() - - -- Prototype for preallocated context gradient. 
- self.gradContextProto = torch.Tensor() -end - ---[[ Build a default one time-step of the decoder - -Returns: An nn-graph mapping - - $${(c^1_{t-1}, h^1_{t-1}, .., c^L_{t-1}, h^L_{t-1}, x_t, con/H, if) => - (c^1_{t}, h^1_{t}, .., c^L_{t}, h^L_{t}, a)}$$ - - Where ${c^l}$ and ${h^l}$ are the hidden and cell states at each layer, - ${x_t}$ is a sparse word to lookup, - ${con/H}$ is the context/source hidden states for attention, - ${if}$ is the input feeding, and - ${a}$ is the context vector computed at this timestep. ---]] -function Decoder:_buildModel() - local inputs = {} - local states = {} - - -- Inputs are previous layers first. - for _ = 1, self.args.numEffectiveLayers do - local h0 = nn.Identity()() -- batchSize x rnnSize - table.insert(inputs, h0) - table.insert(states, h0) - end - - local x = nn.Identity()() -- batchSize - table.insert(inputs, x) - self.args.inputIndex.x = #inputs - - local context = nn.Identity()() -- batchSize x sourceLength x rnnSize - table.insert(inputs, context) - self.args.inputIndex.context = #inputs - - local inputFeed - if self.args.inputFeed then - inputFeed = nn.Identity()() -- batchSize x rnnSize - table.insert(inputs, inputFeed) - self.args.inputIndex.inputFeed = #inputs - end - - -- Compute the input network. - local input = self.inputNet(x) - - -- If set, concatenate previous decoder output. - if self.args.inputFeed then - input = nn.JoinTable(2)({input, inputFeed}) - end - table.insert(states, input) - - -- Forward states and input into the RNN. - local outputs = self.rnn(states) - - -- The output of a subgraph is a node: split it to access the last RNN output. - outputs = { outputs:split(self.args.numEffectiveLayers) } - - -- Compute the attention here using h^L as query. - local attnLayer = onmt.GlobalAttention(self.args.rnnSize) - attnLayer.name = 'decoderAttn' - local attnOutput = attnLayer({outputs[#outputs], context}) - if self.rnn.dropout > 0 then - attnOutput = nn.Dropout(self.rnn.dropout)(attnOutput) - end - table.insert(outputs, attnOutput) - return nn.gModule(inputs, outputs) -end - ---[[ Mask padding means that the attention-layer is constrained to - give zero-weight to padding. This is done by storing a reference - to the softmax attention-layer. - - Parameters: - - * See [onmt.MaskedSoftmax](onmt+modules+MaskedSoftmax). ---]] -function Decoder:maskPadding(sourceSizes, sourceLength, beamSize) - if not self.decoderAttn then - self.network:apply(function (layer) - if layer.name == 'decoderAttn' then - self.decoderAttn = layer - end - end) - end - - self.decoderAttn:replace(function(module) - if module.name == 'softmaxAttn' then - local mod - if sourceSizes ~= nil then - mod = onmt.MaskedSoftmax(sourceSizes, sourceLength, beamSize) - else - mod = nn.SoftMax() - end - - mod.name = 'softmaxAttn' - mod:type(module._type) - self.softmaxAttn = mod - return mod - else - return module - end - end) -end - ---[[ Run one step of the decoder. - -Parameters: - - * `input` - input to be passed to inputNetwork. - * `prevStates` - stack of hidden states (batch x layers*model x rnnSize) - * `context` - encoder output (batch x n x rnnSize) - * `prevOut` - previous distribution (batch x #words) - * `t` - current timestep - -Returns: - - 1. `out` - Top-layer hidden state. - 2. `states` - All states. ---]] -function Decoder:forwardOne(input, prevStates, context, prevOut, t) - local inputs = {} - - -- Create RNN input (see sequencer.lua `buildNetwork('dec')`). 
- onmt.utils.Table.append(inputs, prevStates) - table.insert(inputs, input) - table.insert(inputs, context) - local inputSize - if torch.type(input) == 'table' then - inputSize = input[1]:size(1) - else - inputSize = input:size(1) - end - - if self.args.inputFeed then - if prevOut == nil then - table.insert(inputs, onmt.utils.Tensor.reuseTensor(self.inputFeedProto, - { inputSize, self.args.rnnSize })) - else - table.insert(inputs, prevOut) - end - end - - -- Remember inputs for the backward pass. - if self.train then - self.inputs[t] = inputs - end - - local outputs = self:net(t):forward(inputs) - local out = outputs[#outputs] - local states = {} - for i = 1, #outputs - 1 do - table.insert(states, outputs[i]) - end - - return out, states -end - ---[[Compute all forward steps. - - Parameters: - - * `batch` - `Batch` object - * `encoderStates` - - * `context` - - * `func` - Calls `func(out, t)` each timestep. ---]] - -function Decoder:forwardAndApply(batch, encoderStates, context, func) - -- TODO: Make this a private method. - - if self.statesProto == nil then - self.statesProto = onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, - self.stateProto, - { batch.size, self.args.rnnSize }) - end - - local states = onmt.utils.Tensor.copyTensorTable(self.statesProto, encoderStates) - - local prevOut - - for t = 1, batch.targetLength do - prevOut, states = self:forwardOne(batch:getTargetInput(t), states, context, prevOut, t) - func(prevOut, t) - end -end - ---[[Compute all forward steps. - - Parameters: - - * `batch` - a `Batch` object. - * `encoderStates` - a batch of initial decoder states (optional) [0] - * `context` - the context to apply attention to. - - Returns: Table of top hidden state for each timestep. ---]] -function Decoder:forward(batch, encoderStates, context) - encoderStates = encoderStates - or onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, - onmt.utils.Cuda.convert(torch.Tensor()), - { batch.size, self.args.rnnSize }) - if self.train then - self.inputs = {} - end - - local outputs = {} - - self:forwardAndApply(batch, encoderStates, context, function (out) - table.insert(outputs, out) - end) - - return outputs -end - ---[[ Compute the backward update. - -Parameters: - - * `batch` - a `Batch` object - * `outputs` - expected outputs - * `criterion` - a single target criterion object - - Note: This code runs both the standard backward and criterion forward/backward. - It returns both the gradInputs and the loss. - -- ]] -function Decoder:backward(batch, outputs, criterion) - if self.gradOutputsProto == nil then - self.gradOutputsProto = onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers + 1, - self.gradOutputProto, - { batch.size, self.args.rnnSize }) - end - - local gradStatesInput = onmt.utils.Tensor.reuseTensorTable(self.gradOutputsProto, - { batch.size, self.args.rnnSize }) - local gradContextInput = onmt.utils.Tensor.reuseTensor(self.gradContextProto, - { batch.size, batch.sourceLength, self.args.rnnSize }) - - local loss = 0 - - for t = batch.targetLength, 1, -1 do - -- Compute decoder output gradients. - -- Note: This would typically be in the forward pass. - local pred = self.generator:forward(outputs[t]) - local output = batch:getTargetOutput(t) - - loss = loss + criterion:forward(pred, output) - -- Compute the criterion gradient. - local genGradOut = criterion:backward(pred, output) - for j = 1, #genGradOut do - genGradOut[j]:div(batch.totalSize) - end +#" Return a new Decoder using the serialized data `pretrained`. 
" +def Decoder.load(pretrained): + self = torch.factory('onmt.Decoder')() - -- Compute the final layer gradient. - local decGradOut = self.generator:backward(outputs[t], genGradOut) - gradStatesInput[#gradStatesInput]:add(decGradOut) + self.args = pretrained.args - -- Compute the standarad backward. - local gradInput = self:net(t):backward(self.inputs[t], gradStatesInput) - - -- Accumulate encoder output gradients. - gradContextInput:add(gradInput[self.args.inputIndex.context]) - gradStatesInput[#gradStatesInput]:zero() - - -- Accumulate previous output gradients with input feeding gradients. - if self.args.inputFeed and t > 1 then - gradStatesInput[#gradStatesInput]:add(gradInput[self.args.inputIndex.inputFeed]) - end - - -- Prepare next decoder output gradients. - for i = 1, #self.statesProto do - gradStatesInput[i]:copy(gradInput[i]) - end - end + parent.__init(self, pretrained.modules[1]) + self.generator = pretrained.modules[2] + self.add(self.generator) - return gradStatesInput, gradContextInput, loss -end - ---[[ Compute the loss on a batch. - -Parameters: - - * `batch` - a `Batch` to score. - * `encoderStates` - initialization of decoder. - * `context` - the attention context. - * `criterion` - a pointwise criterion. - ---]] -function Decoder:computeLoss(batch, encoderStates, context, criterion) - encoderStates = encoderStates - or onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, - onmt.utils.Cuda.convert(torch.Tensor()), - { batch.size, self.args.rnnSize }) - - local loss = 0 - self:forwardAndApply(batch, encoderStates, context, function (out, t) - local pred = self.generator:forward(out) - local output = batch:getTargetOutput(t) - loss = loss + criterion:forward(pred, output) - end) - - return loss -end + self.resetPreallocation() + return self ---[[ Compute the score of a batch. -Parameters: - - * `batch` - a `Batch` to score. - * `encoderStates` - initialization of decoder. - * `context` - the attention context. - ---]] -function Decoder:computeScore(batch, encoderStates, context) - encoderStates = encoderStates - or onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, - onmt.utils.Cuda.convert(torch.Tensor()), - { batch.size, self.args.rnnSize }) - - local score = {} - - self:forwardAndApply(batch, encoderStates, context, function (out, t) - local pred = self.generator:forward(out) - for b = 1, batch.size do - if t <= batch.targetSize[b] then - score[b] = (score[b] or 0) + pred[1][b][batch.targetOutput[t][b]] - end - end - end) +#" Return data to serialize. " +def Decoder.serialize(): + return { + modules = self.modules, + args = self.args + } + + +def Decoder.resetPreallocation(): + if self.args.inputFeed: + self.inputFeedProto = torch.Tensor() + + + # Prototype for preallocated hidden and cell states. + self.stateProto = torch.Tensor() + + # Prototype for preallocated output gradients. + self.gradOutputProto = torch.Tensor() + + # Prototype for preallocated context gradient. + self.gradContextProto = torch.Tensor() + + +#[[ Build a default one time-step of the decoder + +Returns. An nn-graph mapping + + $${(c^1_{t-1}, h^1_{t-1}, .., c^L_{t-1}, h^L_{t-1}, x_t, con/H, if) => + (c^1_{t}, h^1_{t}, .., c^L_{t}, h^L_{t}, a)}$$ + + Where ${c^l}$ and ${h^l}$ are the hidden and cell states at each layer, + ${x_t}$ is a sparse word to lookup, + ${con/H}$ is the context/source hidden states for attention, + ${if}$ is the input feeding, and + ${a}$ is the context vector computed at this timestep. 
+#]] +def Decoder._buildModel(): + inputs = {} + states = {} + + # Inputs are previous layers first. + for _ = 1, self.args.numEffectiveLayers: + h0 = nn.Identity()() # batchSize x rnnSize + table.insert(inputs, h0) + table.insert(states, h0) + + + x = nn.Identity()() # batchSize + table.insert(inputs, x) + self.args.inputIndex.x = len(inputs) + + context = nn.Identity()() # batchSize x sourceLength x rnnSize + table.insert(inputs, context) + self.args.inputIndex.context = len(inputs) + + inputFeed + if self.args.inputFeed: + inputFeed = nn.Identity()() # batchSize x rnnSize + table.insert(inputs, inputFeed) + self.args.inputIndex.inputFeed = len(inputs) + + + # Compute the input network. + input = self.inputNet(x) + + # If set, concatenate previous decoder output. + if self.args.inputFeed: + input = nn.JoinTable(2)({input, inputFeed}) + + table.insert(states, input) + + # Forward states and input into the RNN. + outputs = self.rnn(states) + + # The output of a subgraph is a node. split it to access the last RNN output. + outputs = { outputs.split(self.args.numEffectiveLayers) } + + # Compute the attention here using h^L as query. + attnLayer = onmt.GlobalAttention(self.args.rnnSize) + attnLayer.name = 'decoderAttn' + attnOutput = attnLayer({outputs[len(outputs]), context}) + if self.rnn.dropout > 0: + attnOutput = nn.Dropout(self.rnn.dropout)(attnOutput) + + table.insert(outputs, attnOutput) + return nn.gModule(inputs, outputs) + + +#[[ Mask padding means that the attention-layer is constrained to + give zero-weight to padding. This is done by storing a reference + to the softmax attention-layer. + + Parameters. + + * See [onmt.MaskedSoftmax](onmt+modules+MaskedSoftmax). +#]] +def Decoder.maskPadding(sourceSizes, sourceLength, beamSize): + if not self.decoderAttn: + self.network.apply(def (layer): + if layer.name == 'decoderAttn': + self.decoderAttn = layer + + ) + + + self.decoderAttn.replace(function(module) + if module.name == 'softmaxAttn': + mod + if sourceSizes != None: + mod = onmt.MaskedSoftmax(sourceSizes, sourceLength, beamSize) + else: + mod = nn.SoftMax() + + + mod.name = 'softmaxAttn' + mod.type(module._type) + self.softmaxAttn = mod + return mod + else: + return module + + ) + + +#[[ Run one step of the decoder. + +Parameters. + + * `input` - input to be passed to inputNetwork. + * `prevStates` - stack of hidden states (batch x layers*model x rnnSize) + * `context` - encoder output (batch x n x rnnSize) + * `prevOut` - previous distribution (batch x len(words)) + * `t` - current timestep + +Returns. + + 1. `out` - Top-layer hidden state. + 2. `states` - All states. +#]] +def Decoder.forwardOne(input, prevStates, context, prevOut, t): + inputs = {} + + # Create RNN input (see sequencer.lua `buildNetwork('dec')`). + onmt.utils.Table.app(inputs, prevStates) + table.insert(inputs, input) + table.insert(inputs, context) + inputSize + if torch.type(input) == 'table': + inputSize = input[1].size(1) + else: + inputSize = input.size(1) + + + if self.args.inputFeed: + if prevOut == None: + table.insert(inputs, onmt.utils.Tensor.reuseTensor(self.inputFeedProto, + { inputSize, self.args.rnnSize })) + else: + table.insert(inputs, prevOut) + + + + # Remember inputs for the backward pass. + if self.train: + self.inputs[t] = inputs + + + outputs = self.net(t):forward(inputs) + out = outputs[len(outputs]) + states = {} + for i = 1, len(outputs) - 1: + table.insert(states, outputs[i]) + + + return out, states + + +#[[Compute all forward steps. + + Parameters. 
+ + * `batch` - `Batch` object + * `encoderStates` - + * `context` - + * `func` - Calls `func(out, t)` each timestep. +#]] + +def Decoder.forwardAndApply(batch, encoderStates, context, func): + # TODO. Make this a private method. + + if self.statesProto == None: + self.statesProto = onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, + self.stateProto, + { batch.size, self.args.rnnSize }) + + + states = onmt.utils.Tensor.copyTensorTable(self.statesProto, encoderStates) + + prevOut + + for t = 1, batch.targetLength: + prevOut, states = self.forwardOne(batch:getTargetInput(t), states, context, prevOut, t) + func(prevOut, t) + + + +#[[Compute all forward steps. + + Parameters. + + * `batch` - a `Batch` object. + * `encoderStates` - a batch of initial decoder states (optional) [0] + * `context` - the context to apply attention to. + + Returns. Table of top hidden state for each timestep. +#]] +def Decoder.forward(batch, encoderStates, context): + encoderStates = encoderStates + or onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, + onmt.utils.Cuda.convert(torch.Tensor()), + { batch.size, self.args.rnnSize }) + if self.train: + self.inputs = {} + + + outputs = {} + + self.forwardAndApply(batch, encoderStates, context, def (out): + table.insert(outputs, out) + ) + + return outputs + + +#[[ Compute the backward update. + +Parameters. + + * `batch` - a `Batch` object + * `outputs` - expected outputs + * `criterion` - a single target criterion object + + Note. This code runs both the standard backward and criterion forward/backward. + It returns both the gradInputs and the loss. + # ]] +def Decoder.backward(batch, outputs, criterion): + if self.gradOutputsProto == None: + self.gradOutputsProto = onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers + 1, + self.gradOutputProto, + { batch.size, self.args.rnnSize }) + + + gradStatesInput = onmt.utils.Tensor.reuseTensorTable(self.gradOutputsProto, + { batch.size, self.args.rnnSize }) + gradContextInput = onmt.utils.Tensor.reuseTensor(self.gradContextProto, + { batch.size, batch.sourceLength, self.args.rnnSize }) + + loss = 0 + + for t = batch.targetLength, 1, -1: + # Compute decoder output gradients. + # Note. This would typically be in the forward pass. + pred = self.generator.forward(outputs[t]) + output = batch.getTargetOutput(t) + + loss = loss + criterion.forward(pred, output) + + # Compute the criterion gradient. + genGradOut = criterion.backward(pred, output) + for j = 1, len(genGradOut): + genGradOut[j].div(batch.totalSize) + + + # Compute the final layer gradient. + decGradOut = self.generator.backward(outputs[t], genGradOut) + gradStatesInput[len(gradStatesInput].add(decGradOut)) + + # Compute the standarad backward. + gradInput = self.net(t):backward(self.inputs[t], gradStatesInput) + + # Accumulate encoder output gradients. + gradContextInput.add(gradInput[self.args.inputIndex.context]) + gradStatesInput[len(gradStatesInput].zero()) + + # Accumulate previous output gradients with input feeding gradients. + if self.args.inputFeed and t > 1: + gradStatesInput[len(gradStatesInput].add(gradInput[self.args.inputIndex.inputFeed])) + + + # Prepare next decoder output gradients. + for i = 1, len(self.statesProto): + gradStatesInput[i].copy(gradInput[i]) + + + + return gradStatesInput, gradContextInput, loss + + +#[[ Compute the loss on a batch. + +Parameters. + + * `batch` - a `Batch` to score. + * `encoderStates` - initialization of decoder. + * `context` - the attention context. 
+ * `criterion` - a pointwise criterion. + +#]] +def Decoder.computeLoss(batch, encoderStates, context, criterion): + encoderStates = encoderStates + or onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, + onmt.utils.Cuda.convert(torch.Tensor()), + { batch.size, self.args.rnnSize }) + + loss = 0 + self.forwardAndApply(batch, encoderStates, context, def (out, t): + pred = self.generator.forward(out) + output = batch.getTargetOutput(t) + loss = loss + criterion.forward(pred, output) + ) + + return loss + + + +#[[ Compute the score of a batch. + +Parameters. + + * `batch` - a `Batch` to score. + * `encoderStates` - initialization of decoder. + * `context` - the attention context. + +#]] +def Decoder.computeScore(batch, encoderStates, context): + encoderStates = encoderStates + or onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, + onmt.utils.Cuda.convert(torch.Tensor()), + { batch.size, self.args.rnnSize }) + + score = {} + + self.forwardAndApply(batch, encoderStates, context, def (out, t): + pred = self.generator.forward(out) + for b = 1, batch.size: + if t <= batch.targetSize[b]: + score[b] = (score[b] or 0) + pred[1][b][batch.targetOutput[t][b]] + + + ) + + return score - return score -end diff --git a/OpenNMT/onmt/modules/Encoder.py b/OpenNMT/onmt/modules/Encoder.py index 85d8e32db5..0ade2de292 100644 --- a/OpenNMT/onmt/modules/Encoder.py +++ b/OpenNMT/onmt/modules/Encoder.py @@ -1,232 +1,232 @@ ---[[ Encoder is a unidirectional Sequencer used for the source language. +#[[ Encoder is a unidirectional Sequencer used for the source language. - h_1 => h_2 => h_3 => ... => h_n - | | | | - . . . . - | | | | - h_1 => h_2 => h_3 => ... => h_n - | | | | - | | | | - x_1 x_2 x_3 x_n + h_1 => h_2 => h_3 => ... => h_n + | | | | + . . . . + | | | | + h_1 => h_2 => h_3 => ... => h_n + | | | | + | | | | + x_1 x_2 x_3 x_n Inherits from [onmt.Sequencer](onmt+modules+Sequencer). ---]] -local Encoder, parent = torch.class('onmt.Encoder', 'onmt.Sequencer') +#]] +Encoder, parent = torch.class('onmt.Encoder', 'onmt.Sequencer') ---[[ Construct an encoder layer. +#[[ Construct an encoder layer. -Parameters: +Parameters. - * `inputNetwork` - input module. - * `rnn` - recurrent module. + * `inputNetwork` - input module. + * `rnn` - recurrent module. ]] -function Encoder:__init(inputNetwork, rnn) - self.rnn = rnn - self.inputNet = inputNetwork +def Encoder.__init(inputNetwork, rnn): + self.rnn = rnn + self.inputNet = inputNetwork - self.args = {} - self.args.rnnSize = self.rnn.outputSize - self.args.numEffectiveLayers = self.rnn.numEffectiveLayers + self.args = {} + self.args.rnnSize = self.rnn.outputSize + self.args.numEffectiveLayers = self.rnn.numEffectiveLayers - parent.__init(self, self:_buildModel()) + parent.__init(self, self._buildModel()) - self:resetPreallocation() -end + self.resetPreallocation() ---[[ Return a new Encoder using the serialized data `pretrained`. ]] -function Encoder.load(pretrained) - local self = torch.factory('onmt.Encoder')() - self.args = pretrained.args - parent.__init(self, pretrained.modules[1]) +#" Return a new Encoder using the serialized data `pretrained`. " +def Encoder.load(pretrained): + self = torch.factory('onmt.Encoder')() - self:resetPreallocation() + self.args = pretrained.args + parent.__init(self, pretrained.modules[1]) - return self -end + self.resetPreallocation() ---[[ Return data to serialize. 
]] -function Encoder:serialize() - return { - modules = self.modules, - args = self.args - } -end + return self -function Encoder:resetPreallocation() - -- Prototype for preallocated hidden and cell states. - self.stateProto = torch.Tensor() - -- Prototype for preallocated output gradients. - self.gradOutputProto = torch.Tensor() +#" Return data to serialize. " +def Encoder.serialize(): + return { + modules = self.modules, + args = self.args + } + + +def Encoder.resetPreallocation(): + # Prototype for preallocated hidden and cell states. + self.stateProto = torch.Tensor() + + # Prototype for preallocated output gradients. + self.gradOutputProto = torch.Tensor() + + # Prototype for preallocated context vector. + self.contextProto = torch.Tensor() + + +def Encoder.maskPadding(): + self.maskPad = True + + +#[[ Build one time-step of an encoder + +Returns. An nn-graph mapping + + $${(c^1_{t-1}, h^1_{t-1}, .., c^L_{t-1}, h^L_{t-1}, x_t) => + (c^1_{t}, h^1_{t}, .., c^L_{t}, h^L_{t})}$$ + + Where $$c^l$$ and $$h^l$$ are the hidden and cell states at each layer, + $$x_t$$ is a sparse word to lookup. +#]] +def Encoder._buildModel(): + inputs = {} + states = {} + + # Inputs are previous layers first. + for _ = 1, self.args.numEffectiveLayers: + h0 = nn.Identity()() # batchSize x rnnSize + table.insert(inputs, h0) + table.insert(states, h0) + + + # Input word. + x = nn.Identity()() # batchSize + table.insert(inputs, x) + + # Compute input network. + input = self.inputNet(x) + table.insert(states, input) + + # Forward states and input into the RNN. + outputs = self.rnn(states) + return nn.gModule(inputs, { outputs }) + + +#[[Compute the context representation of an input. + +Parameters. + + * `batch` - as defined in batch.lua. + +Returns. + + 1. - final hidden states + 2. - context matrix H +#]] +def Encoder.forward(batch): + + # TODO. Change `batch` to `input`. + + finalStates + outputSize = self.args.rnnSize + + if self.statesProto == None: + self.statesProto = onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, + self.stateProto, + { batch.size, outputSize }) + + + # Make initial states h_0. + states = onmt.utils.Tensor.reuseTensorTable(self.statesProto, { batch.size, outputSize }) + + # Preallocated output matrix. + context = onmt.utils.Tensor.reuseTensor(self.contextProto, + { batch.size, batch.sourceLength, outputSize }) + + if self.maskPad and not batch.sourceInputPadLeft: + finalStates = onmt.utils.Tensor.recursiveClone(states) + + if self.train: + self.inputs = {} + + + # Act like nn.Sequential and call each clone in a feed-forward + # fashion. + for t = 1, batch.sourceLength: + + # Construct "inputs". Prev states come first then source. + inputs = {} + onmt.utils.Table.app(inputs, states) + table.insert(inputs, batch.getSourceInput(t)) + + if self.train: + # Remember inputs for the backward pass. + self.inputs[t] = inputs + + states = self.net(t):forward(inputs) + + # Special case padding. + if self.maskPad: + for b = 1, batch.size: + if batch.sourceInputPadLeft and t <= batch.sourceLength - batch.sourceSize[b]: + for j = 1, len(states): + states[j][b].zero() + + elif not batch.sourceInputPadLeft and t == batch.sourceSize[b]: + for j = 1, len(states): + finalStates[j][b].copy(states[j][b]) + + + + + + # Copy output (h^L_t = states[len(states])) to context. + context[{{}, t}].copy(states[len(states])) + + + if finalStates == None: + finalStates = states + + + return finalStates, context + + +#[[ Backward pass (only called during training) + + Parameters. 
+ + * `batch` - must be same as for forward + * `gradStatesOutput` gradient of loss wrt last state + * `gradContextOutput` - gradient of loss wrt full context. + + Returns. `gradInputs` of input network. +#]] +def Encoder.backward(batch, gradStatesOutput, gradContextOutput): + # TODO. change this to (input, gradOutput) as in nngraph. + outputSize = self.args.rnnSize + if self.gradOutputsProto == None: + self.gradOutputsProto = onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, + self.gradOutputProto, + { batch.size, outputSize }) + + + gradStatesInput = onmt.utils.Tensor.copyTensorTable(self.gradOutputsProto, gradStatesOutput) + gradInputs = {} + + for t = batch.sourceLength, 1, -1: + # Add context gradients to last hidden states gradients. + gradStatesInput[len(gradStatesInput].add(gradContextOutput[{{}), t}]) + + gradInput = self.net(t):backward(self.inputs[t], gradStatesInput) + + # Prepare next encoder output gradients. + for i = 1, len(gradStatesInput): + gradStatesInput[i].copy(gradInput[i]) + + + # Gather gradients of all user inputs. + gradInputs[t] = {} + for i = len(gradStatesInput) + 1, #gradInput: + table.insert(gradInputs[t], gradInput[i]) + + + if len(gradInputs[t]) == 1: + gradInputs[t] = gradInputs[t][1] + + + # TODO. make these names clearer. + # Useful if input came from another network. + return gradInputs - -- Prototype for preallocated context vector. - self.contextProto = torch.Tensor() -end - -function Encoder:maskPadding() - self.maskPad = true -end - ---[[ Build one time-step of an encoder - -Returns: An nn-graph mapping - - $${(c^1_{t-1}, h^1_{t-1}, .., c^L_{t-1}, h^L_{t-1}, x_t) => - (c^1_{t}, h^1_{t}, .., c^L_{t}, h^L_{t})}$$ - - Where $$c^l$$ and $$h^l$$ are the hidden and cell states at each layer, - $$x_t$$ is a sparse word to lookup. ---]] -function Encoder:_buildModel() - local inputs = {} - local states = {} - - -- Inputs are previous layers first. - for _ = 1, self.args.numEffectiveLayers do - local h0 = nn.Identity()() -- batchSize x rnnSize - table.insert(inputs, h0) - table.insert(states, h0) - end - -- Input word. - local x = nn.Identity()() -- batchSize - table.insert(inputs, x) - - -- Compute input network. - local input = self.inputNet(x) - table.insert(states, input) - - -- Forward states and input into the RNN. - local outputs = self.rnn(states) - return nn.gModule(inputs, { outputs }) -end - ---[[Compute the context representation of an input. - -Parameters: - - * `batch` - as defined in batch.lua. - -Returns: - - 1. - final hidden states - 2. - context matrix H ---]] -function Encoder:forward(batch) - - -- TODO: Change `batch` to `input`. - - local finalStates - local outputSize = self.args.rnnSize - - if self.statesProto == nil then - self.statesProto = onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, - self.stateProto, - { batch.size, outputSize }) - end - - -- Make initial states h_0. - local states = onmt.utils.Tensor.reuseTensorTable(self.statesProto, { batch.size, outputSize }) - - -- Preallocated output matrix. - local context = onmt.utils.Tensor.reuseTensor(self.contextProto, - { batch.size, batch.sourceLength, outputSize }) - - if self.maskPad and not batch.sourceInputPadLeft then - finalStates = onmt.utils.Tensor.recursiveClone(states) - end - if self.train then - self.inputs = {} - end - - -- Act like nn.Sequential and call each clone in a feed-forward - -- fashion. - for t = 1, batch.sourceLength do - - -- Construct "inputs". Prev states come first then source. 
- local inputs = {} - onmt.utils.Table.append(inputs, states) - table.insert(inputs, batch:getSourceInput(t)) - - if self.train then - -- Remember inputs for the backward pass. - self.inputs[t] = inputs - end - states = self:net(t):forward(inputs) - - -- Special case padding. - if self.maskPad then - for b = 1, batch.size do - if batch.sourceInputPadLeft and t <= batch.sourceLength - batch.sourceSize[b] then - for j = 1, #states do - states[j][b]:zero() - end - elseif not batch.sourceInputPadLeft and t == batch.sourceSize[b] then - for j = 1, #states do - finalStates[j][b]:copy(states[j][b]) - end - end - end - end - - -- Copy output (h^L_t = states[#states]) to context. - context[{{}, t}]:copy(states[#states]) - end - - if finalStates == nil then - finalStates = states - end - - return finalStates, context -end - ---[[ Backward pass (only called during training) - - Parameters: - - * `batch` - must be same as for forward - * `gradStatesOutput` gradient of loss wrt last state - * `gradContextOutput` - gradient of loss wrt full context. - - Returns: `gradInputs` of input network. ---]] -function Encoder:backward(batch, gradStatesOutput, gradContextOutput) - -- TODO: change this to (input, gradOutput) as in nngraph. - local outputSize = self.args.rnnSize - if self.gradOutputsProto == nil then - self.gradOutputsProto = onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, - self.gradOutputProto, - { batch.size, outputSize }) - end - - local gradStatesInput = onmt.utils.Tensor.copyTensorTable(self.gradOutputsProto, gradStatesOutput) - local gradInputs = {} - - for t = batch.sourceLength, 1, -1 do - -- Add context gradients to last hidden states gradients. - gradStatesInput[#gradStatesInput]:add(gradContextOutput[{{}, t}]) - - local gradInput = self:net(t):backward(self.inputs[t], gradStatesInput) - - -- Prepare next encoder output gradients. - for i = 1, #gradStatesInput do - gradStatesInput[i]:copy(gradInput[i]) - end - - -- Gather gradients of all user inputs. - gradInputs[t] = {} - for i = #gradStatesInput + 1, #gradInput do - table.insert(gradInputs[t], gradInput[i]) - end - - if #gradInputs[t] == 1 then - gradInputs[t] = gradInputs[t][1] - end - end - -- TODO: make these names clearer. - -- Useful if input came from another network. - return gradInputs - -end diff --git a/OpenNMT/onmt/modules/FeaturesEmbedding.py b/OpenNMT/onmt/modules/FeaturesEmbedding.py index bf3922f524..5e3c4fa55a 100644 --- a/OpenNMT/onmt/modules/FeaturesEmbedding.py +++ b/OpenNMT/onmt/modules/FeaturesEmbedding.py @@ -1,63 +1,29 @@ ---[[ - A nngraph unit that maps features ids to embeddings. When using multiple - features this can be the concatenation or the sum of each individual embedding. 
-]] -local FeaturesEmbedding, parent = torch.class('onmt.FeaturesEmbedding', 'nn.Container') - -function FeaturesEmbedding:__init(dicts, dimExponent, dim, merge) - parent.__init(self) - - self.net = self:_buildModel(dicts, dimExponent, dim, merge) - self:add(self.net) -end - -function FeaturesEmbedding:_buildModel(dicts, dimExponent, dim, merge) - local inputs = {} - local output - - if merge == 'sum' then - self.outputSize = dim - else - self.outputSize = 0 - end - - for i = 1, #dicts do - local feat = nn.Identity()() -- batchSize - table.insert(inputs, feat) - - local vocabSize = dicts[i]:size() - local embSize - - if merge == 'sum' then - embSize = self.outputSize - else - embSize = math.floor(vocabSize ^ dimExponent) - self.outputSize = self.outputSize + embSize - end - - local emb = nn.LookupTable(vocabSize, embSize)(feat) - - if not output then - output = emb - elseif merge == 'sum' then - output = nn.CAddTable()({output, emb}) - else - output = nn.JoinTable(2)({output, emb}) - end - end - - return nn.gModule(inputs, {output}) -end - -function FeaturesEmbedding:updateOutput(input) - self.output = self.net:updateOutput(input) - return self.output -end - -function FeaturesEmbedding:updateGradInput(input, gradOutput) - return self.net:updateGradInput(input, gradOutput) -end - -function FeaturesEmbedding:accGradParameters(input, gradOutput, scale) - self.net:accGradParameters(input, gradOutput, scale) -end +class FeaturesEmbeddding(nn.Container): + + def __init__(self, dicts, dimExponent, dim, merge): + super(FeaturesEmbedding, self).__init__(): + + self.merge = merge + self.luts = [] + self.outputSize = dim if merge == 'sum' else 0 + for i, dict in dicts.enumerate(): + vocabSize = dict.size() + if merge == 'sum': + embSize = dim + else: + embSize = math.floor(math.pow(vocabSize, dimExponent)) + self.outputSize += embSize + + lut = nn.LookupTable(vocabSize, embSize) + self.luts += [] + self.add_module('lut_%d' % i, lut) + + def forward(self, input): + embs = [] + for i in range(input.size(1)): + embs += [self.luts[i](input.select(1, i))] + + if self.merge == 'sum': + return sum(embs) + else: + return torch.cat(embs, 1) diff --git a/OpenNMT/onmt/modules/FeaturesGenerator.py b/OpenNMT/onmt/modules/FeaturesGenerator.py index c76812ed43..387658180d 100644 --- a/OpenNMT/onmt/modules/FeaturesGenerator.py +++ b/OpenNMT/onmt/modules/FeaturesGenerator.py @@ -1,53 +1,38 @@ ---[[ Feature decoder generator. Given RNN state, produce categorical distribution over +#[[ Feature decoder generator. Given RNN state, produce categorical distribution over tokens and features. - Implements $$[softmax(W^1 h + b^1), softmax(W^2 h + b^2), ..., softmax(W^n h + b^n)] $$. ---]] + Implements $$[softmax(W^1 h + b^1), softmax(W^2 h + b^2), ..., softmax(W^n h + b^n)] $$. +#]] -local FeaturesGenerator, parent = torch.class('onmt.FeaturesGenerator', 'nn.Container') +FeaturesGenerator, parent = torch.class('onmt.FeaturesGenerator', 'nn.Container') ---[[ -Parameters: +#[[ +Parameters. - * `rnnSize` - Input rnn size. - * `outputSize` - Output size (number of tokens). - * `features` - table of feature sizes. ---]] -function FeaturesGenerator:__init(rnnSize, outputSize, features) - parent.__init(self) - self.net = self:_buildGenerator(rnnSize, outputSize, features) - self:add(self.net) -end + * `rnnSize` - Input rnn size. + * `outputSize` - Output size (number of tokens). + * `features` - table of feature sizes. 
+#]] +def FeaturesGenerator.__init(rnnSize, outputSize, features): + parent.__init(self) + self.net = self._buildGenerator(rnnSize, outputSize, features) + self.add(self.net) -function FeaturesGenerator:_buildGenerator(rnnSize, outputSize, features) - local generator = nn.ConcatTable() - -- Add default generator. - generator:add(nn.Sequential() - :add(onmt.Generator(rnnSize, outputSize)) - :add(nn.SelectTable(1))) +def FeaturesGenerator._buildGenerator(rnnSize, outputSize, features): + generator = nn.ConcatTable() - -- Add a generator for each target feature. - for i = 1, #features do - generator:add(nn.Sequential() - :add(nn.Linear(rnnSize, features[i]:size())) - :add(nn.LogSoftMax())) - end + # Add default generator. + generator.add(nn.Sequential() + .add(nn.Linear(rnnSize, outputSize)) + .add(nn.LogSoftMax()) + .add(nn.SelectTable(1))) - return generator -end + # Add a generator for each target feature. + for i = 1, len(features): + generator.add(nn.Sequential() + .add(nn.Linear(rnnSize, features[i]:size())) + .add(nn.LogSoftMax())) -function FeaturesGenerator:updateOutput(input) - self.output = self.net:updateOutput(input) - return self.output -end - -function FeaturesGenerator:updateGradInput(input, gradOutput) - self.gradInput = self.net:updateGradInput(input, gradOutput) - return self.gradInput -end - -function FeaturesGenerator:accGradParameters(input, gradOutput, scale) - self.net:accGradParameters(input, gradOutput, scale) -end + return generator diff --git a/OpenNMT/onmt/modules/GlobalAttention.py b/OpenNMT/onmt/modules/GlobalAttention.py index c80b41535d..74442082fc 100644 --- a/OpenNMT/onmt/modules/GlobalAttention.py +++ b/OpenNMT/onmt/modules/GlobalAttention.py @@ -1,74 +1,50 @@ -require('nngraph') - ---[[ Global attention takes a matrix and a query vector. It +""" +Global attention takes a matrix and a query vector. It then computes a parameterized convex combination of the matrix based on the input query. - H_1 H_2 H_3 ... H_n - q q q q - | | | | - \ | | / - ..... - \ | / - a - -Constructs a unit mapping: - $$(H_1 .. H_n, q) => (a)$$ - Where H is of `batch x n x dim` and q is of `batch x dim`. - - The full function is $$\tanh(W_2 [(softmax((W_1 q + b_1) H) H), q] + b_2)$$. - ---]] -local GlobalAttention, parent = torch.class('onmt.GlobalAttention', 'nn.Container') - ---[[A nn-style module computing attention. + H_1 H_2 H_3 ... H_n + q q q q + | | | | + \ | | / + ..... + \ | / + a - Parameters: +Constructs a unit mapping. + $$(H_1 + H_n, q) => (a)$$ + Where H is of `batch x n x dim` and q is of `batch x dim`. - * `dim` - dimension of the context vectors. ---]] -function GlobalAttention:__init(dim) - parent.__init(self) - self.net = self:_buildModel(dim) - self:add(self.net) -end + The full def is $$\tanh(W_2 [(softmax((W_1 q + b_1) H) H), q] + b_2)$$.: -function GlobalAttention:_buildModel(dim) - local inputs = {} - table.insert(inputs, nn.Identity()()) - table.insert(inputs, nn.Identity()()) +""" - local targetT = nn.Linear(dim, dim, false)(inputs[1]) -- batchL x dim - local context = inputs[2] -- batchL x sourceTimesteps x dim +import torch +import torch.nn as nn - -- Get attention. 
-  local attn = nn.MM()({context, nn.Replicate(1,3)(targetT)}) -- batchL x sourceL x 1
-  attn = nn.Sum(3)(attn)
-  local softmaxAttn = nn.SoftMax()
-  softmaxAttn.name = 'softmaxAttn'
-  attn = softmaxAttn(attn)
-  attn = nn.Replicate(1,2)(attn) -- batchL x 1 x sourceL
+class GlobalAttention(nn.Container):
+    def __init__(self, dim):
+        super(GlobalAttention, self).__init__(
+            linear_in=nn.Linear(dim, dim, bias=False),
+            sm=nn.Softmax(),
+            linear_out=nn.Linear(dim*2, dim, bias=False),
+            tanh=nn.Tanh(),
+        )
-  -- Apply attention to context.
-  local contextCombined = nn.MM()({attn, context}) -- batchL x 1 x dim
-  contextCombined = nn.Sum(2)(contextCombined) -- batchL x dim
-  contextCombined = nn.JoinTable(2)({contextCombined, inputs[1]}) -- batchL x dim*2
-  local contextOutput = nn.Tanh()(nn.Linear(dim*2, dim, false)(contextCombined))
+    def forward(self, input, context):
+        """
+        input: batch x dim
+        context: batch x sourceL x dim
+        """
+        targetT = self.linear_in(input).unsqueeze(2)  # batch x dim x 1
-  return nn.gModule(inputs, {contextOutput})
-end
+        # Get attention
+        attn = torch.bmm(context, targetT).squeeze(2)  # batch x sourceL
-function GlobalAttention:updateOutput(input)
-  self.output = self.net:updateOutput(input)
-  return self.output
-end
+        softmaxAttn = self.sm(attn)
-function GlobalAttention:updateGradInput(input, gradOutput)
-  self.gradInput = self.net:updateGradInput(input, gradOutput)
-  return self.gradInput
-end
+        softmaxAttn = softmaxAttn.view(attn.size(0), 1, attn.size(1))  # batch x 1 x sourceL
+        contextCombined = torch.bmm(softmaxAttn, context).squeeze(1)  # batch x dim
+        contextCombined = torch.cat([contextCombined, input], 1)  # batch x dim*2
-function GlobalAttention:accGradParameters(input, gradOutput, scale)
-  return self.net:accGradParameters(input, gradOutput, scale)
-end
+        contextOutput = self.tanh(self.linear_out(contextCombined))
+        return contextOutput
diff --git a/OpenNMT/onmt/train/Optim.py b/OpenNMT/onmt/train/Optim.py
index df1e1a2b1b..69bc1b16c9 100644
--- a/OpenNMT/onmt/train/Optim.py
+++ b/OpenNMT/onmt/train/Optim.py
@@ -18,7 +18,7 @@ def Optim.__init__(self, method, lr, lr_decay=1, start_decay_at=None):
         else:
             raise RuntimeError("Invalid optim method: " + self.method)
-    def Optim.prepareGrad(self, params, max_grad_norm):
+    def Optim.step(self, params, max_grad_norm):
         # Compute gradients norm.
grad_norm = 0 for param in params: diff --git a/OpenNMT/onmt/utils/Parallel.py b/OpenNMT/onmt/utils/Parallel.py index 93eaf93bf2..15209f3045 100644 --- a/OpenNMT/onmt/utils/Parallel.py +++ b/OpenNMT/onmt/utils/Parallel.py @@ -19,7 +19,7 @@ def __init__(self, nthreads): def launch(self, label, closure, args=None, endcallback=None): if label is not None: - print("START",label) + print("START", label) for j in range(self.count): if self.nthreads == 0: diff --git a/OpenNMT/train.py b/OpenNMT/train.py index a0e1c25a18..78ba7472ef 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -4,6 +4,8 @@ import argparse import os import torch +import torch.nn as nn +from torch.autograd import Variable parser = argparse.ArgumentParser(description='train.lua') @@ -33,7 +35,7 @@ parser.add_argument('-input_feed', type=int, default=1, help="Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.") parser.add_argument('-residual', action="store_true", help="Add residual connections between RNN layers.") parser.add_argument('-brnn', action="store_true", help="Use a bidirectional encoder") -parser.add_argument('-brnn_merge', default='sum', help="Merge action for the bidirectional hidden states: concat or sum") +parser.add_argument('-brnn_merge', default='concat', help="Merge action for the bidirectional hidden states: concat or sum") ## ## **Optimization options** @@ -61,8 +63,8 @@ parser.add_argument('-pre_word_vecs_dec', help="""If a valid path is specified, then this will load pretrained word embeddings on the decoder side. See README for specific formatting instructions.""") -parser.add_argument('-fix_word_vecs_enc', action="store_true", help="Fix word embeddings on the encoder side") -parser.add_argument('-fix_word_vecs_dec', action="store_true", help="Fix word embeddings on the decoder side") +# parser.add_argument('-fix_word_vecs_enc', action="store_true", help="Fix word embeddings on the encoder side") +# parser.add_argument('-fix_word_vecs_dec', action="store_true", help="Fix word embeddings on the decoder side") ## ## **Other options** @@ -70,8 +72,8 @@ # GPU parser.add_argument('-gpuid', type=int, default=-1, help="Which gpu to use (1-indexed). < 1 = use CPU") -parser.add_argument('-nparallel', type=int, default=1, help="""When using GPUs, how many batches to execute in parallel. - Note. this will technically change the final batch size to max_batch_size*nparallel.""") +# parser.add_argument('-nparallel', type=int, default=1, help="""When using GPUs, how many batches to execute in parallel. +# Note. 
this will technically change the final batch size to max_batch_size*nparallel.""")
 parser.add_argument('-no_nccl', action="store_true", help="Disable usage of nccl in parallel mode.")
 parser.add_argument('-disable_mem_optimization', action="store_true", help="""Disable sharing internal of internal buffers between clones - which is in general safe, except if you want to look inside clones for visualization purpose for instance.""")
@@ -85,104 +87,54 @@ opt = parser.parse_args()
-# pool = onmt.utils.Parallel.ThreadPool(opt.nparallel)
-def initParams(model, verbose):
-    numParams = 0
-    params, gradParams = {}, {}
+class NMTCriterion(nn.Container):
+    def __init__(self, vocabSize, features):
+        super(NMTCriterion, self).__init__()
+        self.sub = []
+
+        def makeOne(size):
+            weight = torch.ones(size)
+            weight[onmt.Constants.PAD] = 0
+            return nn.NLLLoss(weight)
-    if verbose:
-        print('Initializing parameters...')
+        self.sub += [makeOne(vocabSize)]
+        for feature in features:
+            self.sub += [makeOne(feature.size())]
-    for mod in model.values():
-        p, gp = mod.getParameters()
-
-        if opt.train_from.len() == 0:
-            p.uniform(-opt.param_init, opt.param_init)
-
-        numParams = numParams + p.size(0)
-        params += [p]
-        gradParams += [gp]
-
-    if verbose:
-        print(" * number of parameters. " + numParams)
-
-    return params, gradParams
-
-
-def buildCriterion(vocabSize, features):
-    criterion = nn.ParallelCriterion(False)
-
-    def addNllCriterion(size):
-        # Ignores padding value.
-        w = torch.ones(size)
-        w[onmt.Constants.PAD] = 0
-
-        nll = nn.ClassNLLCriterion(w)
-
-        # Let the training code manage loss normalization.
-        nll.sizeAverage = False
-        criterion.add(nll)
-
-    addNllCriterion(vocabSize)
-
-    for feature in features:
-        addNllCriterion(feature.size())
-
-    return criterion
+    def forward(self, inputs, targets):
+        assert(inputs.size(1) == len(self.sub))
+        loss = Variable(inputs.new(1).zero_())
+        for feat, target, sub in zip(inputs.split(1), targets.split(1), self.sub):
+            loss += sub(feat, target)
+        return loss
 
 
 def eval(model, criterion, data):
     loss = 0
     total = 0
-    model.encoder.evaluate()
-    model.decoder.evaluate()
+    model.evaluate()
 
     for i in range(data.batchCount()):
         batch = onmt.utils.Cuda.convert(data.getBatch(i))
-        encoderStates, context = model.encoder.forward(batch)
-        loss = loss + model.decoder.computeLoss(batch, encoderStates, context, criterion)
+        outputs = model.forward(batch)
+        loss += criterion.forward(outputs, batch.getTargetOutput())
         total = total + batch.targetNonZeros
 
-    model.encoder.training()
-    model.decoder.training()
+    model.training()
 
     return math.exp(loss / total)
 
 
 def trainModel(model, trainData, validData, dataset, info):
-    params, gradParams = {}, {}
-
-    def initParams(idx, args, state):
-        # Only logs information of the first thread.
- verbose = idx == 0 and not opt.json_log - model = state['model'] - - params, gradParams = initParams(model, verbose) - for mod in model.values(): - mod.training() - - # define criterion of each GPU - state['criterion'] = onmt.utils.Cuda.convert(buildCriterion(dataset.dicts.tgt.words.size(), - dataset.dicts.tgt.features)) - # # optimize memory of the first clone - # if not opt.disable_mem_optimization: - # batch = onmt.utils.Cuda.convert(trainData.getBatch(1)) - # batch.totalSize = batch.size - # onmt.utils.Memory.optimize(model, criterion, batch, verbose) - - return idx, state['criterion'], params, gradParams - - def _endcallback(args): - idx, thecriterion, theparams, thegradParams = args - if idx == 0: - criterion = thecriterion - params[idx] = theparams - gradParams[idx] = thegradParams + for mod in model.values(): + mod.training() + for p in mod.parameters(): + p.uniform_(-opt.param_init, opt.param_init) - pool.launch(None, initParams, endcallback=_endcallback) + # define criterion of each GPU + criterion = onmt.utils.Cuda.convert(buildCriterion(dataset.dicts.tgt.words.size(), + dataset.dicts.tgt.features)) optim = onmt.train.Optim( opt.optim, opt.learning_rate, @@ -190,81 +142,48 @@ def _endcallback(args): start_decay_at=opt.start_decay_at ) - checkpoint = onmt.train.Checkpoint.new(opt, model, optim, dataset) + # checkpoint = onmt.train.Checkpoint.new(opt, model, optim, dataset) def trainEpoch(epoch, lastValidPpl): startI = opt.start_iteration - numIterations = math.ceil(trainData.batchCount() / pool.count) - if startI > 1 and info != None: - epochState = onmt.train.EpochState.new(epoch, numIterations, optim.getLearningRate(), lastValidPpl, info.epochStatus) - batchOrder = info.batchOrder - else: - epochState = onmt.train.EpochState.new(epoch, numIterations, optim.getLearningRate(), lastValidPpl) - # shuffle mini batch order - batchOrder = torch.randperm(trainData.batchCount()) + # shuffle mini batch order + batchOrder = torch.randperm(trainData.batchCount()) opt.start_iteration = 1 ii = 1 - def trainOne(idx, args, state): - - batch = args[idx] - if batch is None: - return idx, 0 - - # send batch data to GPU - onmt.utils.Cuda.convert(batch) - batch.totalSize = totalSize - - optim.zeroGrad(gradParams) - - encStates, context = model['encoder'].forward(batch) - decOutputs = model['decoder'].forward(batch, encStates, context) - - encGradStatesOut, gradContext, loss = model['decoder'].backward(batch, decOutputs, criterion) - model['encoder'].backward(batch, encGradStatesOut, gradContext) - return idx, loss - - for i in range(startI, trainData.batchCount(), onmt.utils.Parallel.count): - batches = {} + for i in range(startI, trainData.batchCount()): totalSize = 0 - for j in range(math.min(onmt.utils.Parallel.count, trainData.batchCount()-i+1)): - batchIdx = batchOrder[i+j-1] - if epoch <= opt.curriculum: - batchIdx = i+j-1 + batchIdx = batchOrder[i] + if epoch <= opt.curriculum: + batchIdx = i - table.insert(batches, trainData.getBatch(batchIdx)) - totalSize = totalSize + batches[-1].size + batch = trainData.getBatch(batchIdx) + totalSize += batch.size - losses = {} + batch.totalSize = totalSize - def _endcallback(idx, loss): - losses[idx] = loss + model.zero_grad() - pool.launch(None, trainOne, args=batches, endcallback=_endcallback) - # accumulate the gradients from the different parallel threads - XXX.accGradParams(gradParams, batches) + outputs = model.forward(batch) + loss = criterion.forward(outputs, batch.getTargetOutput()) + loss.backward() # update the parameters - 
optim.prepareGrad(gradParams[1], opt.max_grad_norm) - optim.updateParams(params[1], gradParams[1]) - - # sync the paramaters with the different parallel threads - XXXsyncParams(params) - - epochState.update(batches, losses) + optim.step(model.params(), opt.max_grad_norm) if ii % opt.report_every == 0: - epochState.log(ii, opt.json_log) + print("Done %d batches" % ii) + pass # FIXME - if opt.save_every > 0 and ii % opt.save_every == 0: - checkpoint.saveIteration(ii, epochState, batchOrder, not opt.json_log) + # if opt.save_every > 0 and ii % opt.save_every == 0: + # checkpoint.saveIteration(ii, epochState, batchOrder, not opt.json_log) - ii = ii + 1 + ii += 1 return epochState validPpl = 0 @@ -283,25 +202,7 @@ def _endcallback(idx, loss): if opt.optim == 'sgd': optim.updateLearningRate(validPpl, epoch) - checkpoint.saveEpoch(validPpl, epochState, not opt.json_log) - - -def buildModel(idx, args, state): - checkpoint = args - model = state['model'] = {} - - if checkpoint.models: - model['encoder'] = onmt.Models.loadEncoder(checkpoint.models.encoder, idx > 1) - model['decoder'] = onmt.Models.loadDecoder(checkpoint.models.decoder, idx > 1) - else: - verbose = idx == 1 and not opt.json_log - model['encoder'] = onmt.Models.buildEncoder(opt, dataset.dicts.src) - model['decoder'] = onmt.Models.buildDecoder(opt, dataset.dicts.tgt, verbose) - - for mod in model.values(): - onmt.utils.Cuda.convert(mod) - - return idx, model + # checkpoint.saveEpoch(validPpl, epochState, not opt.json_log) def main(): @@ -316,7 +217,6 @@ def main(): if not opt.json_log: print('Loading checkpoint \'' + opt.train_from + '\'...') - checkpoint = torch.load(opt.train_from) opt.layers = checkpoint.options.layers @@ -326,7 +226,7 @@ def main(): opt.input_feed = checkpoint.options.input_feed # Resume training from checkpoint - if opt.train_from is not None and opt.cont: + if opt.cont: opt.optim = checkpoint.options.optim opt.learning_rate_decay = checkpoint.options.learning_rate_decay opt.start_decay_at = checkpoint.options.start_decay_at @@ -362,36 +262,38 @@ def main(): (trainData.maxSourceLength, trainData.maxTargetLength)) print(' * number of training sentences. %d' % len(trainData.src)) print(' * maximum batch size. 
%d' % opt.max_batch_size * pool.count) - else: - metadata = dict( - options=opt, - vocabSize=dict( - source=dataset.dicts.src.words.size(), - target=dataset.dicts.tgt.words.size() - ), - additionalFeatures=dict( - source=len(dataset.dicts.src.features), - target=len(dataset.dicts.tgt.features) - ), - sequenceLength=dict( - source=trainData.maxSourceLength, - target=trainData.maxTargetLength - ), - trainingSentences = len(trainData.src) - ) - - onmt.utils.Log.logJson(metadata) - + # else: + # metadata = dict( + # options=opt, + # vocabSize=dict( + # source=dataset.dicts.src.words.size(), + # target=dataset.dicts.tgt.words.size() + # ), + # additionalFeatures=dict( + # source=len(dataset.dicts.src.features), + # target=len(dataset.dicts.tgt.features) + # ), + # sequenceLength=dict( + # source=trainData.maxSourceLength, + # target=trainData.maxTargetLength + # ), + # trainingSentences=len(trainData.src) + # ) + # + # onmt.utils.Log.logJson(metadata) if not opt.json_log: print('Building model...') - def _endcallback(idx, themodel): - if idx == 0: - model = themodel + model = {} + if checkpoint.models: + encoder = onmt.Models.loadEncoder(checkpoint.models.encoder, idx > 1) + decoder = onmt.Models.loadDecoder(checkpoint.models.decoder, idx > 1) + else: + encoder = onmt.Models.buildEncoder(opt, dataset.dicts.src) + decoder = onmt.Models.buildDecoder(opt, dataset.dicts.tgt, not opt.json_log) - onmt.utils.Parallel.launch(None, buildModel, args=checkpoint, - endcallback=_endcallback) + model = nn.Sequential(encoder, decoder) trainModel(model, trainData, validData, dataset, checkpoint.info)
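
The `NMTCriterion` introduced in train.py builds one padding-masked NLL criterion for the target words plus one per target feature and sums them, mirroring the old `buildCriterion`. The following is an illustrative sketch only, not code from this patch: it restates that idea as a self-contained module written against the current PyTorch API (`nn.Module`, `reduction='sum'`) rather than the `nn.Container`/`Variable` API used in the diff, and it assumes a padding index of 0 (standing in for `onmt.Constants.PAD`) and one `(batch, vocab)` tensor of log-probabilities per output stream.

# Illustrative sketch (assumptions noted above); not part of this patch.
import torch
import torch.nn as nn

PAD = 0  # assumed padding index, standing in for onmt.Constants.PAD


class MultiOutputNLL(nn.Module):
    """One NLLLoss per output (words plus each target feature); the losses are
    summed and the padding class gets zero weight so padded steps do not count."""

    def __init__(self, vocab_sizes):
        super(MultiOutputNLL, self).__init__()
        self.losses = nn.ModuleList()
        for size in vocab_sizes:
            weight = torch.ones(size)
            weight[PAD] = 0  # ignore padding tokens
            self.losses.append(nn.NLLLoss(weight=weight, reduction='sum'))

    def forward(self, log_probs, targets):
        # log_probs: list of (batch, vocab_i) log-probability tensors
        # targets:   list of (batch,) gold index tensors
        total = 0
        for crit, pred, gold in zip(self.losses, log_probs, targets):
            total = total + crit(pred, gold)
        return total


if __name__ == '__main__':
    crit = MultiOutputNLL([10, 4])  # target words plus one feature
    preds = [torch.randn(3, 10).log_softmax(-1), torch.randn(3, 4).log_softmax(-1)]
    golds = [torch.tensor([1, 0, 2]), torch.tensor([3, 1, 0])]
    print(crit(preds, golds))

Summing unreduced per-token losses and letting the training loop divide by `batch.totalSize` matches the normalization strategy of the old Lua code (`nll.sizeAverage = false`).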