diff --git a/OpenNMT/.preprocess.lua.swp b/OpenNMT/.preprocess.lua.swp deleted file mode 100644 index 037acd172e..0000000000 Binary files a/OpenNMT/.preprocess.lua.swp and /dev/null differ diff --git a/OpenNMT/onmt/Models.py b/OpenNMT/onmt/Models.py index 3b9a5da3b0..aa1a351ad4 100644 --- a/OpenNMT/onmt/Models.py +++ b/OpenNMT/onmt/Models.py @@ -1,136 +1,87 @@ -local function buildEncoder(opt, dicts) - local inputNetwork = onmt.WordEmbedding.new(dicts.words:size(), -- vocab size - opt.word_vec_size, - opt.pre_word_vecs_enc, - opt.fix_word_vecs_enc) - - local inputSize = opt.word_vec_size - - -- Sequences with features. - if #dicts.features > 0 then - local srcFeatEmbedding = onmt.FeaturesEmbedding.new(dicts.features, - opt.feat_vec_exponent, - opt.feat_vec_size, - opt.feat_merge) - - inputNetwork = nn.Sequential() - :add(nn.ParallelTable() - :add(inputNetwork) - :add(srcFeatEmbedding)) - :add(nn.JoinTable(2)) - - inputSize = inputSize + srcFeatEmbedding.outputSize - end - - if opt.brnn then - -- Compute rnn hidden size depending on hidden states merge action. - local rnnSize = opt.rnn_size - if opt.brnn_merge == 'concat' then - if opt.rnn_size % 2 ~= 0 then - error('in concat mode, rnn_size must be divisible by 2') - end - rnnSize = rnnSize / 2 - elseif opt.brnn_merge == 'sum' then - rnnSize = rnnSize - else - error('invalid merge action ' .. opt.brnn_merge) - end - - local rnn = onmt.LSTM.new(opt.layers, inputSize, rnnSize, opt.dropout, opt.residual) - - return onmt.BiEncoder.new(inputNetwork, rnn, opt.brnn_merge) - else - local rnn = onmt.LSTM.new(opt.layers, inputSize, opt.rnn_size, opt.dropout, opt.residual) - - return onmt.Encoder.new(inputNetwork, rnn) - end -end - -local function buildDecoder(opt, dicts, verbose) - local inputNetwork = onmt.WordEmbedding.new(dicts.words:size(), -- vocab size - opt.word_vec_size, - opt.pre_word_vecs_dec, - opt.fix_word_vecs_dec) - - local inputSize = opt.word_vec_size - - local generator - - -- Sequences with features. - if #dicts.features > 0 then - local tgtFeatEmbedding = onmt.FeaturesEmbedding.new(dicts.features, - opt.feat_vec_exponent, - opt.feat_vec_size, - opt.feat_merge) - - inputNetwork = nn.Sequential() - :add(nn.ParallelTable() - :add(inputNetwork) - :add(tgtFeatEmbedding)) - :add(nn.JoinTable(2)) - - inputSize = inputSize + tgtFeatEmbedding.outputSize - - generator = onmt.FeaturesGenerator.new(opt.rnn_size, dicts.words:size(), dicts.features) - else - generator = onmt.Generator.new(opt.rnn_size, dicts.words:size()) - end - - if opt.input_feed == 1 then - if verbose then - print(" * using input feeding") - end - inputSize = inputSize + opt.rnn_size - end - - local rnn = onmt.LSTM.new(opt.layers, inputSize, opt.rnn_size, opt.dropout, opt.residual) - - return onmt.Decoder.new(inputNetwork, rnn, generator, opt.input_feed == 1) -end - ---[[ This is useful when training from a model in parallel mode: each thread must own its model. 
]]
-local function clonePretrained(model)
-  local clone = {}
-
-  for k, v in pairs(model) do
-    if k == 'modules' then
-      clone.modules = {}
-      for i = 1, #v do
-        table.insert(clone.modules, onmt.utils.Tensor.deepClone(v[i]))
-      end
-    else
-      clone[k] = v
-    end
-  end
-
-  return clone
-end
-
-local function loadEncoder(pretrained, clone)
-  local brnn = #pretrained.modules == 2
-
-  if clone then
-    pretrained = clonePretrained(pretrained)
-  end
-
-  if brnn then
-    return onmt.BiEncoder.load(pretrained)
-  else
-    return onmt.Encoder.load(pretrained)
-  end
-end
-
-local function loadDecoder(pretrained, clone)
-  if clone then
-    pretrained = clonePretrained(pretrained)
-  end
-
-  return onmt.Decoder.load(pretrained)
-end
-
-return {
-  buildEncoder = buildEncoder,
-  buildDecoder = buildDecoder,
-  loadEncoder = loadEncoder,
-  loadDecoder = loadDecoder
-}
+def _makeFeatEmbedder(opt, dicts):
+    return onmt.FeaturesEmbedding(dicts.features,
+                                  opt.feat_vec_exponent,
+                                  opt.feat_vec_size,
+                                  opt.feat_merge)
+
+
+class Encoder(nn.Container):
+
+    def __init__(self, opt, dicts):
+        inputSize = opt.word_vec_size
+        feat_lut = None
+        # Sequences with features.
+        if len(dicts.features) > 0:
+            feat_lut = _makeFeatEmbedder(opt, dicts)
+            inputSize = inputSize + feat_lut.outputSize
+
+        super(Encoder, self).__init__(
+            word_lut=nn.LookupTable(dicts.words.size(), opt.word_vec_size),
+            rnn=nn.LSTM(inputSize, opt.rnnSize,
+                        num_layers=opt.layers,
+                        dropout=opt.dropout,
+                        bidirectional=opt.brnn)
+        )
+
+        if opt.pre_word_vecs_enc is not None:
+            pretrained = torch.load(opt.pre_word_vecs_enc)
+            self.word_lut.weight.copy_(pretrained)
+
+        self.has_features = feat_lut is not None
+        if self.has_features:
+            self.add_module('feat_lut', feat_lut)
+
+    def forward(self, input, hidden):
+        if self.has_features:
+            word_emb = self.word_lut(input[0])
+            feat_emb = self.feat_lut(input[1])
+            emb = torch.cat([word_emb, feat_emb], 1)
+        else:
+            emb = self.word_lut(input)
+
+        outputs, next_hidden = self.rnn(emb, hidden)
+        return outputs, next_hidden
+
+class Decoder(nn.Container):
+
+    def __init__(self, opt, dicts):
+        inputSize = opt.word_vec_size
+        feat_lut = None
+        # Sequences with features.
+        if len(dicts.features) > 0:
+            feat_lut = _makeFeatEmbedder(opt, dicts)
+            inputSize = inputSize + feat_lut.outputSize
+
+        super(Decoder, self).__init__(
+            word_lut=nn.LookupTable(dicts.words.size(), opt.word_vec_size),
+            rnn=nn.LSTM(inputSize, opt.rnnSize,
+                        num_layers=opt.layers,
+                        dropout=opt.dropout),
+            attn=GlobalAttention(opt.rnnSize),
+            dropout=nn.Dropout(opt.dropout)
+        )
+
+        if opt.pre_word_vecs_dec is not None:
+            pretrained = torch.load(opt.pre_word_vecs_dec)
+            self.word_lut.weight.copy_(pretrained)
+
+        self.has_features = feat_lut is not None
+        if self.has_features:
+            self.add_module('feat_lut', feat_lut)
+
+    def forward(self, input, hidden):
+        if self.has_features:
+            word_emb = self.word_lut(input[0])
+            feat_emb = self.feat_lut(input[1])
+            emb = torch.cat([word_emb, feat_emb], 1)
+        else:
+            emb = self.word_lut(input)
+
+        # FIXME: input feeding and the attention context are not wired up yet;
+        # `input_feed` and `context` still need to be passed in by the caller.
+        if self.input_feed:
+            emb = torch.cat([emb, input_feed], 1)  # 1 step
+
+        outputs, next_hidden = self.rnn(emb, hidden)
+
+        attn = self.attn(outputs, context)  # FIXME: per timestep?
+        attn = self.dropout(attn)
+        return attn, next_hidden
diff --git a/OpenNMT/onmt/modules/Decoder.py b/OpenNMT/onmt/modules/Decoder.py
index ff73b826c7..7456e3fe9b 100644
--- a/OpenNMT/onmt/modules/Decoder.py
+++ b/OpenNMT/onmt/modules/Decoder.py
@@ -1,422 +1,422 @@
---[[ Unit to decode a sequence of output tokens.
-
-      .      .      .      .
- | | | | - h_1 => h_2 => h_3 => ... => h_n - | | | | - . . . . - | | | | - h_1 => h_2 => h_3 => ... => h_n - | | | | - | | | | - x_1 x_2 x_3 x_n +#[[ Unit to decode a sequence of output tokens. + + . . . . + | | | | + h_1 => h_2 => h_3 => ... => h_n + | | | | + . . . . + | | | | + h_1 => h_2 => h_3 => ... => h_n + | | | | + | | | | + x_1 x_2 x_3 x_n Inherits from [onmt.Sequencer](onmt+modules+Sequencer). ---]] -local Decoder, parent = torch.class('onmt.Decoder', 'onmt.Sequencer') +#]] +Decoder, parent = torch.class('onmt.Decoder', 'onmt.Sequencer') ---[[ Construct a decoder layer. +#[[ Construct a decoder layer. -Parameters: +Parameters. - * `inputNetwork` - input nn module. - * `rnn` - recurrent module, such as [onmt.LSTM](onmt+modules+LSTM). - * `generator` - optional, an output [onmt.Generator](onmt+modules+Generator). - * `inputFeed` - bool, enable input feeding. ---]] -function Decoder:__init(inputNetwork, rnn, generator, inputFeed) - self.rnn = rnn - self.inputNet = inputNetwork + * `inputNetwork` - input nn module. + * `rnn` - recurrent module, such as [onmt.LSTM](onmt+modules+LSTM). + * `generator` - optional, an output [onmt.Generator](onmt+modules+Generator). + * `inputFeed` - bool, enable input feeding. +#]] +def Decoder.__init(inputNetwork, rnn, generator, inputFeed): + self.rnn = rnn + self.inputNet = inputNetwork - self.args = {} - self.args.rnnSize = self.rnn.outputSize - self.args.numEffectiveLayers = self.rnn.numEffectiveLayers + self.args = {} + self.args.rnnSize = self.rnn.outputSize + self.args.numEffectiveLayers = self.rnn.numEffectiveLayers - self.args.inputIndex = {} - self.args.outputIndex = {} + self.args.inputIndex = {} + self.args.outputIndex = {} - -- Input feeding means the decoder takes an extra - -- vector each time representing the attention at the - -- previous step. - self.args.inputFeed = inputFeed + # Input feeding means the decoder takes an extra + # vector each time representing the attention at the + # previous step. + self.args.inputFeed = inputFeed - parent.__init(self, self:_buildModel()) + parent.__init(self, self._buildModel()) - -- The generator use the output of the decoder sequencer to generate the - -- likelihoods over the target vocabulary. - self.generator = generator - self:add(self.generator) + # The generator use the output of the decoder sequencer to generate the + # likelihoods over the target vocabulary. + self.generator = generator + self.add(self.generator) - self:resetPreallocation() -end - ---[[ Return a new Decoder using the serialized data `pretrained`. ]] -function Decoder.load(pretrained) - local self = torch.factory('onmt.Decoder')() - - self.args = pretrained.args - - parent.__init(self, pretrained.modules[1]) - self.generator = pretrained.modules[2] - self:add(self.generator) + self.resetPreallocation() - self:resetPreallocation() - - return self -end - ---[[ Return data to serialize. ]] -function Decoder:serialize() - return { - modules = self.modules, - args = self.args - } -end - -function Decoder:resetPreallocation() - if self.args.inputFeed then - self.inputFeedProto = torch.Tensor() - end - - -- Prototype for preallocated hidden and cell states. - self.stateProto = torch.Tensor() - - -- Prototype for preallocated output gradients. - self.gradOutputProto = torch.Tensor() - - -- Prototype for preallocated context gradient. 
- self.gradContextProto = torch.Tensor() -end - ---[[ Build a default one time-step of the decoder - -Returns: An nn-graph mapping - - $${(c^1_{t-1}, h^1_{t-1}, .., c^L_{t-1}, h^L_{t-1}, x_t, con/H, if) => - (c^1_{t}, h^1_{t}, .., c^L_{t}, h^L_{t}, a)}$$ - - Where ${c^l}$ and ${h^l}$ are the hidden and cell states at each layer, - ${x_t}$ is a sparse word to lookup, - ${con/H}$ is the context/source hidden states for attention, - ${if}$ is the input feeding, and - ${a}$ is the context vector computed at this timestep. ---]] -function Decoder:_buildModel() - local inputs = {} - local states = {} - - -- Inputs are previous layers first. - for _ = 1, self.args.numEffectiveLayers do - local h0 = nn.Identity()() -- batchSize x rnnSize - table.insert(inputs, h0) - table.insert(states, h0) - end - - local x = nn.Identity()() -- batchSize - table.insert(inputs, x) - self.args.inputIndex.x = #inputs - - local context = nn.Identity()() -- batchSize x sourceLength x rnnSize - table.insert(inputs, context) - self.args.inputIndex.context = #inputs - - local inputFeed - if self.args.inputFeed then - inputFeed = nn.Identity()() -- batchSize x rnnSize - table.insert(inputs, inputFeed) - self.args.inputIndex.inputFeed = #inputs - end - - -- Compute the input network. - local input = self.inputNet(x) - - -- If set, concatenate previous decoder output. - if self.args.inputFeed then - input = nn.JoinTable(2)({input, inputFeed}) - end - table.insert(states, input) - - -- Forward states and input into the RNN. - local outputs = self.rnn(states) - - -- The output of a subgraph is a node: split it to access the last RNN output. - outputs = { outputs:split(self.args.numEffectiveLayers) } - - -- Compute the attention here using h^L as query. - local attnLayer = onmt.GlobalAttention(self.args.rnnSize) - attnLayer.name = 'decoderAttn' - local attnOutput = attnLayer({outputs[#outputs], context}) - if self.rnn.dropout > 0 then - attnOutput = nn.Dropout(self.rnn.dropout)(attnOutput) - end - table.insert(outputs, attnOutput) - return nn.gModule(inputs, outputs) -end - ---[[ Mask padding means that the attention-layer is constrained to - give zero-weight to padding. This is done by storing a reference - to the softmax attention-layer. - - Parameters: - - * See [onmt.MaskedSoftmax](onmt+modules+MaskedSoftmax). ---]] -function Decoder:maskPadding(sourceSizes, sourceLength, beamSize) - if not self.decoderAttn then - self.network:apply(function (layer) - if layer.name == 'decoderAttn' then - self.decoderAttn = layer - end - end) - end - - self.decoderAttn:replace(function(module) - if module.name == 'softmaxAttn' then - local mod - if sourceSizes ~= nil then - mod = onmt.MaskedSoftmax(sourceSizes, sourceLength, beamSize) - else - mod = nn.SoftMax() - end - - mod.name = 'softmaxAttn' - mod:type(module._type) - self.softmaxAttn = mod - return mod - else - return module - end - end) -end - ---[[ Run one step of the decoder. - -Parameters: - - * `input` - input to be passed to inputNetwork. - * `prevStates` - stack of hidden states (batch x layers*model x rnnSize) - * `context` - encoder output (batch x n x rnnSize) - * `prevOut` - previous distribution (batch x #words) - * `t` - current timestep - -Returns: - - 1. `out` - Top-layer hidden state. - 2. `states` - All states. ---]] -function Decoder:forwardOne(input, prevStates, context, prevOut, t) - local inputs = {} - - -- Create RNN input (see sequencer.lua `buildNetwork('dec')`). 
- onmt.utils.Table.append(inputs, prevStates) - table.insert(inputs, input) - table.insert(inputs, context) - local inputSize - if torch.type(input) == 'table' then - inputSize = input[1]:size(1) - else - inputSize = input:size(1) - end - - if self.args.inputFeed then - if prevOut == nil then - table.insert(inputs, onmt.utils.Tensor.reuseTensor(self.inputFeedProto, - { inputSize, self.args.rnnSize })) - else - table.insert(inputs, prevOut) - end - end - - -- Remember inputs for the backward pass. - if self.train then - self.inputs[t] = inputs - end - - local outputs = self:net(t):forward(inputs) - local out = outputs[#outputs] - local states = {} - for i = 1, #outputs - 1 do - table.insert(states, outputs[i]) - end - - return out, states -end - ---[[Compute all forward steps. - - Parameters: - - * `batch` - `Batch` object - * `encoderStates` - - * `context` - - * `func` - Calls `func(out, t)` each timestep. ---]] - -function Decoder:forwardAndApply(batch, encoderStates, context, func) - -- TODO: Make this a private method. - - if self.statesProto == nil then - self.statesProto = onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, - self.stateProto, - { batch.size, self.args.rnnSize }) - end - - local states = onmt.utils.Tensor.copyTensorTable(self.statesProto, encoderStates) - - local prevOut - - for t = 1, batch.targetLength do - prevOut, states = self:forwardOne(batch:getTargetInput(t), states, context, prevOut, t) - func(prevOut, t) - end -end - ---[[Compute all forward steps. - - Parameters: - - * `batch` - a `Batch` object. - * `encoderStates` - a batch of initial decoder states (optional) [0] - * `context` - the context to apply attention to. - - Returns: Table of top hidden state for each timestep. ---]] -function Decoder:forward(batch, encoderStates, context) - encoderStates = encoderStates - or onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, - onmt.utils.Cuda.convert(torch.Tensor()), - { batch.size, self.args.rnnSize }) - if self.train then - self.inputs = {} - end - - local outputs = {} - - self:forwardAndApply(batch, encoderStates, context, function (out) - table.insert(outputs, out) - end) - - return outputs -end - ---[[ Compute the backward update. - -Parameters: - - * `batch` - a `Batch` object - * `outputs` - expected outputs - * `criterion` - a single target criterion object - - Note: This code runs both the standard backward and criterion forward/backward. - It returns both the gradInputs and the loss. - -- ]] -function Decoder:backward(batch, outputs, criterion) - if self.gradOutputsProto == nil then - self.gradOutputsProto = onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers + 1, - self.gradOutputProto, - { batch.size, self.args.rnnSize }) - end - - local gradStatesInput = onmt.utils.Tensor.reuseTensorTable(self.gradOutputsProto, - { batch.size, self.args.rnnSize }) - local gradContextInput = onmt.utils.Tensor.reuseTensor(self.gradContextProto, - { batch.size, batch.sourceLength, self.args.rnnSize }) - - local loss = 0 - - for t = batch.targetLength, 1, -1 do - -- Compute decoder output gradients. - -- Note: This would typically be in the forward pass. - local pred = self.generator:forward(outputs[t]) - local output = batch:getTargetOutput(t) - - loss = loss + criterion:forward(pred, output) - -- Compute the criterion gradient. - local genGradOut = criterion:backward(pred, output) - for j = 1, #genGradOut do - genGradOut[j]:div(batch.totalSize) - end +#" Return a new Decoder using the serialized data `pretrained`. 
" +def Decoder.load(pretrained): + self = torch.factory('onmt.Decoder')() - -- Compute the final layer gradient. - local decGradOut = self.generator:backward(outputs[t], genGradOut) - gradStatesInput[#gradStatesInput]:add(decGradOut) + self.args = pretrained.args - -- Compute the standarad backward. - local gradInput = self:net(t):backward(self.inputs[t], gradStatesInput) - - -- Accumulate encoder output gradients. - gradContextInput:add(gradInput[self.args.inputIndex.context]) - gradStatesInput[#gradStatesInput]:zero() - - -- Accumulate previous output gradients with input feeding gradients. - if self.args.inputFeed and t > 1 then - gradStatesInput[#gradStatesInput]:add(gradInput[self.args.inputIndex.inputFeed]) - end - - -- Prepare next decoder output gradients. - for i = 1, #self.statesProto do - gradStatesInput[i]:copy(gradInput[i]) - end - end + parent.__init(self, pretrained.modules[1]) + self.generator = pretrained.modules[2] + self.add(self.generator) - return gradStatesInput, gradContextInput, loss -end - ---[[ Compute the loss on a batch. - -Parameters: - - * `batch` - a `Batch` to score. - * `encoderStates` - initialization of decoder. - * `context` - the attention context. - * `criterion` - a pointwise criterion. - ---]] -function Decoder:computeLoss(batch, encoderStates, context, criterion) - encoderStates = encoderStates - or onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, - onmt.utils.Cuda.convert(torch.Tensor()), - { batch.size, self.args.rnnSize }) - - local loss = 0 - self:forwardAndApply(batch, encoderStates, context, function (out, t) - local pred = self.generator:forward(out) - local output = batch:getTargetOutput(t) - loss = loss + criterion:forward(pred, output) - end) - - return loss -end + self.resetPreallocation() + return self ---[[ Compute the score of a batch. -Parameters: - - * `batch` - a `Batch` to score. - * `encoderStates` - initialization of decoder. - * `context` - the attention context. - ---]] -function Decoder:computeScore(batch, encoderStates, context) - encoderStates = encoderStates - or onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, - onmt.utils.Cuda.convert(torch.Tensor()), - { batch.size, self.args.rnnSize }) - - local score = {} - - self:forwardAndApply(batch, encoderStates, context, function (out, t) - local pred = self.generator:forward(out) - for b = 1, batch.size do - if t <= batch.targetSize[b] then - score[b] = (score[b] or 0) + pred[1][b][batch.targetOutput[t][b]] - end - end - end) +#" Return data to serialize. " +def Decoder.serialize(): + return { + modules = self.modules, + args = self.args + } + + +def Decoder.resetPreallocation(): + if self.args.inputFeed: + self.inputFeedProto = torch.Tensor() + + + # Prototype for preallocated hidden and cell states. + self.stateProto = torch.Tensor() + + # Prototype for preallocated output gradients. + self.gradOutputProto = torch.Tensor() + + # Prototype for preallocated context gradient. + self.gradContextProto = torch.Tensor() + + +#[[ Build a default one time-step of the decoder + +Returns. An nn-graph mapping + + $${(c^1_{t-1}, h^1_{t-1}, .., c^L_{t-1}, h^L_{t-1}, x_t, con/H, if) => + (c^1_{t}, h^1_{t}, .., c^L_{t}, h^L_{t}, a)}$$ + + Where ${c^l}$ and ${h^l}$ are the hidden and cell states at each layer, + ${x_t}$ is a sparse word to lookup, + ${con/H}$ is the context/source hidden states for attention, + ${if}$ is the input feeding, and + ${a}$ is the context vector computed at this timestep. 
+#]] +def Decoder._buildModel(): + inputs = {} + states = {} + + # Inputs are previous layers first. + for _ = 1, self.args.numEffectiveLayers: + h0 = nn.Identity()() # batchSize x rnnSize + table.insert(inputs, h0) + table.insert(states, h0) + + + x = nn.Identity()() # batchSize + table.insert(inputs, x) + self.args.inputIndex.x = len(inputs) + + context = nn.Identity()() # batchSize x sourceLength x rnnSize + table.insert(inputs, context) + self.args.inputIndex.context = len(inputs) + + inputFeed + if self.args.inputFeed: + inputFeed = nn.Identity()() # batchSize x rnnSize + table.insert(inputs, inputFeed) + self.args.inputIndex.inputFeed = len(inputs) + + + # Compute the input network. + input = self.inputNet(x) + + # If set, concatenate previous decoder output. + if self.args.inputFeed: + input = nn.JoinTable(2)({input, inputFeed}) + + table.insert(states, input) + + # Forward states and input into the RNN. + outputs = self.rnn(states) + + # The output of a subgraph is a node. split it to access the last RNN output. + outputs = { outputs.split(self.args.numEffectiveLayers) } + + # Compute the attention here using h^L as query. + attnLayer = onmt.GlobalAttention(self.args.rnnSize) + attnLayer.name = 'decoderAttn' + attnOutput = attnLayer({outputs[len(outputs]), context}) + if self.rnn.dropout > 0: + attnOutput = nn.Dropout(self.rnn.dropout)(attnOutput) + + table.insert(outputs, attnOutput) + return nn.gModule(inputs, outputs) + + +#[[ Mask padding means that the attention-layer is constrained to + give zero-weight to padding. This is done by storing a reference + to the softmax attention-layer. + + Parameters. + + * See [onmt.MaskedSoftmax](onmt+modules+MaskedSoftmax). +#]] +def Decoder.maskPadding(sourceSizes, sourceLength, beamSize): + if not self.decoderAttn: + self.network.apply(def (layer): + if layer.name == 'decoderAttn': + self.decoderAttn = layer + + ) + + + self.decoderAttn.replace(function(module) + if module.name == 'softmaxAttn': + mod + if sourceSizes != None: + mod = onmt.MaskedSoftmax(sourceSizes, sourceLength, beamSize) + else: + mod = nn.SoftMax() + + + mod.name = 'softmaxAttn' + mod.type(module._type) + self.softmaxAttn = mod + return mod + else: + return module + + ) + + +#[[ Run one step of the decoder. + +Parameters. + + * `input` - input to be passed to inputNetwork. + * `prevStates` - stack of hidden states (batch x layers*model x rnnSize) + * `context` - encoder output (batch x n x rnnSize) + * `prevOut` - previous distribution (batch x len(words)) + * `t` - current timestep + +Returns. + + 1. `out` - Top-layer hidden state. + 2. `states` - All states. +#]] +def Decoder.forwardOne(input, prevStates, context, prevOut, t): + inputs = {} + + # Create RNN input (see sequencer.lua `buildNetwork('dec')`). + onmt.utils.Table.app(inputs, prevStates) + table.insert(inputs, input) + table.insert(inputs, context) + inputSize + if torch.type(input) == 'table': + inputSize = input[1].size(1) + else: + inputSize = input.size(1) + + + if self.args.inputFeed: + if prevOut == None: + table.insert(inputs, onmt.utils.Tensor.reuseTensor(self.inputFeedProto, + { inputSize, self.args.rnnSize })) + else: + table.insert(inputs, prevOut) + + + + # Remember inputs for the backward pass. + if self.train: + self.inputs[t] = inputs + + + outputs = self.net(t):forward(inputs) + out = outputs[len(outputs]) + states = {} + for i = 1, len(outputs) - 1: + table.insert(states, outputs[i]) + + + return out, states + + +#[[Compute all forward steps. + + Parameters. 
+ + * `batch` - `Batch` object + * `encoderStates` - + * `context` - + * `func` - Calls `func(out, t)` each timestep. +#]] + +def Decoder.forwardAndApply(batch, encoderStates, context, func): + # TODO. Make this a private method. + + if self.statesProto == None: + self.statesProto = onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, + self.stateProto, + { batch.size, self.args.rnnSize }) + + + states = onmt.utils.Tensor.copyTensorTable(self.statesProto, encoderStates) + + prevOut + + for t = 1, batch.targetLength: + prevOut, states = self.forwardOne(batch:getTargetInput(t), states, context, prevOut, t) + func(prevOut, t) + + + +#[[Compute all forward steps. + + Parameters. + + * `batch` - a `Batch` object. + * `encoderStates` - a batch of initial decoder states (optional) [0] + * `context` - the context to apply attention to. + + Returns. Table of top hidden state for each timestep. +#]] +def Decoder.forward(batch, encoderStates, context): + encoderStates = encoderStates + or onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, + onmt.utils.Cuda.convert(torch.Tensor()), + { batch.size, self.args.rnnSize }) + if self.train: + self.inputs = {} + + + outputs = {} + + self.forwardAndApply(batch, encoderStates, context, def (out): + table.insert(outputs, out) + ) + + return outputs + + +#[[ Compute the backward update. + +Parameters. + + * `batch` - a `Batch` object + * `outputs` - expected outputs + * `criterion` - a single target criterion object + + Note. This code runs both the standard backward and criterion forward/backward. + It returns both the gradInputs and the loss. + # ]] +def Decoder.backward(batch, outputs, criterion): + if self.gradOutputsProto == None: + self.gradOutputsProto = onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers + 1, + self.gradOutputProto, + { batch.size, self.args.rnnSize }) + + + gradStatesInput = onmt.utils.Tensor.reuseTensorTable(self.gradOutputsProto, + { batch.size, self.args.rnnSize }) + gradContextInput = onmt.utils.Tensor.reuseTensor(self.gradContextProto, + { batch.size, batch.sourceLength, self.args.rnnSize }) + + loss = 0 + + for t = batch.targetLength, 1, -1: + # Compute decoder output gradients. + # Note. This would typically be in the forward pass. + pred = self.generator.forward(outputs[t]) + output = batch.getTargetOutput(t) + + loss = loss + criterion.forward(pred, output) + + # Compute the criterion gradient. + genGradOut = criterion.backward(pred, output) + for j = 1, len(genGradOut): + genGradOut[j].div(batch.totalSize) + + + # Compute the final layer gradient. + decGradOut = self.generator.backward(outputs[t], genGradOut) + gradStatesInput[len(gradStatesInput].add(decGradOut)) + + # Compute the standarad backward. + gradInput = self.net(t):backward(self.inputs[t], gradStatesInput) + + # Accumulate encoder output gradients. + gradContextInput.add(gradInput[self.args.inputIndex.context]) + gradStatesInput[len(gradStatesInput].zero()) + + # Accumulate previous output gradients with input feeding gradients. + if self.args.inputFeed and t > 1: + gradStatesInput[len(gradStatesInput].add(gradInput[self.args.inputIndex.inputFeed])) + + + # Prepare next decoder output gradients. + for i = 1, len(self.statesProto): + gradStatesInput[i].copy(gradInput[i]) + + + + return gradStatesInput, gradContextInput, loss + + +#[[ Compute the loss on a batch. + +Parameters. + + * `batch` - a `Batch` to score. + * `encoderStates` - initialization of decoder. + * `context` - the attention context. 
+ * `criterion` - a pointwise criterion. + +#]] +def Decoder.computeLoss(batch, encoderStates, context, criterion): + encoderStates = encoderStates + or onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, + onmt.utils.Cuda.convert(torch.Tensor()), + { batch.size, self.args.rnnSize }) + + loss = 0 + self.forwardAndApply(batch, encoderStates, context, def (out, t): + pred = self.generator.forward(out) + output = batch.getTargetOutput(t) + loss = loss + criterion.forward(pred, output) + ) + + return loss + + + +#[[ Compute the score of a batch. + +Parameters. + + * `batch` - a `Batch` to score. + * `encoderStates` - initialization of decoder. + * `context` - the attention context. + +#]] +def Decoder.computeScore(batch, encoderStates, context): + encoderStates = encoderStates + or onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, + onmt.utils.Cuda.convert(torch.Tensor()), + { batch.size, self.args.rnnSize }) + + score = {} + + self.forwardAndApply(batch, encoderStates, context, def (out, t): + pred = self.generator.forward(out) + for b = 1, batch.size: + if t <= batch.targetSize[b]: + score[b] = (score[b] or 0) + pred[1][b][batch.targetOutput[t][b]] + + + ) + + return score - return score -end diff --git a/OpenNMT/onmt/modules/Encoder.py b/OpenNMT/onmt/modules/Encoder.py index 85d8e32db5..0ade2de292 100644 --- a/OpenNMT/onmt/modules/Encoder.py +++ b/OpenNMT/onmt/modules/Encoder.py @@ -1,232 +1,232 @@ ---[[ Encoder is a unidirectional Sequencer used for the source language. +#[[ Encoder is a unidirectional Sequencer used for the source language. - h_1 => h_2 => h_3 => ... => h_n - | | | | - . . . . - | | | | - h_1 => h_2 => h_3 => ... => h_n - | | | | - | | | | - x_1 x_2 x_3 x_n + h_1 => h_2 => h_3 => ... => h_n + | | | | + . . . . + | | | | + h_1 => h_2 => h_3 => ... => h_n + | | | | + | | | | + x_1 x_2 x_3 x_n Inherits from [onmt.Sequencer](onmt+modules+Sequencer). ---]] -local Encoder, parent = torch.class('onmt.Encoder', 'onmt.Sequencer') +#]] +Encoder, parent = torch.class('onmt.Encoder', 'onmt.Sequencer') ---[[ Construct an encoder layer. +#[[ Construct an encoder layer. -Parameters: +Parameters. - * `inputNetwork` - input module. - * `rnn` - recurrent module. + * `inputNetwork` - input module. + * `rnn` - recurrent module. ]] -function Encoder:__init(inputNetwork, rnn) - self.rnn = rnn - self.inputNet = inputNetwork +def Encoder.__init(inputNetwork, rnn): + self.rnn = rnn + self.inputNet = inputNetwork - self.args = {} - self.args.rnnSize = self.rnn.outputSize - self.args.numEffectiveLayers = self.rnn.numEffectiveLayers + self.args = {} + self.args.rnnSize = self.rnn.outputSize + self.args.numEffectiveLayers = self.rnn.numEffectiveLayers - parent.__init(self, self:_buildModel()) + parent.__init(self, self._buildModel()) - self:resetPreallocation() -end + self.resetPreallocation() ---[[ Return a new Encoder using the serialized data `pretrained`. ]] -function Encoder.load(pretrained) - local self = torch.factory('onmt.Encoder')() - self.args = pretrained.args - parent.__init(self, pretrained.modules[1]) +#" Return a new Encoder using the serialized data `pretrained`. " +def Encoder.load(pretrained): + self = torch.factory('onmt.Encoder')() - self:resetPreallocation() + self.args = pretrained.args + parent.__init(self, pretrained.modules[1]) - return self -end + self.resetPreallocation() ---[[ Return data to serialize. 
]] -function Encoder:serialize() - return { - modules = self.modules, - args = self.args - } -end + return self -function Encoder:resetPreallocation() - -- Prototype for preallocated hidden and cell states. - self.stateProto = torch.Tensor() - -- Prototype for preallocated output gradients. - self.gradOutputProto = torch.Tensor() +#" Return data to serialize. " +def Encoder.serialize(): + return { + modules = self.modules, + args = self.args + } + + +def Encoder.resetPreallocation(): + # Prototype for preallocated hidden and cell states. + self.stateProto = torch.Tensor() + + # Prototype for preallocated output gradients. + self.gradOutputProto = torch.Tensor() + + # Prototype for preallocated context vector. + self.contextProto = torch.Tensor() + + +def Encoder.maskPadding(): + self.maskPad = True + + +#[[ Build one time-step of an encoder + +Returns. An nn-graph mapping + + $${(c^1_{t-1}, h^1_{t-1}, .., c^L_{t-1}, h^L_{t-1}, x_t) => + (c^1_{t}, h^1_{t}, .., c^L_{t}, h^L_{t})}$$ + + Where $$c^l$$ and $$h^l$$ are the hidden and cell states at each layer, + $$x_t$$ is a sparse word to lookup. +#]] +def Encoder._buildModel(): + inputs = {} + states = {} + + # Inputs are previous layers first. + for _ = 1, self.args.numEffectiveLayers: + h0 = nn.Identity()() # batchSize x rnnSize + table.insert(inputs, h0) + table.insert(states, h0) + + + # Input word. + x = nn.Identity()() # batchSize + table.insert(inputs, x) + + # Compute input network. + input = self.inputNet(x) + table.insert(states, input) + + # Forward states and input into the RNN. + outputs = self.rnn(states) + return nn.gModule(inputs, { outputs }) + + +#[[Compute the context representation of an input. + +Parameters. + + * `batch` - as defined in batch.lua. + +Returns. + + 1. - final hidden states + 2. - context matrix H +#]] +def Encoder.forward(batch): + + # TODO. Change `batch` to `input`. + + finalStates + outputSize = self.args.rnnSize + + if self.statesProto == None: + self.statesProto = onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, + self.stateProto, + { batch.size, outputSize }) + + + # Make initial states h_0. + states = onmt.utils.Tensor.reuseTensorTable(self.statesProto, { batch.size, outputSize }) + + # Preallocated output matrix. + context = onmt.utils.Tensor.reuseTensor(self.contextProto, + { batch.size, batch.sourceLength, outputSize }) + + if self.maskPad and not batch.sourceInputPadLeft: + finalStates = onmt.utils.Tensor.recursiveClone(states) + + if self.train: + self.inputs = {} + + + # Act like nn.Sequential and call each clone in a feed-forward + # fashion. + for t = 1, batch.sourceLength: + + # Construct "inputs". Prev states come first then source. + inputs = {} + onmt.utils.Table.app(inputs, states) + table.insert(inputs, batch.getSourceInput(t)) + + if self.train: + # Remember inputs for the backward pass. + self.inputs[t] = inputs + + states = self.net(t):forward(inputs) + + # Special case padding. + if self.maskPad: + for b = 1, batch.size: + if batch.sourceInputPadLeft and t <= batch.sourceLength - batch.sourceSize[b]: + for j = 1, len(states): + states[j][b].zero() + + elif not batch.sourceInputPadLeft and t == batch.sourceSize[b]: + for j = 1, len(states): + finalStates[j][b].copy(states[j][b]) + + + + + + # Copy output (h^L_t = states[len(states])) to context. + context[{{}, t}].copy(states[len(states])) + + + if finalStates == None: + finalStates = states + + + return finalStates, context + + +#[[ Backward pass (only called during training) + + Parameters. 
+ + * `batch` - must be same as for forward + * `gradStatesOutput` gradient of loss wrt last state + * `gradContextOutput` - gradient of loss wrt full context. + + Returns. `gradInputs` of input network. +#]] +def Encoder.backward(batch, gradStatesOutput, gradContextOutput): + # TODO. change this to (input, gradOutput) as in nngraph. + outputSize = self.args.rnnSize + if self.gradOutputsProto == None: + self.gradOutputsProto = onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, + self.gradOutputProto, + { batch.size, outputSize }) + + + gradStatesInput = onmt.utils.Tensor.copyTensorTable(self.gradOutputsProto, gradStatesOutput) + gradInputs = {} + + for t = batch.sourceLength, 1, -1: + # Add context gradients to last hidden states gradients. + gradStatesInput[len(gradStatesInput].add(gradContextOutput[{{}), t}]) + + gradInput = self.net(t):backward(self.inputs[t], gradStatesInput) + + # Prepare next encoder output gradients. + for i = 1, len(gradStatesInput): + gradStatesInput[i].copy(gradInput[i]) + + + # Gather gradients of all user inputs. + gradInputs[t] = {} + for i = len(gradStatesInput) + 1, #gradInput: + table.insert(gradInputs[t], gradInput[i]) + + + if len(gradInputs[t]) == 1: + gradInputs[t] = gradInputs[t][1] + + + # TODO. make these names clearer. + # Useful if input came from another network. + return gradInputs - -- Prototype for preallocated context vector. - self.contextProto = torch.Tensor() -end - -function Encoder:maskPadding() - self.maskPad = true -end - ---[[ Build one time-step of an encoder - -Returns: An nn-graph mapping - - $${(c^1_{t-1}, h^1_{t-1}, .., c^L_{t-1}, h^L_{t-1}, x_t) => - (c^1_{t}, h^1_{t}, .., c^L_{t}, h^L_{t})}$$ - - Where $$c^l$$ and $$h^l$$ are the hidden and cell states at each layer, - $$x_t$$ is a sparse word to lookup. ---]] -function Encoder:_buildModel() - local inputs = {} - local states = {} - - -- Inputs are previous layers first. - for _ = 1, self.args.numEffectiveLayers do - local h0 = nn.Identity()() -- batchSize x rnnSize - table.insert(inputs, h0) - table.insert(states, h0) - end - -- Input word. - local x = nn.Identity()() -- batchSize - table.insert(inputs, x) - - -- Compute input network. - local input = self.inputNet(x) - table.insert(states, input) - - -- Forward states and input into the RNN. - local outputs = self.rnn(states) - return nn.gModule(inputs, { outputs }) -end - ---[[Compute the context representation of an input. - -Parameters: - - * `batch` - as defined in batch.lua. - -Returns: - - 1. - final hidden states - 2. - context matrix H ---]] -function Encoder:forward(batch) - - -- TODO: Change `batch` to `input`. - - local finalStates - local outputSize = self.args.rnnSize - - if self.statesProto == nil then - self.statesProto = onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, - self.stateProto, - { batch.size, outputSize }) - end - - -- Make initial states h_0. - local states = onmt.utils.Tensor.reuseTensorTable(self.statesProto, { batch.size, outputSize }) - - -- Preallocated output matrix. - local context = onmt.utils.Tensor.reuseTensor(self.contextProto, - { batch.size, batch.sourceLength, outputSize }) - - if self.maskPad and not batch.sourceInputPadLeft then - finalStates = onmt.utils.Tensor.recursiveClone(states) - end - if self.train then - self.inputs = {} - end - - -- Act like nn.Sequential and call each clone in a feed-forward - -- fashion. - for t = 1, batch.sourceLength do - - -- Construct "inputs". Prev states come first then source. 
- local inputs = {} - onmt.utils.Table.append(inputs, states) - table.insert(inputs, batch:getSourceInput(t)) - - if self.train then - -- Remember inputs for the backward pass. - self.inputs[t] = inputs - end - states = self:net(t):forward(inputs) - - -- Special case padding. - if self.maskPad then - for b = 1, batch.size do - if batch.sourceInputPadLeft and t <= batch.sourceLength - batch.sourceSize[b] then - for j = 1, #states do - states[j][b]:zero() - end - elseif not batch.sourceInputPadLeft and t == batch.sourceSize[b] then - for j = 1, #states do - finalStates[j][b]:copy(states[j][b]) - end - end - end - end - - -- Copy output (h^L_t = states[#states]) to context. - context[{{}, t}]:copy(states[#states]) - end - - if finalStates == nil then - finalStates = states - end - - return finalStates, context -end - ---[[ Backward pass (only called during training) - - Parameters: - - * `batch` - must be same as for forward - * `gradStatesOutput` gradient of loss wrt last state - * `gradContextOutput` - gradient of loss wrt full context. - - Returns: `gradInputs` of input network. ---]] -function Encoder:backward(batch, gradStatesOutput, gradContextOutput) - -- TODO: change this to (input, gradOutput) as in nngraph. - local outputSize = self.args.rnnSize - if self.gradOutputsProto == nil then - self.gradOutputsProto = onmt.utils.Tensor.initTensorTable(self.args.numEffectiveLayers, - self.gradOutputProto, - { batch.size, outputSize }) - end - - local gradStatesInput = onmt.utils.Tensor.copyTensorTable(self.gradOutputsProto, gradStatesOutput) - local gradInputs = {} - - for t = batch.sourceLength, 1, -1 do - -- Add context gradients to last hidden states gradients. - gradStatesInput[#gradStatesInput]:add(gradContextOutput[{{}, t}]) - - local gradInput = self:net(t):backward(self.inputs[t], gradStatesInput) - - -- Prepare next encoder output gradients. - for i = 1, #gradStatesInput do - gradStatesInput[i]:copy(gradInput[i]) - end - - -- Gather gradients of all user inputs. - gradInputs[t] = {} - for i = #gradStatesInput + 1, #gradInput do - table.insert(gradInputs[t], gradInput[i]) - end - - if #gradInputs[t] == 1 then - gradInputs[t] = gradInputs[t][1] - end - end - -- TODO: make these names clearer. - -- Useful if input came from another network. - return gradInputs - -end diff --git a/OpenNMT/onmt/modules/FeaturesEmbedding.py b/OpenNMT/onmt/modules/FeaturesEmbedding.py index bf3922f524..5e3c4fa55a 100644 --- a/OpenNMT/onmt/modules/FeaturesEmbedding.py +++ b/OpenNMT/onmt/modules/FeaturesEmbedding.py @@ -1,63 +1,29 @@ ---[[ - A nngraph unit that maps features ids to embeddings. When using multiple - features this can be the concatenation or the sum of each individual embedding. 
-]] -local FeaturesEmbedding, parent = torch.class('onmt.FeaturesEmbedding', 'nn.Container') - -function FeaturesEmbedding:__init(dicts, dimExponent, dim, merge) - parent.__init(self) - - self.net = self:_buildModel(dicts, dimExponent, dim, merge) - self:add(self.net) -end - -function FeaturesEmbedding:_buildModel(dicts, dimExponent, dim, merge) - local inputs = {} - local output - - if merge == 'sum' then - self.outputSize = dim - else - self.outputSize = 0 - end - - for i = 1, #dicts do - local feat = nn.Identity()() -- batchSize - table.insert(inputs, feat) - - local vocabSize = dicts[i]:size() - local embSize - - if merge == 'sum' then - embSize = self.outputSize - else - embSize = math.floor(vocabSize ^ dimExponent) - self.outputSize = self.outputSize + embSize - end - - local emb = nn.LookupTable(vocabSize, embSize)(feat) - - if not output then - output = emb - elseif merge == 'sum' then - output = nn.CAddTable()({output, emb}) - else - output = nn.JoinTable(2)({output, emb}) - end - end - - return nn.gModule(inputs, {output}) -end - -function FeaturesEmbedding:updateOutput(input) - self.output = self.net:updateOutput(input) - return self.output -end - -function FeaturesEmbedding:updateGradInput(input, gradOutput) - return self.net:updateGradInput(input, gradOutput) -end - -function FeaturesEmbedding:accGradParameters(input, gradOutput, scale) - self.net:accGradParameters(input, gradOutput, scale) -end +class FeaturesEmbeddding(nn.Container): + + def __init__(self, dicts, dimExponent, dim, merge): + super(FeaturesEmbedding, self).__init__(): + + self.merge = merge + self.luts = [] + self.outputSize = dim if merge == 'sum' else 0 + for i, dict in dicts.enumerate(): + vocabSize = dict.size() + if merge == 'sum': + embSize = dim + else: + embSize = math.floor(math.pow(vocabSize, dimExponent)) + self.outputSize += embSize + + lut = nn.LookupTable(vocabSize, embSize) + self.luts += [] + self.add_module('lut_%d' % i, lut) + + def forward(self, input): + embs = [] + for i in range(input.size(1)): + embs += [self.luts[i](input.select(1, i))] + + if self.merge == 'sum': + return sum(embs) + else: + return torch.cat(embs, 1) diff --git a/OpenNMT/onmt/modules/FeaturesGenerator.py b/OpenNMT/onmt/modules/FeaturesGenerator.py index c76812ed43..387658180d 100644 --- a/OpenNMT/onmt/modules/FeaturesGenerator.py +++ b/OpenNMT/onmt/modules/FeaturesGenerator.py @@ -1,53 +1,38 @@ ---[[ Feature decoder generator. Given RNN state, produce categorical distribution over +#[[ Feature decoder generator. Given RNN state, produce categorical distribution over tokens and features. - Implements $$[softmax(W^1 h + b^1), softmax(W^2 h + b^2), ..., softmax(W^n h + b^n)] $$. ---]] + Implements $$[softmax(W^1 h + b^1), softmax(W^2 h + b^2), ..., softmax(W^n h + b^n)] $$. +#]] -local FeaturesGenerator, parent = torch.class('onmt.FeaturesGenerator', 'nn.Container') +FeaturesGenerator, parent = torch.class('onmt.FeaturesGenerator', 'nn.Container') ---[[ -Parameters: +#[[ +Parameters. - * `rnnSize` - Input rnn size. - * `outputSize` - Output size (number of tokens). - * `features` - table of feature sizes. ---]] -function FeaturesGenerator:__init(rnnSize, outputSize, features) - parent.__init(self) - self.net = self:_buildGenerator(rnnSize, outputSize, features) - self:add(self.net) -end + * `rnnSize` - Input rnn size. + * `outputSize` - Output size (number of tokens). + * `features` - table of feature sizes. 
+#]] +def FeaturesGenerator.__init(rnnSize, outputSize, features): + parent.__init(self) + self.net = self._buildGenerator(rnnSize, outputSize, features) + self.add(self.net) -function FeaturesGenerator:_buildGenerator(rnnSize, outputSize, features) - local generator = nn.ConcatTable() - -- Add default generator. - generator:add(nn.Sequential() - :add(onmt.Generator(rnnSize, outputSize)) - :add(nn.SelectTable(1))) +def FeaturesGenerator._buildGenerator(rnnSize, outputSize, features): + generator = nn.ConcatTable() - -- Add a generator for each target feature. - for i = 1, #features do - generator:add(nn.Sequential() - :add(nn.Linear(rnnSize, features[i]:size())) - :add(nn.LogSoftMax())) - end + # Add default generator. + generator.add(nn.Sequential() + .add(nn.Linear(rnnSize, outputSize)) + .add(nn.LogSoftMax()) + .add(nn.SelectTable(1))) - return generator -end + # Add a generator for each target feature. + for i = 1, len(features): + generator.add(nn.Sequential() + .add(nn.Linear(rnnSize, features[i]:size())) + .add(nn.LogSoftMax())) -function FeaturesGenerator:updateOutput(input) - self.output = self.net:updateOutput(input) - return self.output -end - -function FeaturesGenerator:updateGradInput(input, gradOutput) - self.gradInput = self.net:updateGradInput(input, gradOutput) - return self.gradInput -end - -function FeaturesGenerator:accGradParameters(input, gradOutput, scale) - self.net:accGradParameters(input, gradOutput, scale) -end + return generator diff --git a/OpenNMT/onmt/modules/GlobalAttention.py b/OpenNMT/onmt/modules/GlobalAttention.py index c80b41535d..74442082fc 100644 --- a/OpenNMT/onmt/modules/GlobalAttention.py +++ b/OpenNMT/onmt/modules/GlobalAttention.py @@ -1,74 +1,50 @@ -require('nngraph') - ---[[ Global attention takes a matrix and a query vector. It +""" +Global attention takes a matrix and a query vector. It then computes a parameterized convex combination of the matrix based on the input query. - H_1 H_2 H_3 ... H_n - q q q q - | | | | - \ | | / - ..... - \ | / - a - -Constructs a unit mapping: - $$(H_1 .. H_n, q) => (a)$$ - Where H is of `batch x n x dim` and q is of `batch x dim`. - - The full function is $$\tanh(W_2 [(softmax((W_1 q + b_1) H) H), q] + b_2)$$. - ---]] -local GlobalAttention, parent = torch.class('onmt.GlobalAttention', 'nn.Container') - ---[[A nn-style module computing attention. + H_1 H_2 H_3 ... H_n + q q q q + | | | | + \ | | / + ..... + \ | / + a - Parameters: +Constructs a unit mapping. + $$(H_1 + H_n, q) => (a)$$ + Where H is of `batch x n x dim` and q is of `batch x dim`. - * `dim` - dimension of the context vectors. ---]] -function GlobalAttention:__init(dim) - parent.__init(self) - self.net = self:_buildModel(dim) - self:add(self.net) -end + The full def is $$\tanh(W_2 [(softmax((W_1 q + b_1) H) H), q] + b_2)$$.: -function GlobalAttention:_buildModel(dim) - local inputs = {} - table.insert(inputs, nn.Identity()()) - table.insert(inputs, nn.Identity()()) +""" - local targetT = nn.Linear(dim, dim, false)(inputs[1]) -- batchL x dim - local context = inputs[2] -- batchL x sourceTimesteps x dim +import torch +import torch.nn as nn - -- Get attention. 
-  local attn = nn.MM()({context, nn.Replicate(1,3)(targetT)}) -- batchL x sourceL x 1
-  attn = nn.Sum(3)(attn)
-  local softmaxAttn = nn.SoftMax()
-  softmaxAttn.name = 'softmaxAttn'
-  attn = softmaxAttn(attn)
-  attn = nn.Replicate(1,2)(attn) -- batchL x 1 x sourceL
+class GlobalAttention(nn.Container):
+    def __init__(self, dim):
+        super(GlobalAttention, self).__init__(
+            linear_in=nn.Linear(dim, dim, bias=False),
+            sm=nn.Softmax(),
+            linear_out=nn.Linear(dim*2, dim, bias=False),
+            tanh=nn.Tanh(),
+        )
-  -- Apply attention to context.
-  local contextCombined = nn.MM()({attn, context}) -- batchL x 1 x dim
-  contextCombined = nn.Sum(2)(contextCombined) -- batchL x dim
-  contextCombined = nn.JoinTable(2)({contextCombined, inputs[1]}) -- batchL x dim*2
-  local contextOutput = nn.Tanh()(nn.Linear(dim*2, dim, false)(contextCombined))
+    def forward(self, input, context):
+        """
+        input: batch x dim
+        context: batch x sourceL x dim
+        """
+        targetT = self.linear_in(input).unsqueeze(2)  # batch x dim x 1
-  return nn.gModule(inputs, {contextOutput})
-end
+        # Get attention
+        attn = torch.bmm(context, targetT).squeeze(2)  # batch x sourceL
-function GlobalAttention:updateOutput(input)
-  self.output = self.net:updateOutput(input)
-  return self.output
-end
+        softmaxAttn = self.sm(attn)
-function GlobalAttention:updateGradInput(input, gradOutput)
-  self.gradInput = self.net:updateGradInput(input, gradOutput)
-  return self.gradInput
-end
+        softmaxAttn = softmaxAttn.view(attn.size(0), 1, attn.size(1))  # batch x 1 x sourceL
+        contextCombined = torch.bmm(softmaxAttn, context).squeeze(1)  # batch x dim
+        contextCombined = torch.cat([contextCombined, input], 1)  # batch x dim*2
-function GlobalAttention:accGradParameters(input, gradOutput, scale)
-  return self.net:accGradParameters(input, gradOutput, scale)
-end
+        contextOutput = self.tanh(self.linear_out(contextCombined))
+        return contextOutput
diff --git a/OpenNMT/onmt/train/Optim.py b/OpenNMT/onmt/train/Optim.py
index df1e1a2b1b..69bc1b16c9 100644
--- a/OpenNMT/onmt/train/Optim.py
+++ b/OpenNMT/onmt/train/Optim.py
@@ -18,7 +18,7 @@ def Optim.__init__(self, method, lr, lr_decay=1, start_decay_at=None):
         else:
             raise RuntimeError("Invalid optim method: " + self.method)
-    def Optim.prepareGrad(self, params, max_grad_norm):
+    def Optim.step(self, params, max_grad_norm):
         # Compute gradients norm.
grad_norm = 0 for param in params: diff --git a/OpenNMT/onmt/utils/Parallel.py b/OpenNMT/onmt/utils/Parallel.py index 93eaf93bf2..15209f3045 100644 --- a/OpenNMT/onmt/utils/Parallel.py +++ b/OpenNMT/onmt/utils/Parallel.py @@ -19,7 +19,7 @@ def __init__(self, nthreads): def launch(self, label, closure, args=None, endcallback=None): if label is not None: - print("START",label) + print("START", label) for j in range(self.count): if self.nthreads == 0: diff --git a/OpenNMT/train.py b/OpenNMT/train.py index a0e1c25a18..78ba7472ef 100644 --- a/OpenNMT/train.py +++ b/OpenNMT/train.py @@ -4,6 +4,8 @@ import argparse import os import torch +import torch.nn as nn +from torch.autograd import Variable parser = argparse.ArgumentParser(description='train.lua') @@ -33,7 +35,7 @@ parser.add_argument('-input_feed', type=int, default=1, help="Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.") parser.add_argument('-residual', action="store_true", help="Add residual connections between RNN layers.") parser.add_argument('-brnn', action="store_true", help="Use a bidirectional encoder") -parser.add_argument('-brnn_merge', default='sum', help="Merge action for the bidirectional hidden states: concat or sum") +parser.add_argument('-brnn_merge', default='concat', help="Merge action for the bidirectional hidden states: concat or sum") ## ## **Optimization options** @@ -61,8 +63,8 @@ parser.add_argument('-pre_word_vecs_dec', help="""If a valid path is specified, then this will load pretrained word embeddings on the decoder side. See README for specific formatting instructions.""") -parser.add_argument('-fix_word_vecs_enc', action="store_true", help="Fix word embeddings on the encoder side") -parser.add_argument('-fix_word_vecs_dec', action="store_true", help="Fix word embeddings on the decoder side") +# parser.add_argument('-fix_word_vecs_enc', action="store_true", help="Fix word embeddings on the encoder side") +# parser.add_argument('-fix_word_vecs_dec', action="store_true", help="Fix word embeddings on the decoder side") ## ## **Other options** @@ -70,8 +72,8 @@ # GPU parser.add_argument('-gpuid', type=int, default=-1, help="Which gpu to use (1-indexed). < 1 = use CPU") -parser.add_argument('-nparallel', type=int, default=1, help="""When using GPUs, how many batches to execute in parallel. - Note. this will technically change the final batch size to max_batch_size*nparallel.""") +# parser.add_argument('-nparallel', type=int, default=1, help="""When using GPUs, how many batches to execute in parallel. +# Note. 
this will technically change the final batch size to max_batch_size*nparallel.""")
 parser.add_argument('-no_nccl', action="store_true", help="Disable usage of nccl in parallel mode.")
 parser.add_argument('-disable_mem_optimization', action="store_true", help="""Disable sharing internal of internal buffers between clones - which is in general safe, except if you want to look inside clones for visualization purpose for instance.""")
@@ -85,104 +87,54 @@ opt = parser.parse_args()
-# pool = onmt.utils.Parallel.ThreadPool(opt.nparallel)
-def initParams(model, verbose):
-    numParams = 0
-    params, gradParams = {}, {}
+class NMTCriterion(nn.Container):
+    def __init__(self, vocabSize, features):
+        super(NMTCriterion, self).__init__()
+        self.sub = []
+
+        def makeOne(size):
+            weight = torch.ones(size)
+            weight[onmt.Constants.PAD] = 0
+            return nn.NLLLoss(weight)
-    if verbose:
-        print('Initializing parameters...')
+        self.sub += [makeOne(vocabSize)]
+        for feature in features:
+            self.sub += [makeOne(feature.size())]
-    for mod in model.values():
-        p, gp = mod.getParameters()
-
-        if opt.train_from.len() == 0:
-            p.uniform(-opt.param_init, opt.param_init)
-
-        numParams = numParams + p.size(0)
-        params += [p]
-        gradParams += [gp]
-
-    if verbose:
-        print(" * number of parameters. " + numParams)
-
-    return params, gradParams
-
-
-def buildCriterion(vocabSize, features):
-    criterion = nn.ParallelCriterion(False)
-
-    def addNllCriterion(size):
-        # Ignores padding value.
-        w = torch.ones(size)
-        w[onmt.Constants.PAD] = 0
-
-        nll = nn.ClassNLLCriterion(w)
-
-        # Let the training code manage loss normalization.
-        nll.sizeAverage = False
-        criterion.add(nll)
-
-    addNllCriterion(vocabSize)
-
-    for feature in features:
-        addNllCriterion(feature.size())
-
-    return criterion
+    def forward(self, inputs, targets):
+        assert(inputs.size(1) == len(self.sub))
+        loss = Variable(inputs.new(1).zero_())
+        for feat, target, sub in zip(inputs.split(1), targets.split(1), self.sub):
+            loss += sub(feat, target)
+        return loss
 
 
 def eval(model, criterion, data):
     loss = 0
     total = 0
-    model.encoder.evaluate()
-    model.decoder.evaluate()
+    model.evaluate()
 
     for i in range(data.batchCount()):
         batch = onmt.utils.Cuda.convert(data.getBatch(i))
-        encoderStates, context = model.encoder.forward(batch)
-        loss = loss + model.decoder.computeLoss(batch, encoderStates, context, criterion)
+        outputs = model.forward(batch)
+        loss += criterion.forward(outputs, batch.getTargetOutput())
         total = total + batch.targetNonZeros
 
-    model.encoder.training()
-    model.decoder.training()
+    model.training()
 
     return math.exp(loss / total)
 
 
 def trainModel(model, trainData, validData, dataset, info):
-    params, gradParams = {}, {}
-
-    def initParams(idx, args, state):
-        # Only logs information of the first thread.
- verbose = idx == 0 and not opt.json_log - model = state['model'] - - params, gradParams = initParams(model, verbose) - for mod in model.values(): - mod.training() - - # define criterion of each GPU - state['criterion'] = onmt.utils.Cuda.convert(buildCriterion(dataset.dicts.tgt.words.size(), - dataset.dicts.tgt.features)) - # # optimize memory of the first clone - # if not opt.disable_mem_optimization: - # batch = onmt.utils.Cuda.convert(trainData.getBatch(1)) - # batch.totalSize = batch.size - # onmt.utils.Memory.optimize(model, criterion, batch, verbose) - - return idx, state['criterion'], params, gradParams - - def _endcallback(args): - idx, thecriterion, theparams, thegradParams = args - if idx == 0: - criterion = thecriterion - params[idx] = theparams - gradParams[idx] = thegradParams + for mod in model.values(): + mod.training() + for p in mod.parameters(): + p.uniform_(-opt.param_init, opt.param_init) - pool.launch(None, initParams, endcallback=_endcallback) + # define criterion of each GPU + criterion = onmt.utils.Cuda.convert(buildCriterion(dataset.dicts.tgt.words.size(), + dataset.dicts.tgt.features)) optim = onmt.train.Optim( opt.optim, opt.learning_rate, @@ -190,81 +142,48 @@ def _endcallback(args): start_decay_at=opt.start_decay_at ) - checkpoint = onmt.train.Checkpoint.new(opt, model, optim, dataset) + # checkpoint = onmt.train.Checkpoint.new(opt, model, optim, dataset) def trainEpoch(epoch, lastValidPpl): startI = opt.start_iteration - numIterations = math.ceil(trainData.batchCount() / pool.count) - if startI > 1 and info != None: - epochState = onmt.train.EpochState.new(epoch, numIterations, optim.getLearningRate(), lastValidPpl, info.epochStatus) - batchOrder = info.batchOrder - else: - epochState = onmt.train.EpochState.new(epoch, numIterations, optim.getLearningRate(), lastValidPpl) - # shuffle mini batch order - batchOrder = torch.randperm(trainData.batchCount()) + # shuffle mini batch order + batchOrder = torch.randperm(trainData.batchCount()) opt.start_iteration = 1 ii = 1 - def trainOne(idx, args, state): - - batch = args[idx] - if batch is None: - return idx, 0 - - # send batch data to GPU - onmt.utils.Cuda.convert(batch) - batch.totalSize = totalSize - - optim.zeroGrad(gradParams) - - encStates, context = model['encoder'].forward(batch) - decOutputs = model['decoder'].forward(batch, encStates, context) - - encGradStatesOut, gradContext, loss = model['decoder'].backward(batch, decOutputs, criterion) - model['encoder'].backward(batch, encGradStatesOut, gradContext) - return idx, loss - - for i in range(startI, trainData.batchCount(), onmt.utils.Parallel.count): - batches = {} + for i in range(startI, trainData.batchCount()): totalSize = 0 - for j in range(math.min(onmt.utils.Parallel.count, trainData.batchCount()-i+1)): - batchIdx = batchOrder[i+j-1] - if epoch <= opt.curriculum: - batchIdx = i+j-1 + batchIdx = batchOrder[i] + if epoch <= opt.curriculum: + batchIdx = i - table.insert(batches, trainData.getBatch(batchIdx)) - totalSize = totalSize + batches[-1].size + batch = trainData.getBatch(batchIdx) + totalSize += batch.size - losses = {} + batch.totalSize = totalSize - def _endcallback(idx, loss): - losses[idx] = loss + model.zero_grad() - pool.launch(None, trainOne, args=batches, endcallback=_endcallback) - # accumulate the gradients from the different parallel threads - XXX.accGradParams(gradParams, batches) + outputs = model.forward(batch) + loss = criterion.forward(outputs, batch.getTargetOutput()) + loss.backward() # update the parameters - 
optim.prepareGrad(gradParams[1], opt.max_grad_norm) - optim.updateParams(params[1], gradParams[1]) - - # sync the paramaters with the different parallel threads - XXXsyncParams(params) - - epochState.update(batches, losses) + optim.step(model.params(), opt.max_grad_norm) if ii % opt.report_every == 0: - epochState.log(ii, opt.json_log) + print("Done %d batches" % ii) + pass # FIXME - if opt.save_every > 0 and ii % opt.save_every == 0: - checkpoint.saveIteration(ii, epochState, batchOrder, not opt.json_log) + # if opt.save_every > 0 and ii % opt.save_every == 0: + # checkpoint.saveIteration(ii, epochState, batchOrder, not opt.json_log) - ii = ii + 1 + ii += 1 return epochState validPpl = 0 @@ -283,25 +202,7 @@ def _endcallback(idx, loss): if opt.optim == 'sgd': optim.updateLearningRate(validPpl, epoch) - checkpoint.saveEpoch(validPpl, epochState, not opt.json_log) - - -def buildModel(idx, args, state): - checkpoint = args - model = state['model'] = {} - - if checkpoint.models: - model['encoder'] = onmt.Models.loadEncoder(checkpoint.models.encoder, idx > 1) - model['decoder'] = onmt.Models.loadDecoder(checkpoint.models.decoder, idx > 1) - else: - verbose = idx == 1 and not opt.json_log - model['encoder'] = onmt.Models.buildEncoder(opt, dataset.dicts.src) - model['decoder'] = onmt.Models.buildDecoder(opt, dataset.dicts.tgt, verbose) - - for mod in model.values(): - onmt.utils.Cuda.convert(mod) - - return idx, model + # checkpoint.saveEpoch(validPpl, epochState, not opt.json_log) def main(): @@ -316,7 +217,6 @@ def main(): if not opt.json_log: print('Loading checkpoint \'' + opt.train_from + '\'...') - checkpoint = torch.load(opt.train_from) opt.layers = checkpoint.options.layers @@ -326,7 +226,7 @@ def main(): opt.input_feed = checkpoint.options.input_feed # Resume training from checkpoint - if opt.train_from is not None and opt.cont: + if opt.cont: opt.optim = checkpoint.options.optim opt.learning_rate_decay = checkpoint.options.learning_rate_decay opt.start_decay_at = checkpoint.options.start_decay_at @@ -362,36 +262,38 @@ def main(): (trainData.maxSourceLength, trainData.maxTargetLength)) print(' * number of training sentences. %d' % len(trainData.src)) print(' * maximum batch size. 
%d' % opt.max_batch_size * pool.count) - else: - metadata = dict( - options=opt, - vocabSize=dict( - source=dataset.dicts.src.words.size(), - target=dataset.dicts.tgt.words.size() - ), - additionalFeatures=dict( - source=len(dataset.dicts.src.features), - target=len(dataset.dicts.tgt.features) - ), - sequenceLength=dict( - source=trainData.maxSourceLength, - target=trainData.maxTargetLength - ), - trainingSentences = len(trainData.src) - ) - - onmt.utils.Log.logJson(metadata) - + # else: + # metadata = dict( + # options=opt, + # vocabSize=dict( + # source=dataset.dicts.src.words.size(), + # target=dataset.dicts.tgt.words.size() + # ), + # additionalFeatures=dict( + # source=len(dataset.dicts.src.features), + # target=len(dataset.dicts.tgt.features) + # ), + # sequenceLength=dict( + # source=trainData.maxSourceLength, + # target=trainData.maxTargetLength + # ), + # trainingSentences=len(trainData.src) + # ) + # + # onmt.utils.Log.logJson(metadata) if not opt.json_log: print('Building model...') - def _endcallback(idx, themodel): - if idx == 0: - model = themodel + model = {} + if checkpoint.models: + encoder = onmt.Models.loadEncoder(checkpoint.models.encoder, idx > 1) + decoder = onmt.Models.loadDecoder(checkpoint.models.decoder, idx > 1) + else: + encoder = onmt.Models.buildEncoder(opt, dataset.dicts.src) + decoder = onmt.Models.buildDecoder(opt, dataset.dicts.tgt, not opt.json_log) - onmt.utils.Parallel.launch(None, buildModel, args=checkpoint, - endcallback=_endcallback) + model = nn.Sequential(encoder, decoder) trainModel(model, trainData, validData, dataset, checkpoint.info)
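
The `NMTCriterion` introduced in train.py builds one padding-masked NLL criterion for the target words plus one per target feature and sums them, mirroring the old `buildCriterion`. The following is an illustrative sketch only, not code from this patch: it restates that idea as a self-contained module written against the current PyTorch API (`nn.Module`, `reduction='sum'`) rather than the `nn.Container`/`Variable` API used in the diff, and it assumes a padding index of 0 (standing in for `onmt.Constants.PAD`) and one `(batch, vocab)` tensor of log-probabilities per output stream.

# Illustrative sketch (assumptions noted above); not part of this patch.
import torch
import torch.nn as nn

PAD = 0  # assumed padding index, standing in for onmt.Constants.PAD


class MultiOutputNLL(nn.Module):
    """One NLLLoss per output (words plus each target feature); the losses are
    summed and the padding class gets zero weight so padded steps do not count."""

    def __init__(self, vocab_sizes):
        super(MultiOutputNLL, self).__init__()
        self.losses = nn.ModuleList()
        for size in vocab_sizes:
            weight = torch.ones(size)
            weight[PAD] = 0  # ignore padding tokens
            self.losses.append(nn.NLLLoss(weight=weight, reduction='sum'))

    def forward(self, log_probs, targets):
        # log_probs: list of (batch, vocab_i) log-probability tensors
        # targets:   list of (batch,) gold index tensors
        total = 0
        for crit, pred, gold in zip(self.losses, log_probs, targets):
            total = total + crit(pred, gold)
        return total


if __name__ == '__main__':
    crit = MultiOutputNLL([10, 4])  # target words plus one feature
    preds = [torch.randn(3, 10).log_softmax(-1), torch.randn(3, 4).log_softmax(-1)]
    golds = [torch.tensor([1, 0, 2]), torch.tensor([3, 1, 0])]
    print(crit(preds, golds))

Summing unreduced per-token losses and letting the training loop divide by `batch.totalSize` matches the normalization strategy of the old Lua code (`nll.sizeAverage = false`).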