Skip to content

Commit

Permalink
Merge pull request karpathy#73 from guillitte/master
Browse files · Browse the repository at this point in the history
GRU and plain RNN support
  • Loading branch information
karpathy committed Aug 1, 2015
2 parents ef0373f + e555fa9 commit 5a1793b
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 15 deletions.
23 changes: 17 additions & 6 deletions model/GRU.lua
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ local GRU = {}
Creates one timestep of one GRU
Paper reference: http://arxiv.org/pdf/1412.3555v1.pdf
]]--
function GRU.gru(input_size, rnn_size, n)

function GRU.gru(input_size, rnn_size, n, dropout)
dropout = dropout or 0
-- there are n+1 inputs (hiddens on each layer and x)
local inputs = {}
table.insert(inputs, nn.Identity()()) -- x
Expand All @@ -25,9 +25,15 @@ function GRU.gru(input_size, rnn_size, n)
for L = 1,n do

local prev_h = inputs[L+1]
if L == 1 then x = inputs[1] else x = outputs[L-1] end
if L == 1 then input_size_L = input_size else input_size_L = rnn_size end

-- the input to this layer
if L == 1 then
x = OneHot(input_size)(inputs[1])
input_size_L = input_size
else
x = outputs[(L-1)]
if dropout > 0 then x = nn.Dropout(dropout)(x) end -- apply dropout, if any
input_size_L = rnn_size
end
-- GRU tick
-- forward the update and reset gates
local update_gate = nn.Sigmoid()(new_input_sum(input_size_L, x, prev_h))
Expand All @@ -44,9 +50,14 @@ function GRU.gru(input_size, rnn_size, n)

table.insert(outputs, next_h)
end
-- set up the decoder
local top_h = outputs[#outputs]
if dropout > 0 then top_h = nn.Dropout(dropout)(top_h) end
local proj = nn.Linear(rnn_size, input_size)(top_h)
local logsoft = nn.LogSoftMax()(proj)
table.insert(outputs, logsoft)

return nn.gModule(inputs, outputs)
end

return GRU

19 changes: 16 additions & 3 deletions model/RNN.lua
Original file line number Diff line number Diff line change
@@ -1,21 +1,28 @@
local RNN = {}

function RNN.rnn(input_size, rnn_size, n)
function RNN.rnn(input_size, rnn_size, n, dropout)

-- there are n+1 inputs (hiddens on each layer and x)
local inputs = {}
table.insert(inputs, nn.Identity()()) -- x
for L = 1,n do
table.insert(inputs, nn.Identity()()) -- prev_h[L]

end

local x, input_size_L
local outputs = {}
for L = 1,n do

local prev_h = inputs[L+1]
if L == 1 then x = inputs[1] else x = outputs[L-1] end
if L == 1 then input_size_L = input_size else input_size_L = rnn_size end
if L == 1 then
x = OneHot(input_size)(inputs[1])
input_size_L = input_size
else
x = outputs[(L-1)]
if dropout > 0 then x = nn.Dropout(dropout)(x) end -- apply dropout, if any
input_size_L = rnn_size
end

-- RNN tick
local i2h = nn.Linear(input_size_L, rnn_size)(x)
Expand All @@ -24,6 +31,12 @@ function RNN.rnn(input_size, rnn_size, n)

table.insert(outputs, next_h)
end
-- set up the decoder
local top_h = outputs[#outputs]
if dropout > 0 then top_h = nn.Dropout(dropout)(top_h) end
local proj = nn.Linear(rnn_size, input_size)(top_h)
local logsoft = nn.LogSoftMax()(proj)
table.insert(outputs, logsoft)

return nn.gModule(inputs, outputs)
end
Expand Down
6 changes: 4 additions & 2 deletions sample.lua
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ local ivocab = {}
for c,i in pairs(vocab) do ivocab[i] = c end

-- initialize the rnn state to all zeros
gprint('creating an LSTM...')
gprint('creating an ' .. checkpoint.opt.model .. '...')
local current_state
local num_layers = checkpoint.opt.num_layers
current_state = {}
Expand All @@ -101,7 +101,9 @@ for L = 1,checkpoint.opt.num_layers do
if opt.gpuid >= 0 and opt.opencl == 0 then h_init = h_init:cuda() end
if opt.gpuid >= 0 and opt.opencl == 1 then h_init = h_init:cl() end
table.insert(current_state, h_init:clone())
table.insert(current_state, h_init:clone())
if checkpoint.opt.model == 'lstm' then
table.insert(current_state, h_init:clone())
end
end
state_size = #current_state

Expand Down
18 changes: 14 additions & 4 deletions train.lua
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ require 'util.misc'
local CharSplitLMMinibatchLoader = require 'util.CharSplitLMMinibatchLoader'
local model_utils = require 'util.model_utils'
local LSTM = require 'model.LSTM'
local GRU = require 'model.GRU'
local RNN = require 'model.RNN'

cmd = torch.CmdLine()
cmd:text()
Expand All @@ -35,7 +37,7 @@ cmd:option('-data_dir','data/tinyshakespeare','data directory. Should contain th
-- model params
cmd:option('-rnn_size', 128, 'size of LSTM internal state')
cmd:option('-num_layers', 2, 'number of layers in the LSTM')
cmd:option('-model', 'lstm', 'for now only lstm is supported. keep fixed')
cmd:option('-model', 'lstm', 'lstm,gru or rnn')
-- optimization
cmd:option('-learning_rate',2e-3,'learning rate')
cmd:option('-learning_rate_decay',0.97,'learning rate decay')
Expand Down Expand Up @@ -132,9 +134,15 @@ if string.len(opt.init_from) > 0 then
opt.num_layers = checkpoint.opt.num_layers
do_random_init = false
else
print('creating an LSTM with ' .. opt.num_layers .. ' layers')
print('creating an ' .. opt.model .. ' with ' .. opt.num_layers .. ' layers')
protos = {}
protos.rnn = LSTM.lstm(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout)
if opt.model == 'lstm' then
protos.rnn = LSTM.lstm(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout)
elseif opt.model == 'gru' then
protos.rnn = GRU.gru(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout)
elseif opt.model == 'rnn' then
protos.rnn = RNN.rnn(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout)
end
protos.criterion = nn.ClassNLLCriterion()
end

Expand All @@ -145,7 +153,9 @@ for L=1,opt.num_layers do
if opt.gpuid >=0 and opt.opencl == 0 then h_init = h_init:cuda() end
if opt.gpuid >=0 and opt.opencl == 1 then h_init = h_init:cl() end
table.insert(init_state, h_init:clone())
table.insert(init_state, h_init:clone())
if opt.model == 'lstm' then
table.insert(init_state, h_init:clone())
end
end

-- ship the model to the GPU if desired
Expand Down

0 comments on commit 5a1793b

Please sign in to comment.