Second GRU layer
dennybritz committed Oct 20, 2015
1 parent b85d542 commit 61563ac
Showing 4 changed files with 310 additions and 161 deletions.
gru_theano.py (62 changes: 36 additions, 26 deletions)
@@ -7,45 +7,46 @@

 class GRUTheano:
 
-    def __init__(self, word_dim, hidden_dim=100, reg_lambda=0, wordvec=None):
+    def __init__(self, word_dim, hidden_dim=100, reg_lambda=0, wordvec=None, bptt_truncate=-1):
         # Assign instance variables
         self.word_dim = word_dim
         self.hidden_dim = hidden_dim
         self.reg_lambda = reg_lambda
+        self.bptt_truncate = bptt_truncate
         # Initialize the network parameters
         if wordvec != None:
             U = np.array([wordvec.T, wordvec.T, wordvec.T])
         else:
             U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (3, hidden_dim, word_dim))
-        W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (3, hidden_dim, hidden_dim))
-        b = np.zeros((3, hidden_dim))
+        W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (9, hidden_dim, hidden_dim))
+        b = np.zeros((6, hidden_dim))
         V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
-        b2 = np.zeros(word_dim)
+        c = np.zeros(word_dim)
         # Theano: Created shared variables
         self.U = theano.shared(name='U', value=U.astype(theano.config.floatX))
         self.W = theano.shared(name='W', value=W.astype(theano.config.floatX))
         self.V = theano.shared(name='V', value=V.astype(theano.config.floatX))
         # Bias terms
         self.b = theano.shared(name='b_i', value=b.astype(theano.config.floatX))
-        self.b2 = theano.shared(name='b_V', value=b2.astype(theano.config.floatX))
+        self.c = theano.shared(name='c', value=c.astype(theano.config.floatX))
         # SGD: Initialize parameters
         self.mU = theano.shared(name='mU', value=np.zeros(U.shape).astype(theano.config.floatX))
         self.mV = theano.shared(name='mV', value=np.zeros(V.shape).astype(theano.config.floatX))
         self.mW = theano.shared(name='mW', value=np.zeros(W.shape).astype(theano.config.floatX))
         self.mb = theano.shared(name='mb', value=np.zeros(b.shape).astype(theano.config.floatX))
-        self.mb2 = theano.shared(name='mb2', value=np.zeros(b2.shape).astype(theano.config.floatX))
+        self.mc = theano.shared(name='mc', value=np.zeros(c.shape).astype(theano.config.floatX))
         # We store the Theano graph here
         self.theano = {}
         self.__theano_build__()

     def __theano_build__(self):
-        V, U, W, b, b2 = self.V, self.U, self.W, self.b, self.b2
-        # mV, mU, mW, mb, mb2 = self.mV, self.mU, self.mW, self.mb, self.mb2
+        V, U, W, b, c = self.V, self.U, self.W, self.b, self.c
 
         x = T.ivector('x')
         y = T.ivector('y')
 
-        def forward_prop_step(x_t, s_t_prev):
+        def forward_prop_step(x_t, s_t_prev, s_t2_prev):
             # This is how we calculated the hidden state in a simple RNN. No longer!
             # s_t = T.tanh(U[:,x_t] + W.dot(s_t_prev))
 
@@ -54,46 +54,53 @@ def forward_prop_step(x_t, s_t_prev):
             U_clipped = grad_clip(U, -1, 1)
             V_clipped = grad_clip(V, -1, 1)
             b_clipped = grad_clip(b, -1, 1)
-            b2_clipped = grad_clip(b2, -1, 1)
+            c_clipped = grad_clip(c, -1, 1)
 
-            # GRU hidden state calculation
+            # Layer 1
             z_t = T.nnet.sigmoid(U_clipped[0][:,x_t] + W_clipped[0].dot(s_t_prev) + b_clipped[0])
             r_t = T.nnet.sigmoid(U_clipped[1][:,x_t] + W_clipped[1].dot(s_t_prev) + b_clipped[1])
             c_t = T.tanh(U_clipped[2][:,x_t] + W_clipped[2].dot(s_t_prev) * r_t + b_clipped[2])
             s_t = (1 - z_t) * c_t + z_t * s_t_prev
+
+            # Layer 2
+            z_t2 = T.nnet.sigmoid(W_clipped[3].dot(s_t) + W_clipped[6].dot(s_t2_prev) + b_clipped[3])
+            r_t2 = T.nnet.sigmoid(W_clipped[4].dot(s_t) + W_clipped[7].dot(s_t2_prev) + b_clipped[4])
+            c_t2 = T.tanh(W_clipped[5].dot(s_t) + W_clipped[8].dot(s_t2_prev) * r_t2 + b_clipped[5])
+            s_t2 = (1 - z_t2) * c_t2 + z_t2 * s_t2_prev
+
             # Final output calculation
             # Theano's softmax returns a matrix with one row, we only need the row
-            o_t = T.nnet.softmax(V_clipped.dot(s_t) + b2_clipped)[0]
+            o_t = T.nnet.softmax(V_clipped.dot(s_t2) + c_clipped)[0]
 
-            return [o_t, s_t]
+            return [o_t, s_t, s_t2]

-        [o,s], updates = theano.scan(
+        [o,s,s2], updates = theano.scan(
             forward_prop_step,
             sequences=x,
-            outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))])
+            truncate_gradient=self.bptt_truncate,
+            outputs_info=[None, dict(initial=T.zeros(self.hidden_dim)), dict(initial=T.zeros(self.hidden_dim))])
 
         prediction = T.argmax(o, axis=1)
         o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
 
         # Regularization cost
         reg_cost = self.reg_lambda/2. * \
-            (T.sum(T.sqr(V)) + T.sum(T.sqr(U)) + T.sum(T.sqr(W)) + T.sum(T.sqr(b)) + T.sum(T.sqr(b2)))
+            (T.sum(T.sqr(V)) + T.sum(T.sqr(U)) + T.sum(T.sqr(W)) + T.sum(T.sqr(b)) + T.sum(T.sqr(c)))
         # Total cost
         cost = o_error + reg_cost
 
         # Gradients
-        dU = T.grad(o_error, U)
-        dW = T.grad(o_error, W)
-        db = T.grad(o_error, b)
-        dV = T.grad(o_error, V)
-        db2 = T.grad(o_error, b2)
+        dU = T.grad(cost, U)
+        dW = T.grad(cost, W)
+        db = T.grad(cost, b)
+        dV = T.grad(cost, V)
+        dc = T.grad(cost, c)
 
         # Assign functions
         self.forward_propagation = theano.function([x], o)
         self.predict = theano.function([x], prediction)
         self.ce_error = theano.function([x, y], cost)
-        self.bptt = theano.function([x, y], [dU, dW, db, dV, db2])
+        self.bptt = theano.function([x, y], [dU, dW, db, dV, dc])
 
         # SGD parameters
         learning_rate = T.scalar('learning_rate')
@@ -104,26 +104,28 @@ def forward_prop_step(x_t, s_t_prev):
         mW = decay * self.mW + (1 - decay) * T.sqr(dW)
         mV = decay * self.mV + (1 - decay) * T.sqr(dV)
         mb = decay * self.mb + (1 - decay) * T.sqr(db)
-        mb2 = decay * self.mb2 + (1 - decay) * T.sqr(db2)
+        mc = decay * self.mc + (1 - decay) * T.sqr(dc)
 
         self.sgd_step = theano.function(
-            [x, y, learning_rate, theano.Param(decay, default=0.9)],
+            [x, y, learning_rate, theano.Param(decay, default=0.99)],
             [],
             updates=[(U, U - learning_rate * dU / T.sqrt(mU + 1e-8)),
                      (W, W - learning_rate * dW / T.sqrt(mW + 1e-8)),
                      (V, V - learning_rate * dV / T.sqrt(mV + 1e-8)),
                      (b, b - learning_rate * db / T.sqrt(mb + 1e-8)),
-                     (b2, b2 - learning_rate * db2 / T.sqrt(mb2 + 1e-8)),
+                     (c, c - learning_rate * dc / T.sqrt(mc + 1e-8)),
                      (self.mU, mU),
                      (self.mW, mW),
                      (self.mV, mV),
                      (self.mb, mb),
-                     (self.mb2, mb2)
+                     (self.mc, mc)
                     ])
 
     def calculate_total_loss(self, X, Y):
         return np.sum([self.ce_error(x,y) for x,y in zip(X,Y)])
 
     def calculate_loss(self, X, Y):
         # Divide calculate_loss by the number of words
         num_words = np.sum([len(y) for y in Y])
         return self.calculate_total_loss(X,Y)/float(num_words)
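
For readers following the change: the new forward_prop_step stacks two GRU layers. Layer 1 reads the embedding column U[i][:, x_t] and its previous state s_t_prev; layer 2 takes layer 1's new state s_t as its input together with its own previous state s_t2_prev; only the second state feeds the softmax output, and theano.scan now carries both states and passes truncate_gradient=self.bptt_truncate. Below is a minimal NumPy sketch of that single step. It is not part of the commit: the helper names (sigmoid, softmax, two_layer_gru_step) and the smoke-test values are illustrative, gradient clipping is omitted, and the candidate states are written h_t / h_t2 here to avoid clashing with the output bias c. Parameter shapes follow the diff above.

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def softmax(z):
    e = np.exp(z - np.max(z))
    return e / e.sum()

def two_layer_gru_step(x_t, s_t_prev, s_t2_prev, U, W, b, V, c):
    # Layer 1: gates read the word's embedding column and the previous layer-1 state
    z_t = sigmoid(U[0][:, x_t] + W[0].dot(s_t_prev) + b[0])
    r_t = sigmoid(U[1][:, x_t] + W[1].dot(s_t_prev) + b[1])
    h_t = np.tanh(U[2][:, x_t] + W[2].dot(s_t_prev) * r_t + b[2])
    s_t = (1 - z_t) * h_t + z_t * s_t_prev
    # Layer 2: same gating, but the "input" is the layer-1 state instead of a word
    z_t2 = sigmoid(W[3].dot(s_t) + W[6].dot(s_t2_prev) + b[3])
    r_t2 = sigmoid(W[4].dot(s_t) + W[7].dot(s_t2_prev) + b[4])
    h_t2 = np.tanh(W[5].dot(s_t) + W[8].dot(s_t2_prev) * r_t2 + b[5])
    s_t2 = (1 - z_t2) * h_t2 + z_t2 * s_t2_prev
    # Only the second layer's state drives the output distribution
    o_t = softmax(V.dot(s_t2) + c)
    return o_t, s_t, s_t2

# Tiny smoke test with random parameters (word_dim=8, hidden_dim=4)
word_dim, hidden_dim = 8, 4
rng = np.random.RandomState(0)
U = rng.uniform(-0.1, 0.1, (3, hidden_dim, word_dim))
W = rng.uniform(-0.1, 0.1, (9, hidden_dim, hidden_dim))
b = np.zeros((6, hidden_dim))
V = rng.uniform(-0.1, 0.1, (word_dim, hidden_dim))
c = np.zeros(word_dim)
o, s1, s2 = two_layer_gru_step(2, np.zeros(hidden_dim), np.zeros(hidden_dim), U, W, b, V, c)
assert o.shape == (word_dim,) and abs(o.sum() - 1.0) < 1e-6

The sgd_step changes follow the same renames: the rmsprop cache for the output bias becomes mc, each cache is decayed with the new default decay of 0.99, and every parameter is updated by its gradient divided by T.sqrt(cache + 1e-8).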
