@@ -198,7 +198,7 @@ class GRU(Layer):
     def __init__(self, input_dim, output_dim=128,
                  init='glorot_uniform', inner_init='orthogonal',
                  activation='sigmoid', inner_activation='hard_sigmoid',
-                 weights=None, truncate_gradient=-1, return_sequences=False):
+                 weights=None, truncate_gradient=-1, return_sequences=False, mask_val=default_mask_val):

         super(GRU, self).__init__()
         self.input_dim = input_dim
@@ -211,6 +211,7 @@ def __init__(self, input_dim, output_dim=128,
         self.activation = activations.get(activation)
         self.inner_activation = activations.get(inner_activation)
         self.input = T.tensor3()
+        self.mask_val = shared_scalar(mask_val)

         self.W_z = self.init((self.input_dim, self.output_dim))
         self.U_z = self.inner_init((self.output_dim, self.output_dim))
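(Note: the added line originally read `shared_scalar(default_mask_val)`, which silently ignores the new `mask_val` argument; it is corrected above to match the LSTM hunk below.) Both layers store the padding marker via `shared_scalar` so it can participate in the symbolic graph and be changed later without recompiling. A minimal sketch of such a helper, assuming the usual `theano_utils` convention (the exact definition is not part of this diff):

```python
import numpy as np
import theano

def shared_scalar(value=0., dtype=theano.config.floatX, name=None):
    # A 0-d shared variable: usable inside symbolic expressions and
    # updatable via .set_value() without rebuilding the graph.
    return theano.shared(np.cast[dtype](value), name=name)
```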
@@ -234,29 +235,35 @@ def __init__(self, input_dim, output_dim=128,
         self.set_weights(weights)

     def _step(self,
-              xz_t, xr_t, xh_t,
+              xz_t, xr_t, xh_t, mask_tm1,
               h_tm1,
               u_z, u_r, u_h):
-        z = self.inner_activation(xz_t + T.dot(h_tm1, u_z))
-        r = self.inner_activation(xr_t + T.dot(h_tm1, u_r))
-        hh_t = self.activation(xh_t + T.dot(r * h_tm1, u_h))
-        h_t = z * h_tm1 + (1 - z) * hh_t
+        h_mask_tm1 = mask_tm1 * h_tm1
+        z = self.inner_activation(xz_t + T.dot(h_mask_tm1, u_z))
+        r = self.inner_activation(xr_t + T.dot(h_mask_tm1, u_r))
+        hh_t = self.activation(xh_t + T.dot(r * h_mask_tm1, u_h))
+        h_t = z * h_mask_tm1 + (1 - z) * hh_t
+        # return theano.printing.Print("h_t")(h_t)
         return h_t

     def get_output(self, train):
         X = self.get_input(train)
         X = X.dimshuffle((1, 0, 2))
+        mask, padded_mask = get_mask(X, self.mask_val, steps_back=1)

         x_z = T.dot(X, self.W_z) + self.b_z
         x_r = T.dot(X, self.W_r) + self.b_r
         x_h = T.dot(X, self.W_h) + self.b_h
         outputs, updates = theano.scan(
             self._step,
-            sequences=[x_z, x_r, x_h],
+            sequences=[x_z, x_r, x_h, padded_mask],
             outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
             non_sequences=[self.U_z, self.U_r, self.U_h],
             truncate_gradient=self.truncate_gradient
         )
+
+        outputs = mask * outputs + (1 - mask) * self.mask_val
+
         if self.return_sequences:
             return outputs.dimshuffle((1, 0, 2))
         return outputs[-1]
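The core of the change is threading a per-timestep binary mask through `theano.scan`: inside `_step`, the carried state is multiplied by the mask of the *previous* timestep, so a padded predecessor contributes nothing to the new state (zeroing the history, which matches left-padded input). `get_mask` itself is not shown in this diff; a plausible sketch, under the assumption that a timestep counts as padding when all of its features equal `mask_val`, and that `steps_back=1` shifts the mask by one step:

```python
import theano.tensor as T

def get_mask(X, mask_val, steps_back=1):
    # X: (time, samples, input_dim), after the dimshuffle in get_output.
    # mask: 1.0 where a timestep holds real data, 0.0 where it is padding.
    mask = T.any(T.neq(X, mask_val), axis=-1, keepdims=True)
    mask = T.cast(mask, X.dtype)
    # Prepend `steps_back` rows of ones and trim to length, so that at step t
    # the scan receives mask[t - steps_back]; the initial state counts as valid.
    head = T.ones_like(mask[:steps_back])
    padded_mask = T.concatenate([head, mask], axis=0)[:X.shape[0]]
    return mask, padded_mask
```

The `(time, samples, 1)` shape lets the mask broadcast against both the `(samples, output_dim)` states inside `_step` and the full `(time, samples, output_dim)` scan output.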
@@ -302,13 +309,14 @@ class LSTM(Layer):
     def __init__(self, input_dim, output_dim=128,
                  init='glorot_uniform', inner_init='orthogonal',
                  activation='tanh', inner_activation='hard_sigmoid',
-                 weights=None, truncate_gradient=-1, return_sequences=False):
+                 weights=None, truncate_gradient=-1, return_sequences=False, mask_val=default_mask_val):

         super(LSTM, self).__init__()
         self.input_dim = input_dim
         self.output_dim = output_dim
         self.truncate_gradient = truncate_gradient
         self.return_sequences = return_sequences
+        self.mask_val = shared_scalar(mask_val)

         self.init = initializations.get(init)
         self.inner_init = initializations.get(inner_init)
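With the new keyword in place, a padding-aware layer can be requested at construction time. A hypothetical usage sketch (module paths and the zero marker are assumptions, not shown in this diff):

```python
from keras.models import Sequential
from keras.layers.recurrent import LSTM

# Inputs are padded to a fixed length with the mask value (assumed 0. here);
# masked timesteps no longer disturb the recurrent state.
model = Sequential()
model.add(LSTM(input_dim=16, output_dim=32, return_sequences=True, mask_val=0.))
```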
@@ -343,19 +351,23 @@ def __init__(self, input_dim, output_dim=128,
         self.set_weights(weights)

     def _step(self,
-              xi_t, xf_t, xo_t, xc_t,
+              xi_t, xf_t, xo_t, xc_t, mask_tm1,
               h_tm1, c_tm1,
               u_i, u_f, u_o, u_c):
-        i_t = self.inner_activation(xi_t + T.dot(h_tm1, u_i))
-        f_t = self.inner_activation(xf_t + T.dot(h_tm1, u_f))
-        c_t = f_t * c_tm1 + i_t * self.activation(xc_t + T.dot(h_tm1, u_c))
-        o_t = self.inner_activation(xo_t + T.dot(h_tm1, u_o))
+        h_mask_tm1 = mask_tm1 * h_tm1
+        c_mask_tm1 = mask_tm1 * c_tm1
+
+        i_t = self.inner_activation(xi_t + T.dot(h_mask_tm1, u_i))
+        f_t = self.inner_activation(xf_t + T.dot(h_mask_tm1, u_f))
+        c_t = f_t * c_mask_tm1 + i_t * self.activation(xc_t + T.dot(h_mask_tm1, u_c))
+        o_t = self.inner_activation(xo_t + T.dot(h_mask_tm1, u_o))
         h_t = o_t * self.activation(c_t)
         return h_t, c_t

     def get_output(self, train):
         X = self.get_input(train)
         X = X.dimshuffle((1, 0, 2))
+        mask, padded_mask = get_mask(X, self.mask_val, steps_back=1)

         xi = T.dot(X, self.W_i) + self.b_i
         xf = T.dot(X, self.W_f) + self.b_f
@@ -364,14 +376,16 @@ def get_output(self, train):

         [outputs, memories], updates = theano.scan(
             self._step,
-            sequences=[xi, xf, xo, xc],
+            sequences=[xi, xf, xo, xc, padded_mask],
             outputs_info=[
                 T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
                 T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1)
             ],
             non_sequences=[self.U_i, self.U_f, self.U_o, self.U_c],
             truncate_gradient=self.truncate_gradient
         )
+
+        outputs = mask * outputs + (1 - mask) * self.mask_val
+
         if self.return_sequences:
             return outputs.dimshuffle((1, 0, 2))
         return outputs[-1]
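After the scan, `outputs = mask * outputs + (1 - mask) * self.mask_val` forces the states computed at padded timesteps back to the padding marker, so a stacked recurrent layer can recover the same mask from its input. A small numeric illustration in plain numpy (the values are made up):

```python
import numpy as np

mask_val = 0.0                         # assumed value of default_mask_val
mask = np.array([[[1.0]], [[0.0]]])    # (time=2, samples=1, 1); step 1 is padding
outputs = np.array([[[0.7, -0.2]],     # state computed at the real step
                    [[0.3,  0.6]]])    # garbage computed at the padded step

clamped = mask * outputs + (1 - mask) * mask_val
# clamped[1] == [[0., 0.]] -- padded steps are reset to mask_val, so the
# next layer's get_mask sees them as padding again.
```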