
Merge pull request #2 from allenai/master
Merge from upstream
alvinhom authored Apr 27, 2017
2 parents c78d397 + 00d3630 commit 72b634b
Showing 13 changed files with 155 additions and 69 deletions.
4 changes: 3 additions & 1 deletion deep_qa/layers/attention/masked_softmax.py
@@ -48,12 +48,14 @@ def call(self, inputs, mask=None):
             inputs = K.squeeze(inputs, axis=-1)
             input_shape = input_shape[:-1]
         if len(input_shape) > 2:
+            original_inputs = inputs
             inputs = last_dim_flatten(inputs)
             if mask is not None:
                 mask = last_dim_flatten(mask)
         # Now we have both inputs and mask with shape (?, num_options), and can do a softmax.
         softmax_result = masked_softmax(inputs, mask)
         if len(input_shape) > 2:
-            input_shape = (-1,) + input_shape[1:]
+            original_shape = K.shape(original_inputs)
+            input_shape = K.concatenate([[-1], original_shape[1:]], 0)
             softmax_result = K.reshape(softmax_result, input_shape)
         return softmax_result
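For context, a minimal sketch of why the reshape now has to be built from ``K.shape()`` rather than the static ``K.int_shape()``. This is not part of the diff; it assumes a TensorFlow-backed Keras of the version this repository targets, and the placeholder shape is made up for illustration:

from keras import backend as K

inputs = K.placeholder(shape=(None, None, 7))   # (batch_size, num_rows, num_options), num_rows unknown
flat = K.reshape(inputs, (-1, 7))               # collapse everything except the last dimension
softmax_result = K.softmax(flat)
# K.int_shape(inputs)[1:] is (None, 7), which K.reshape cannot consume, so the old static
# reshape broke whenever a middle dimension was None.  Building the target shape as a
# tensor from K.shape() resolves the unknown dimension at run time instead:
new_shape = K.concatenate([[-1], K.shape(inputs)[1:]], 0)
restored = K.reshape(softmax_result, new_shape)  # back to (batch_size, num_rows, num_options)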
23 changes: 6 additions & 17 deletions deep_qa/layers/attention/matrix_attention.py
@@ -80,24 +80,13 @@ def compute_output_shape(self, input_shape):

     @overrides
     def call(self, inputs, mask=None):
-        """
-        NOTE: This does not work if ``num_rows_1`` or ``num_rows_2`` is ``None``! I tried to get
-        it to work, but ``K.dot()`` breaks.
-        """
         matrix_1, matrix_2 = inputs
-        matrix_1_shape = K.int_shape(matrix_1)
-        matrix_2_shape = K.int_shape(matrix_2)
-        num_rows_1 = matrix_1_shape[1]
-        num_rows_2 = matrix_2_shape[1]
-        tiled_matrix_1 = K.repeat_elements(K.expand_dims(matrix_1, axis=2), num_rows_2, axis=2)
-        tiled_matrix_2 = K.repeat_elements(K.expand_dims(matrix_2, axis=1), num_rows_1, axis=1)
-
-        # We need to be able to access K.int_shape() in compute_similarity() below, but in theano,
-        # calling a backend function makes it so you can't use K.int_shape() anymore. Setting
-        # tensor._keras_shape here fixes that.
-        # pylint: disable=protected-access
-        tiled_matrix_1._keras_shape = matrix_1_shape[:2] + (num_rows_2,) + matrix_1_shape[2:]
-        tiled_matrix_2._keras_shape = matrix_2_shape[:1] + (num_rows_1,) + matrix_2_shape[1:]
+        num_rows_1 = K.shape(matrix_1)[1]
+        num_rows_2 = K.shape(matrix_2)[1]
+        tile_dims_1 = K.concatenate([[1, 1], [num_rows_2], [1]], 0)
+        tile_dims_2 = K.concatenate([[1], [num_rows_1], [1, 1]], 0)
+        tiled_matrix_1 = K.tile(K.expand_dims(matrix_1, axis=2), tile_dims_1)
+        tiled_matrix_2 = K.tile(K.expand_dims(matrix_2, axis=1), tile_dims_2)
         return self.similarity_function.compute_similarity(tiled_matrix_1, tiled_matrix_2)

     @overrides
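A small numpy sketch of the tiling the new ``call`` performs (not part of the diff; the shapes are made up for illustration). ``K.tile`` replaces ``K.repeat_elements`` because the tile dimensions can now be run-time tensors, so the row counts no longer need to be known when the graph is built:

import numpy as np

matrix_1 = np.random.rand(2, 3, 8)   # (batch_size, num_rows_1, embedding_dim)
matrix_2 = np.random.rand(2, 4, 8)   # (batch_size, num_rows_2, embedding_dim)
num_rows_1, num_rows_2 = matrix_1.shape[1], matrix_2.shape[1]

# Expand, then tile, so every row of matrix_1 is paired with every row of matrix_2.
tiled_matrix_1 = np.tile(np.expand_dims(matrix_1, axis=2), (1, 1, num_rows_2, 1))
tiled_matrix_2 = np.tile(np.expand_dims(matrix_2, axis=1), (1, num_rows_1, 1, 1))
print(tiled_matrix_1.shape, tiled_matrix_2.shape)   # (2, 3, 4, 8) (2, 3, 4, 8)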
33 changes: 7 additions & 26 deletions deep_qa/layers/attention/weighted_sum.py
@@ -71,41 +71,22 @@ def compute_output_shape(self, input_shapes):

     @overrides
     def call(self, inputs, mask=None):
-        # pylint: disable=redefined-variable-type
         matrix, attention_vector = inputs
-        matrix_shape = K.int_shape(matrix)
-        matrix = self._expand_matrix_if_necessary(matrix, matrix_shape[:-1], attention_vector)
+        num_attention_dims = K.ndim(attention_vector)
+        num_matrix_dims = K.ndim(matrix) - 1
+        for _ in range(num_attention_dims - num_matrix_dims):
+            matrix = K.expand_dims(matrix, axis=1)
         if mask is None:
             matrix_mask = None
         else:
             matrix_mask = mask[0]
         if self.use_masking and matrix_mask is not None:
-            matrix_mask = self._expand_matrix_if_necessary(matrix_mask, matrix_shape[:-1], attention_vector)
+            # Doing a multiply here instead of a `switch` to avoid allocating another large tensor.
+            for _ in range(num_attention_dims - num_matrix_dims):
+                matrix_mask = K.expand_dims(matrix_mask, axis=1)
             matrix = K.cast(K.expand_dims(matrix_mask), 'float32') * matrix
         return K.sum(K.expand_dims(attention_vector, axis=-1) * matrix, -2)

-    @staticmethod
-    def _expand_matrix_if_necessary(matrix, matrix_shape, attention_vector):
-        """
-        This function gets the tiles the matrix to have the same shape as the attention vector,
-        ignoring the embedding dimension. We take the shape as input (where the shape already has
-        the embedding dimension removed) so we can call this on the mask as well as the input
-        matrix.
-        """
-        attention_shape = K.int_shape(attention_vector)
-        if matrix_shape != attention_shape:
-            # We'll take care of the batch size first. After this, the matrix_shape should match
-            # the end of the attention_shape exactly.
-            assert matrix_shape[0] == attention_shape[0], "somehow batch sizes don't match"
-            matrix_shape = matrix_shape[1:]
-            attention_shape = attention_shape[1:]
-            assert attention_shape[-len(matrix_shape):] == matrix_shape, ("matrix_shape must be "
-                                                                          "subset of attention_shape")
-            for i in range(len(attention_shape) - len(matrix_shape)):
-                matrix = K.expand_dims(matrix, axis=i+1)  # +1 to account for batch_size
-                matrix = K.repeat_elements(matrix, attention_shape[i], axis=i+1)
-        return matrix
-
     @overrides
     def get_config(self):
         base_config = super(WeightedSum, self).get_config()
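A numpy sketch of the new ``call`` logic (not part of the diff; shapes are illustrative): the matrix is padded with singleton dimensions until broadcasting lines it up with the attention vector, instead of being explicitly tiled by the removed helper:

import numpy as np

matrix = np.random.rand(2, 5, 8)       # (batch_size, num_words, embedding_dim)
attention = np.random.rand(2, 3, 5)    # (batch_size, num_queries, num_words)

num_attention_dims = attention.ndim     # 3
num_matrix_dims = matrix.ndim - 1       # 2, ignoring the embedding dimension
for _ in range(num_attention_dims - num_matrix_dims):
    matrix = np.expand_dims(matrix, axis=1)          # (2, 1, 5, 8), broadcasts over num_queries

weighted_sum = np.sum(np.expand_dims(attention, axis=-1) * matrix, axis=-2)
print(weighted_sum.shape)               # (2, 3, 8): one weighted average per query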
1 change: 1 addition & 0 deletions deep_qa/layers/backend/__init__.py
@@ -7,5 +7,6 @@
 from .permute import Permute
 from .multiply import Multiply
 from .repeat import Repeat
+from .repeat_like import RepeatLike
 from .squeeze import Squeeze
 from .add_mask import AddMask
21 changes: 16 additions & 5 deletions deep_qa/layers/backend/repeat.py
@@ -6,17 +6,25 @@

 class Repeat(MaskedLayer):
     """
-    This `Layer` calls `K.repeat_elements` on both the input and the mask, after calling
-    `K.expand_dims`.
+    This ``Layer`` calls ``K.repeat_elements`` on both the input and the mask, after calling
+    ``K.expand_dims``.

-    If the mask is not `None`, we must be able to call `K.expand_dims` using the same axis
+    If the mask is not ``None``, we must be able to call ``K.expand_dims`` using the same axis
     parameter as we do for the input.

     Input:
         - A tensor of arbitrary shape.

     Output:
         - The input tensor repeated along one of the dimensions.
+
+    Parameters
+    ----------
+    axis: int
+        We will add a dimension to the input tensor at this axis.
+    repetitions: int
+        The new dimension will have this size to it, with each slice being identical to the
+        original input tensor.
     """
     def __init__(self, axis: int, repetitions: int, **kwargs):
         self.axis = axis
@@ -28,15 +36,18 @@ def compute_mask(self, inputs, mask=None):
         # pylint: disable=unused-argument
         if mask is None:
             return None
-        return K.repeat_elements(K.expand_dims(mask, self.axis), self.repetitions, self.axis)
+        return self.__repeat_tensor(mask)

     @overrides
     def compute_output_shape(self, input_shape):
         return input_shape[:self.axis] + (self.repetitions,) + input_shape[self.axis:]

     @overrides
     def call(self, inputs, mask=None):
-        return K.repeat_elements(K.expand_dims(inputs, self.axis), self.repetitions, self.axis)
+        return self.__repeat_tensor(inputs)
+
+    def __repeat_tensor(self, tensor):
+        return K.repeat_elements(K.expand_dims(tensor, self.axis), self.repetitions, self.axis)

     @overrides
     def get_config(self):
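For reference, a tiny numpy sketch of what ``Repeat(axis=1, repetitions=4)`` does to its input (not part of the diff; the values are arbitrary):

import numpy as np

tensor = np.asarray([[2, 5, 3], [-1, -4, -2]])                   # (batch_size, length) == (2, 3)
repeated = np.repeat(np.expand_dims(tensor, axis=1), 4, axis=1)  # (2, 4, 3)
# Every slice repeated[:, i, :] is identical to the original tensor.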
63 changes: 63 additions & 0 deletions deep_qa/layers/backend/repeat_like.py
@@ -0,0 +1,63 @@
from keras import backend as K
from overrides import overrides

from ..masked_layer import MaskedLayer


class RepeatLike(MaskedLayer):
    """
    This ``Layer`` is like :class:`~.repeat.Repeat`, but gets the number of repetitions to use from
    a second input tensor.  This allows doing a number of repetitions that is unknown at graph
    compilation time, and is necessary when the ``repetitions`` argument to ``Repeat`` would be
    ``None``.

    If the mask is not ``None``, we must be able to call ``K.expand_dims`` using the same axis
    parameter as we do for the input.

    Input:
        - A tensor of arbitrary shape, which we will expand and tile.
        - A second tensor whose shape along one dimension we will copy

    Output:
        - The input tensor repeated along one of the dimensions.

    Parameters
    ----------
    axis: int
        We will add a dimension to the input tensor at this axis.
    copy_from_axis: int
        We will copy the dimension from the second tensor at this axis.
    """
    def __init__(self, axis: int, copy_from_axis: int, **kwargs):
        self.axis = axis
        self.copy_from_axis = copy_from_axis
        super(RepeatLike, self).__init__(**kwargs)

    @overrides
    def compute_mask(self, inputs, mask=None):
        # pylint: disable=unused-argument
        if mask is None or mask[0] is None:
            return None
        return self.__repeat_tensor(mask[0], inputs[1])

    @overrides
    def compute_output_shape(self, input_shape):
        return input_shape[0][:self.axis] + (input_shape[1][self.copy_from_axis],) + input_shape[0][self.axis:]

    @overrides
    def call(self, inputs, mask=None):
        return self.__repeat_tensor(inputs[0], inputs[1])

    def __repeat_tensor(self, to_repeat, to_copy):
        expanded = K.expand_dims(to_repeat, self.axis)
        ones = [1] * K.ndim(expanded)
        num_repetitions = K.shape(to_copy)[self.copy_from_axis]
        tile_shape = K.concatenate([ones[:self.axis], [num_repetitions], ones[self.axis+1:]], 0)
        return K.tile(expanded, tile_shape)

    @overrides
    def get_config(self):
        base_config = super(RepeatLike, self).get_config()
        config = {'axis': self.axis, 'copy_from_axis': self.copy_from_axis}
        config.update(base_config)
        return config
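A numpy sketch of ``__repeat_tensor`` above (not part of the diff; the values are arbitrary): the number of repetitions is read off the second input's shape rather than fixed in the constructor, which is what the Keras-level test at the bottom of this commit exercises:

import numpy as np

to_repeat = np.asarray([[2, 5, 3], [-1, -4, -2]])   # (batch_size, length) == (2, 3)
to_copy = np.ones((2, 4))                           # axis 1 has size 4, so repeat 4 times

expanded = np.expand_dims(to_repeat, axis=1)        # (2, 1, 3)
tile_shape = [1, to_copy.shape[1], 1]               # built from K.shape(to_copy) in the layer
print(np.tile(expanded, tile_shape).shape)          # (2, 4, 3)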
5 changes: 4 additions & 1 deletion deep_qa/layers/complex_concat.py
@@ -81,7 +81,10 @@ def _get_combination(self, combination: str, tensors: List['Tensor']):
             first_tensor = self._get_combination(combination[0], tensors)
             second_tensor = self._get_combination(combination[2], tensors)
             if K.int_shape(first_tensor) != K.int_shape(second_tensor):
-                raise ConfigurationError("Cannot combine two tensors with different shapes!")
+                shapes_message = "Shapes were: {} and {}".format(K.int_shape(first_tensor),
+                                                                 K.int_shape(second_tensor))
+                raise ConfigurationError("Cannot combine two tensors with different shapes! " +
+                                         shapes_message)
             operation = combination[1]
             if operation == '*':
                 return first_tensor * second_tensor
26 changes: 14 additions & 12 deletions deep_qa/models/reading_comprehension/bidirectional_attention.py
@@ -1,12 +1,12 @@
-from typing import Dict
+from typing import Dict, List

 from keras.layers import Dense, Input, Concatenate, TimeDistributed
 from overrides import overrides

 from ...data.instances.reading_comprehension import CharacterSpanInstance
 from ...layers import ComplexConcat, Highway
 from ...layers.attention import MatrixAttention, MaskedSoftmax, WeightedSum
-from ...layers.backend import Max, Repeat
+from ...layers.backend import Max, RepeatLike, Repeat
 from ...training import TextTrainer
 from ...training.models import DeepQaModel
 from ...common.params import Params
@@ -139,9 +139,9 @@ def _build_model(self):

         # Then he repeats this question/passage vector for every word in the passage, and uses it
         # as an additional input to the hidden layers above.
-        repeat_layer = Repeat(axis=1, repetitions=self.num_passage_words)
+        repeat_layer = RepeatLike(axis=1, copy_from_axis=1)
         # Shape: (batch_size, num_passage_words, embedding_dim * 2)
-        tiled_question_passage_vector = repeat_layer(question_passage_vector)
+        tiled_question_passage_vector = repeat_layer([question_passage_vector, encoded_passage])

         # Shape: (batch_size, num_passage_words, embedding_dim * 8)
         complex_concat_layer = ComplexConcat(combination='1,2,1*2,1*3', name='final_merged_passage')
@@ -176,9 +176,10 @@ def _build_model(self):
         # his figure makes it clear this is what he intended; he just wrote the equations wrong).
         # Shape: (batch_size, num_passage_words, embedding_dim * 2)
         sum_layer = WeightedSum(name="passage_weighted_by_predicted_span", use_masking=False)
-        repeat_layer = Repeat(axis=1, repetitions=self.num_passage_words)
-        passage_weighted_by_predicted_span = repeat_layer(sum_layer([modeled_passage,
-                                                                     span_begin_probabilities]))
+        repeat_layer = RepeatLike(axis=1, copy_from_axis=1)
+        passage_weighted_by_predicted_span = repeat_layer([sum_layer([modeled_passage,
+                                                                      span_begin_probabilities]),
+                                                           encoded_passage])
         span_end_representation = ComplexConcat(combination="1,2,3,2*3")([final_merged_passage,
                                                                           modeled_passage,
                                                                           passage_weighted_by_predicted_span])
@@ -204,13 +205,10 @@ def _get_padding_lengths(self) -> Dict[str, int]:

     @overrides
     def _set_padding_lengths(self, padding_lengths: Dict[str, int]):
-        # Adding this because we're bypassing num_sentence_words in our model, but TextTrainer
-        # expects it.
-        padding_lengths['num_sentence_words'] = None
         super(BidirectionalAttentionFlow, self)._set_padding_lengths(padding_lengths)
-        if self.num_passage_words is None:
+        if not self.use_dynamic_padding and self.num_passage_words is None:
             self.num_passage_words = padding_lengths['num_passage_words']
-        if self.num_question_words is None:
+        if not self.use_dynamic_padding and self.num_question_words is None:
             self.num_question_words = padding_lengths['num_question_words']

     @overrides
@@ -224,6 +222,9 @@ def _set_padding_lengths_from_model(self):
         # self.num_sentence_words.
         self._set_text_lengths_from_model_input(self.model.get_input_shape_at(0)[1][1:])

+    def _get_instance_sorting_keys(self) -> List[str]:  # pylint: disable=no-self-use
+        return ['num_passage_words', 'num_question_words']
+
     @classmethod
     def _get_custom_objects(cls):
         custom_objects = super(BidirectionalAttentionFlow, cls)._get_custom_objects()
@@ -232,6 +233,7 @@ def _get_custom_objects(cls):
         custom_objects["MatrixAttention"] = MatrixAttention
         custom_objects["Max"] = Max
         custom_objects["Repeat"] = Repeat
+        custom_objects["RepeatLike"] = RepeatLike
         custom_objects["WeightedSum"] = WeightedSum
         return custom_objects

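In short, the rewiring in this file swaps a repetition count fixed at graph-build time for one copied from the passage tensor at run time, which is what makes dynamic padding possible. A condensed before/after sketch using the names from the diff above:

# Before: repetitions fixed when the graph is built, so num_passage_words could not be None.
tiled_question_passage_vector = Repeat(axis=1, repetitions=self.num_passage_words)(question_passage_vector)

# After: the passage length is copied from encoded_passage at run time.
tiled_question_passage_vector = RepeatLike(axis=1, copy_from_axis=1)([question_passage_vector, encoded_passage])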
4 changes: 2 additions & 2 deletions deep_qa/training/text_trainer.py
@@ -535,7 +535,7 @@ def _get_padding_lengths(self) -> Dict[str, int]:

     def _set_padding_lengths(self, dataset_padding_lengths: Dict[str, int]):
         """
-        This is about padding.  Any solver will have some number of things that need padding in
+        This is about padding.  Any model will have some number of things that need padding in
         order to make a consistent set of input arrays, like the length of a sentence.  This method
         sets those variables given a dictionary of lengths from a dataset.

@@ -544,7 +544,7 @@ def _set_padding_lengths(self, dataset_padding_lengths: Dict[str, int]):
         set a hard limit in the class parameters and don't want to change it.
         """
         if not self.use_dynamic_padding and self.num_sentence_words is None:
-            self.num_sentence_words = dataset_padding_lengths['num_sentence_words']
+            self.num_sentence_words = dataset_padding_lengths.get('num_sentence_words', None)
         if not self.use_dynamic_padding and self.num_word_characters is None:
             self.num_word_characters = dataset_padding_lengths.get('num_word_characters', None)

6 changes: 4 additions & 2 deletions deep_qa/training/trainer.py
@@ -272,9 +272,11 @@ def train(self):
         self.training_arrays = self.create_data_arrays(indexed_training_dataset)

         if self.validation_files:
-            self.validation_dataset, self.validation_arrays = self.load_data_arrays(self.validation_files)
+            self.validation_dataset, self.validation_arrays = self.load_data_arrays(self.validation_files,
+                                                                                    self.max_validation_instances)
         if self.test_files:
-            self.test_dataset, self.test_arrays = self.load_data_arrays(self.test_files)
+            self.test_dataset, self.test_arrays = self.load_data_arrays(self.test_files,
+                                                                        self.max_test_instances)

         # Then we build the model and compile it.
         logger.info("Building the model")
8 changes: 8 additions & 0 deletions doc/layers/backend.rst
@@ -67,3 +67,11 @@ Repeat
     :members:
     :undoc-members:
     :show-inheritance:
+
+RepeatLike
+----------
+
+.. automodule:: deep_qa.layers.backend.repeat_like
+    :members:
+    :undoc-members:
+    :show-inheritance:
7 changes: 4 additions & 3 deletions example_experiments/reading_comprehension/bidaf_squad.json
@@ -27,7 +27,8 @@
         "type": "adadelta",
         "lr": 0.5
     },
-    "max_training_instances": 100,
-    "validation_files": ["/efs/data/dlfa/sciq_da/processed/dev.tsv"],
-    "train_files": ["/efs/data/dlfa/sciq_da/processed/train.tsv"]
+    "use_data_generator": true,
+    "use_dynamic_padding": true,
+    "validation_files": ["/net/efs/aristo/dlfa/squad/processed/dev.tsv"],
+    "train_files": ["/net/efs/aristo/dlfa/squad/processed/train.tsv"]
 }
23 changes: 23 additions & 0 deletions tests/layers/backend/repeat_like_test.py
@@ -0,0 +1,23 @@
# pylint: disable=no-self-use,invalid-name

import numpy
from keras.layers import Input
from keras.models import Model

from deep_qa.layers.backend import RepeatLike

class TestRepeatLikeLayer:
    def test_call_works_on_simple_input(self):
        batch_size = 2
        input_length = 3
        repetitions = 4
        input_layer = Input(shape=(input_length,), dtype='float32')
        input_layer_2 = Input(shape=(None,), dtype='float32')
        repeat_output = RepeatLike(axis=1, copy_from_axis=1)([input_layer, input_layer_2])
        model = Model(inputs=[input_layer, input_layer_2], outputs=[repeat_output])
        input_tensor = numpy.asarray([[2, 5, 3], [-1, -4, -2]])
        input_tensor_2 = numpy.ones((batch_size, repetitions))
        repeat_tensor = model.predict([input_tensor, input_tensor_2])
        assert repeat_tensor.shape == (batch_size, repetitions, input_length)
        for i in range(repetitions):
            numpy.testing.assert_almost_equal(repeat_tensor[:, i, :], [[2, 5, 3], [-1, -4, -2]])