CNTK v2 library: Sequence to sequence python example
amitaga committed Sep 6, 2016
1 parent fa12c5e commit 8cff694
Showing 4 changed files with 228 additions and 109 deletions.
12 changes: 8 additions & 4 deletions bindings/python/cntk/ops/__init__.py
@@ -4,6 +4,7 @@
# ==============================================================================

import numpy as np
+from . import sequence
from ..utils import sanitize_input, sanitize_shape, get_data_type

def combine(operands, name=''):
@@ -874,8 +875,7 @@ def reciprocal(x, name=''):
    x = sanitize_input(x)
    return reciprocal(x, name).output()

-#TODO: enable when it is exposed in c++
-def cond(flag, value_if_true, value_if_false, name=''):
+def element_select(flag, value_if_true, value_if_false, name=''):
    '''
    Returns either value_if_true or value_if_false based on the value of flag.
    If flag != 0, value_if_true is returned; otherwise, value_if_false.
@@ -894,7 +894,11 @@ def cond(flag, value_if_true, value_if_false, name=''):
    Returns:
        :class:`cntk.Function`
    '''
-    raise NotImplementedError("cond is not implemented yet in V2")
+    from cntk import element_select
+    flag = sanitize_input(flag)
+    value_if_true = sanitize_input(value_if_true)
+    value_if_false = sanitize_input(value_if_false)
+    return element_select(flag, value_if_true, value_if_false, name).output()
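A minimal usage sketch of the new element_select wrapper (illustrative, not part of the commit; the input variables here are assumptions):

    from cntk.ops import input_variable, element_select
    flag = input_variable(shape=(1,))
    a = input_variable(shape=(1,))
    b = input_variable(shape=(1,))
    z = element_select(flag, a, b)  # elementwise: a where flag != 0, else b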

################################################################################
# recurrent ops
@@ -1232,7 +1236,7 @@ def placeholder_variable(shape, dynamic_axes = [Axis.default_dynamic_axis(), Axi
    '''
    from cntk import placeholder_variable
    shape = sanitize_shape(shape)
-    return placeholder_variable(shape)
+    return placeholder_variable(shape, dynamic_axes)

def parameter(shape=None, value=None, device=None, name=''):
    '''
144 changes: 144 additions & 0 deletions bindings/python/cntk/ops/sequence/__init__.py
@@ -0,0 +1,144 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

import numpy as np
from ...utils import sanitize_input, sanitize_shape, get_data_type

################################################################################
# sequence ops
################################################################################
def is_first(operand, name = ''):
    '''
    Returns a symbolic sequence that is 1 at the first step of the operand sequence and 0 at every other step.
    Example:
        TBA
    Args:
        operand: the symbolic tensor operand denoting a sequence
        name (str): the name of the node in the network
    Returns:
        :class:`cntk.Function`
    '''
    from cntk import is_first
    operand = sanitize_input(operand, get_data_type(operand))
    return is_first(operand, name).output()

def is_last(operand, name = ''):
    '''
    Returns a symbolic sequence that is 1 at the last step of the operand sequence and 0 at every other step.
    Example:
        TBA
    Args:
        operand: the symbolic tensor operand denoting a sequence
        name (str): the name of the node in the network
    Returns:
        :class:`cntk.Function`
    '''
    from cntk import is_last
    operand = sanitize_input(operand, get_data_type(operand))
    return is_last(operand, name).output()
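A minimal usage sketch for is_first and is_last (illustrative, not part of the commit; x is an assumed sequence input):

    from cntk.ops import input_variable, sequence
    x = input_variable(shape=(10,))  # a sequence input over the default dynamic axes
    starts = sequence.is_first(x)    # 1 at the first step of each sequence, 0 elsewhere
    ends = sequence.is_last(x)       # 1 at the last step of each sequence, 0 elsewhere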

def first(operand, name = ''):
    '''
    Returns the first element of the operand sequence.
    Example:
        TBA
    Args:
        operand: the symbolic tensor operand denoting a sequence
        name (str): the name of the node in the network
    Returns:
        :class:`cntk.Function`
    '''
    from cntk import first
    operand = sanitize_input(operand, get_data_type(operand))
    return first(operand, name).output()

def last(operand, name = ''):
    '''
    Returns the last element of the operand sequence.
    Example:
        TBA
    Args:
        operand: the symbolic tensor operand denoting a sequence
        name (str): the name of the node in the network
    Returns:
        :class:`cntk.Function`
    '''
    from cntk import last
    operand = sanitize_input(operand, get_data_type(operand))
    return last(operand, name).output()
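first and last extract a single element per sequence; the Sequence2Sequence example below uses first to form the encoder's thought vector. A sketch using that example's names (encoder_outputH is assumed to be a sequence):

    thought_vectorH = sequence.first(encoder_outputH)  # the first step of the encoder output
    final_stepH = sequence.last(encoder_outputH)       # the last step, e.g. as a sequence summary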

def where(condition, name = ''):
    '''
    Given a symbolic sequence of boolean flags, returns a new sequence containing the indices of the steps at which the flag is non-zero.
    Example:
        TBA
    Args:
        condition: the symbolic tensor operand denoting a boolean condition flag for each step of a sequence
        name (str): the name of the node in the network
    Returns:
        :class:`cntk.Function`
    '''
    from cntk import where
    condition = sanitize_input(condition, get_data_type(condition))
    return where(condition, name).output()
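A sketch for where (illustrative, not part of the commit): applied to a sequence of 0/1 flags, it yields the positions of the non-zero steps.

    ends = sequence.is_last(x)        # 0/1 flag per step of the sequence x
    positions = sequence.where(ends)  # the indices of the steps whose flag is non-zero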

def gather(operand, condition, name = ''):
    '''
    Returns a new (generally shorter) sequence containing only the steps of the operand sequence at which the condition flag is non-zero.
    Example:
        TBA
    Args:
        operand: the symbolic tensor operand denoting a sequence
        condition: the symbolic tensor operand denoting a boolean condition flag for each step of a sequence
        name (str): the name of the node in the network
    Returns:
        :class:`cntk.Function`
    '''
    from cntk import gather
    operand = sanitize_input(operand, get_data_type(operand))
    condition = sanitize_input(condition, get_data_type(condition))
    return gather(operand, condition, name).output()

def scatter(operand, condition, name = ''):
    '''
    The inverse of gather: places the steps of the operand sequence at the positions of the condition layout at which the flag is non-zero, and fills all other steps with 0.
    Example:
        TBA
    Args:
        operand: the symbolic tensor operand denoting a sequence
        condition: the symbolic tensor operand denoting a boolean condition flag for each step of a sequence
        name (str): the name of the node in the network
    Returns:
        :class:`cntk.Function`
    '''
    from cntk import scatter
    operand = sanitize_input(operand, get_data_type(operand))
    condition = sanitize_input(condition, get_data_type(condition))
    return scatter(operand, condition, name).output()
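gather and scatter are duals: gather compresses a sequence down to its flagged steps, while scatter spreads values back out over a flagged layout. A sketch mirroring the Sequence2Sequence example below (raw_labels and label_sequence as defined there):

    label_sentence_start = sequence.first(raw_labels)
    is_first_label = sequence.is_first(label_sequence)
    # place each start token at the first step of the label layout, zeros elsewhere
    label_sentence_start_scattered = sequence.scatter(label_sentence_start, is_first_label)
    # conversely, keep only the flagged steps of a sequence
    starts_only = sequence.gather(raw_labels, sequence.is_first(raw_labels))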

def broadcast_as(operand, broadcast_as_operand, name = ''):
    '''
    Broadcasts the operand along the dynamic axes (sequence layout) of broadcast_as_operand.
    Example:
        TBA
    Args:
        operand: the symbolic tensor operand denoting a tensor
        broadcast_as_operand: the symbolic tensor operand denoting a sequence per whose layout the main operand is to be broadcast
        name (str): the name of the node in the network
    Returns:
        :class:`cntk.Function`
    '''
    from cntk import broadcast_as
    operand = sanitize_input(operand, get_data_type(operand))
    broadcast_as_operand = sanitize_input(broadcast_as_operand, get_data_type(broadcast_as_operand))
    return broadcast_as(operand, broadcast_as_operand, name).output()
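broadcast_as repeats a per-sequence value at every step of another operand's sequence layout; the Sequence2Sequence example below uses it to make the encoder's thought vector available at every decoder step (sketch, names as in that example):

    thought_vector_broadcastH = sequence.broadcast_as(thought_vectorH, label_sequence)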
160 changes: 64 additions & 96 deletions bindings/python/examples/Sequence2Sequence/Sequence2Sequence.py
@@ -7,10 +7,11 @@
import numpy as np
import sys
import os
+import math
import time
-from cntk import learning_rates_per_sample, DeviceDescriptor, Trainer, sgdlearner, Axis, get_train_loss, get_train_eval_criterion
-from cntk.ops import variable, cross_entropy_with_softmax, classification_error
-from examples.common.nn import LSTMP_component_with_self_stabilization, embedding, fully_connected_linear_layer, select_last
+from cntk import learning_rates_per_sample, momentums_per_sample, DeviceDescriptor, Trainer, momentum_sgdlearner, Axis, text_format_minibatch_source, StreamConfiguration, print_training_progress
+from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, sequence, slice, past_value, future_value, element_select
+from examples.common.nn import LSTMP_component_with_self_stabilization, stabilize, linear_layer

# Creates and trains a sequence to sequence translation model
def train_sequence_to_sequence_translator():
@@ -28,124 +29,91 @@ def train_sequence_to_sequence_translator():
    label_dynamic_axes = [ Axis('labelAxis'), Axis.default_batch_axis() ]
    raw_labels = input_variable(shape=(label_vocab_dim), dynamic_axes = label_dynamic_axes)

+    # Instantiate the sequence to sequence translation model
+    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
-    label_sequence = cntk.ops.slice(raw_labels, label_dynamic_axes[0], 1, 0)
-    label_sentence_start = Sequence.first(raw_labels)
+    label_sequence = slice(raw_labels, label_dynamic_axes[0], 1, 0)
+    label_sentence_start = sequence.first(raw_labels)

-    is_first_label = Sequence.is_first(label_sequence)
-
-    label_sentence_start_scattered = Sequence.scatter(label_sentence_start, is_first_label)
+    is_first_label = sequence.is_first(label_sequence)
+    label_sentence_start_scattered = sequence.scatter(label_sentence_start, is_first_label)

-    # Encoder
-    encoderOutputH = stabilize<float>(inputEmbedding, device)
-    futureValueRecurrenceHook = [](const Variable& x) { return FutureValue(x) }
-    for (size_t i = 0 i < num_layers ++i)
-        std::tie(encoderOutputH, encoderOutputC) = LSTMPComponentWithSelfStabilization<float>(encoderOutputH, hidden_dim, hidden_dim, futureValueRecurrenceHook, futureValueRecurrenceHook, device)
-
-    thoughtVectorH = Sequence::First(encoderOutputH)
-    thoughtVectorC = Sequence::First(encoderOutputC)
-
-    thoughtVectorBroadcastH = Sequence::BroadcastAs(thoughtVectorH, labelEmbedding)
-    thoughtVectorBroadcastC = Sequence::BroadcastAs(thoughtVectorC, labelEmbedding)
-
-    /* Decoder */
-    bool addBeamSearchReorderingHook = false
-    beamSearchReorderHook = Constant({ 1, 1 }, 1.0f)
-    decoderHistoryFromGroundTruth = labelEmbedding
-    decoderInput = ElementSelect(is_first_label, label_sentence_startEmbeddedScattered, PastValue(decoderHistoryFromGroundTruth))
-
-    decoderOutputH = Stabilize<float>(decoderInput, device)
-    FunctionPtr decoderOutputC
-    pastValueRecurrenceHookWithBeamSearchReordering = [addBeamSearchReorderingHook, beamSearchReorderHook](const FunctionPtr& operand) {
-        return PastValue(addBeamSearchReorderingHook ? Times(operand, beamSearchReorderHook) : operand)
-    }
-
-    for (size_t i = 0 i < num_layers ++i)
-    {
-        std::function<FunctionPtr(const Variable&)> recurrenceHookH, recurrenceHookC
-        if (i == 0)
-        {
-            recurrenceHookH = pastValueRecurrenceHookWithBeamSearchReordering
-            recurrenceHookC = pastValueRecurrenceHookWithBeamSearchReordering
-        }
-        else
-        {
-            isFirst = Sequence::IsFirst(labelEmbedding)
-            recurrenceHookH = [labelEmbedding, thoughtVectorBroadcastH, isFirst, addBeamSearchReorderingHook, beamSearchReorderHook](const FunctionPtr& operand) {
-                return ElementSelect(isFirst, thoughtVectorBroadcastH, PastValue(addBeamSearchReorderingHook ? Times(operand, beamSearchReorderHook) : operand))
-            }
-
-            recurrenceHookC = [labelEmbedding, thoughtVectorBroadcastC, isFirst, addBeamSearchReorderingHook, beamSearchReorderHook](const FunctionPtr& operand) {
-                return ElementSelect(isFirst, thoughtVectorBroadcastC, PastValue(addBeamSearchReorderingHook ? Times(operand, beamSearchReorderHook) : operand))
-            }
-        }
-
-        std::tie(decoderOutputH, encoderOutputC) = LSTMPComponentWithSelfStabilization<float>(decoderOutputH, hidden_dim, hidden_dim, recurrenceHookH, recurrenceHookC, device)
-    }
-
-    decoderOutput = decoderOutputH
-    decoderDim = hidden_dim
-
-    /* Softmax output layer */
-    outputLayerProjWeights = Parameter(NDArrayView::RandomUniform<float>({ label_vocab_dim, decoderDim }, -0.05, 0.05, 1, device))
-    biasWeights = Parameter({ label_vocab_dim }, 0.0f, device)
-
-    z = Plus(Times(outputLayerProjWeights, Stabilize<float>(decoderOutput, device)), biasWeights, L"classifierOutput")
-    ce = CrossEntropyWithSoftmax(z, label_sequence, L"lossFunction")
-    errs = ClassificationError(z, label_sequence, L"classificationError")
-
-    input_dim = 2000
-    cell_dim = 25
-    hidden_dim = 25
-    embedding_dim = 50
-    num_output_classes = 5
-
-    # Input variables denoting the features and label data
-    features = variable(shape=input_dim, is_sparse=True, name="features")
-    label = variable(num_output_classes, dynamic_axes = [Axis.default_batch_axis()], name="labels")
-
-    # Instantiate the sequence classification model
-    classifier_output = LSTM_sequence_classifer_net(features, num_output_classes, embedding_dim, hidden_dim, cell_dim)
-
-    ce = cross_entropy_with_softmax(classifier_output, label)
-    pe = classification_error(classifier_output, label)
-
-    rel_path = r"../../../../Tests/EndToEndTests/Text/SequenceClassification/Data/Train.ctf"
+    encoder_outputH = stabilize(input_sequence)
+    for i in range(0, num_layers):
+        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(encoder_outputH, hidden_dim, hidden_dim, future_value, future_value)
+
+    thought_vectorH = sequence.first(encoder_outputH)
+    thought_vectorC = sequence.first(encoder_outputC)
+
+    thought_vector_broadcastH = sequence.broadcast_as(thought_vectorH, label_sequence)
+    thought_vector_broadcastC = sequence.broadcast_as(thought_vectorC, label_sequence)
+
+    decoder_history_from_ground_truth = label_sequence
+    decoder_input = element_select(is_first_label, label_sentence_start_scattered, past_value(decoder_history_from_ground_truth))
+
+    decoder_outputH = stabilize(decoder_input)
+    for i in range(0, num_layers):
+        if (i == 0):
+            recurrence_hookH = past_value
+            recurrence_hookC = past_value
+        else:
+            isFirst = sequence.is_first(label_sequence)
+            recurrence_hookH = lambda operand: element_select(isFirst, thought_vector_broadcastH, past_value(operand))
+            recurrence_hookC = lambda operand: element_select(isFirst, thought_vector_broadcastC, past_value(operand))
+
+        (decoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(decoder_outputH, hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC)
+
+    decoder_output = decoder_outputH
+    decoder_dim = hidden_dim
+
+    # Softmax output layer
+    z = linear_layer(stabilize(decoder_output), label_vocab_dim)
+    ce = cross_entropy_with_softmax(z, label_sequence)
+    errs = classification_error(z, label_sequence)
+
+    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
+    feature_stream_name = 'features'
+    labels_stream_name = 'labels'

-    mb_source = text_minibatch_source(path, [ ( 'features', input_dim, True, 'x' ), ( 'labels', num_output_classes, False, 'y' ) ], 0)
-    features_si = mb_source.stream_info(features)
-    labels_si = mb_source.stream_info(label)
+    mb_source = text_format_minibatch_source(path, list([
+        StreamConfiguration( feature_stream_name, input_vocab_dim, True, 'S0' ),
+        StreamConfiguration( labels_stream_name, label_vocab_dim, True, 'S1') ]), 10000)
+    features_si = mb_source.stream_info(feature_stream_name)
+    labels_si = mb_source.stream_info(labels_stream_name)

    # Instantiate the trainer object to drive the model training
-    lr = lr = learning_rates_per_sample(0.0005)
-    trainer = Trainer(classifier_output, ce, pe, [sgdlearner(classifier_output.owner.parameters(), lr)])
+    lr = learning_rates_per_sample(0.007)
+    momentum_time_constant = 1100
+    momentum_per_sample = momentums_per_sample(math.exp(-1.0 / momentum_time_constant))
+    clipping_threshold_per_sample = 2.3
+    gradient_clipping_with_truncation = True
+
+    trainer = Trainer(z, ce, errs, [momentum_sgdlearner(z.owner.parameters(), lr, momentum_per_sample, clipping_threshold_per_sample, gradient_clipping_with_truncation)])

    # Get minibatches of sequences to train with and perform model training
-    minibatch_size = 200
+    minibatch_size = 72
    training_progress_output_freq = 1
    i = 0
    while True:
        mb = mb_source.get_next_minibatch(minibatch_size)
        if len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual minibatch data to be trained with
-        arguments = {features : mb[features_si].m_data, label : mb[labels_si].m_data}
+        arguments = {raw_input : mb[features_si].m_data, raw_labels : mb[labels_si].m_data}
        trainer.train_minibatch(arguments)

-        print_training_progress(training_progress_output_freq, i, trainer)
+        print_training_progress(i, trainer, training_progress_output_freq)

        i += 1

if __name__=='__main__':

    #time.sleep(10)
    # Specify the target device to be used for computing
    target_device = DeviceDescriptor.cpu_device()
    DeviceDescriptor.set_default_device(target_device)

-    train_sequence_classifier()
+    train_sequence_to_sequence_translator()
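The subtlest part of the new example is the decoder recurrence hook: for every layer above the first, the recurrence is seeded with the encoder's thought vector at the first step of each sequence, and with the decoder's own previous output afterwards. Restated as a commented sketch (mirroring the code above, not adding to it):

    # at the first step of each sequence, feed the broadcast thought vector;
    # at every later step, feed the decoder's previous output
    recurrence_hookH = lambda operand: element_select(
        sequence.is_first(label_sequence),  # 1 at the first step, 0 elsewhere
        thought_vector_broadcastH,          # encoder summary, broadcast along the labels
        past_value(operand))                # the previous decoder output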