From ea309aa1eeed6958c7dc2e5ffa7316509eb3dc77 Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Mon, 14 Dec 2015 17:33:58 -0800
Subject: [PATCH 01/49] Replace CreateMatrixIfNull by MarkValueNonSharable()

In the compilation stage, we mark as non-sharable those nodes whose
descendants are all learnable parameters.
---
 .../CompositeComputationNodes.h               |  6 ++--
 .../ComputationNetwork.h                      |  1 +
 .../ComputationNetworkEvaluation.cpp          | 29 +++++++++++++++++++
 .../ComputationNode.h                         | 23 +++++++++++++--
 .../CNTKComputationNetworkLib/EsotericNodes.h |  6 ++--
 .../InputAndParamNodes.h                      |  6 ++--
 .../RecurrentNodes.h                          |  3 +-
 7 files changed, 65 insertions(+), 9 deletions(-)

diff --git a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h
index 6d983a9784fe..8b11e37233e0 100644
--- a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h
@@ -233,7 +233,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         virtual void MarkComputed(const bool hasComputed)
         {
             m_hasComputed = hasComputed;
-            CreateMatrixIfNull(m_value);
+            // CreateMatrixIfNull(m_value);
+            MarkValueNonSharable();
         }
 
         virtual bool RequiresPreCompute() const override { return true; }
@@ -292,7 +293,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // this is for the special case: convertDBN needs this; because we initialize values directly from another well-trained model
         virtual void SideLoadFromMatrix(const Matrix& value)
         {
-            CreateMatrixIfNull(m_value);
+            //CreateMatrixIfNull(m_value);
+            MarkValueNonSharable();
             m_value->SetValue(value);
             m_hasComputed = true;
         }
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
index 45093c903cb2..cc202c83e35b 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
@@ -159,6 +159,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
 private:
     void ValidateNodes(list nodes, bool isFinalValidationPass, size_t & todo);
     void ValidateSubNetwork(const ComputationNodeBasePtr& rootNode);
+    void MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode);
 private:
     void DetermineSetOfAllRoots();
     void CollectInputAndLearnableParameters(const ComputationNodeBasePtr& rootNode);
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetworkEvaluation.cpp b/MachineLearning/CNTKComputationNetworkLib/ComputationNetworkEvaluation.cpp
index f0b8a78dcd82..5baae8553fb4 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetworkEvaluation.cpp
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetworkEvaluation.cpp
@@ -413,6 +413,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         for (auto & node : m_allRoots)
             ValidateSubNetwork(node);
 
+        // STEP: mark non-sharable function values
+        // if all the descendants of a particular node are learnable parameters,
+        // its function value is not sharable
+        for (auto & node : m_allRoots)
+            MarkValueNonSharableNodes(node);
+
+
         // STEP: Optimize the network.
// :) @@ -678,6 +685,28 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } + // mark nodes that are purely induced by parameters as non-sharable and create space for value if null + void ComputationNetwork::MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode) + { + const auto & nodes = GetEvalOrder(rootNode); + for (auto& node : nodes) + { + auto children = node->GetInputs(); + bool allChildrenNonSharable = true; + for (auto& child : children) + { + if (child->isValueSharable()) + { + allChildrenNonSharable = false; + break; + } + } + if (allChildrenNonSharable) + node->MarkValueNonSharable(); + } + + } + #if 0 // prepare to compute with the subnetwork that this rootNode depends on, including // - auto-detecting recurrent loops diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index ab12cdf8707d..55546f00894b 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -246,7 +246,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_deviceId(deviceId), m_outputNeededDuringBackprop(true), m_parameterUpdateRequired(false), m_gradientInitialized(false), m_nodeName(name == L"" ? CreateUniqNodeName() : name), - m_numRows(0), m_numCols(0) + m_numRows(0), m_numCols(0), m_valueSharable(true) { } virtual ~ComputationNodeBase(){} @@ -428,6 +428,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("VerifyNumParallelSequences: value inconsistent with MB layout"); } + bool isValueSharable() + { + return m_valueSharable; + } + virtual void MarkValueNonSharable() + { + m_valueSharable = false; + } protected: public: // ...the following should be protected, but nodes inquire about their children, requiring public access @@ -760,6 +768,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool m_parameterUpdateRequired; // update parameters? Only used for LearnableParameters. --TODO: Should we make this a member of LearnableParameters actually? And require a type cast? Currently it is read out for all leaves. bool m_gradientInitialized; // indicates whether the gradient matrix has been resized and initialized to 0 bool m_outputNeededDuringBackprop; // indicates whether the output value of the node is needed during backprop + + // flags related with sharable values + bool m_valueSharable; // whether value is sharable }; typedef ComputationNodeBase::ComputationNodeBasePtr ComputationNodeBasePtr; @@ -807,7 +818,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Since the dimensions are read as well, this function also updates m_numRows/m_numCols. 
        void LoadValue(File& fstream)
        {
-            CreateMatrixIfNull(m_value);
+            // CreateMatrixIfNull(m_value);
+            MarkValueNonSharable();
             fstream >> Value();
             // above reads dimensions, so we must update our own m_numRows/m_numCols
             m_numRows = Value().GetNumRows();
@@ -1293,6 +1305,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             CreateMatrixIfNull(m_gradient);
         }
 
+        void MarkValueNonSharable() override
+        {
+            m_valueSharable = false;
+            CreateMatrixIfNull(m_value);
+        }
+
+
     protected:
 
         // this function is used to create matrices for those needed before matrix pool is available
diff --git a/MachineLearning/CNTKComputationNetworkLib/EsotericNodes.h b/MachineLearning/CNTKComputationNetworkLib/EsotericNodes.h
index ab4f69592f03..03b5a7c2aaa8 100644
--- a/MachineLearning/CNTKComputationNetworkLib/EsotericNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/EsotericNodes.h
@@ -653,7 +653,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         void Init(size_t row_size, size_t col_size)
         {
-            CreateMatrixIfNull(m_value);
+            // CreateMatrixIfNull(m_value);
+            MarkValueNonSharable();
             SetDims(row_size, col_size);
             UpdateFunctionValuesSize();
         }
@@ -663,7 +664,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             Base(deviceId, name)
         {
             Init(row_size, col_size);
-            CreateMatrixIfNull(m_gradient);
+            //CreateMatrixIfNull(m_gradient);
+            MarkValueNonSharable();
             m_gradient->Resize(row_size, col_size);
             m_gradient->SetValue(0.0f);
         }
diff --git a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h
index 0bcaeeb3a170..dab53fb0a86c 100644
--- a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h
@@ -48,7 +48,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             m_parameterUpdateRequired = true;
             m_sampleLayout = ImageLayoutWHC(1, rows, 1);
             // TODO: Is ^^ this a wise choice? These are often weight matrices, where rows, not columns, are multiplied with input vectors.
-            CreateMatrixIfNull(m_value);
+            //CreateMatrixIfNull(m_value);
+            MarkValueNonSharable();
             SetDims(rows, cols);
             UpdateFunctionValuesSize();         // this allocates the matrix
             Value().SetValue(0);
@@ -235,7 +236,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         void Init(size_t rows, size_t cols, bool isSparse)
         {
             m_isSparse = isSparse;
-            CreateMatrixIfNull(m_value);
+            //CreateMatrixIfNull(m_value);
+            MarkValueNonSharable();
             if (isSparse)
                 ConvertToSparseMatrix();
 
diff --git a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h
index eb940a75ee46..a59d89bff6a6 100644
--- a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h
@@ -90,7 +90,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         {
             m_initialActivationValue = initialActivationValue;
             m_timeStep = 1;
-            CreateMatrixIfNull(m_value);
+            // CreateMatrixIfNull(m_value);
+            MarkValueNonSharable();
             SetDims(row_size, col_size);
             m_isHistoryCarryOverManagedExternally = false;      // used for PairNetworkNode/PastValueNode combination
         }

From 2e49d617d6518f747ed1140b034d2e58b68fb4aa Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Wed, 16 Dec 2015 15:54:08 -0800
Subject: [PATCH 02/49] Revise the implementation of marking value-non-sharable nodes. More to be revised.
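The marking pass implemented by this patch walks the graph in evaluation (topological) order and flags a node's function value as non-sharable exactly when every leaf it depends on is a learnable parameter. A minimal standalone sketch of that propagation, using a simplified stand-in Node type (the fields below are illustrative, not the actual ComputationNode interface) and memoizing on the node itself instead of in the name-keyed map the patch uses:

    #include <memory>
    #include <vector>

    struct Node
    {
        std::vector<std::shared_ptr<Node>> inputs; // empty for leaf nodes
        bool isParameter = false;                  // true for learnable-parameter leaves
        bool valueSharable = true;                 // the flag this pass computes
        bool allLeavesAreParameters = false;       // memoized intermediate result
    };

    // 'nodes' must be in evaluation order, i.e. every node appears after its inputs.
    void MarkValueNonSharableNodes(const std::vector<std::shared_ptr<Node>>& nodes)
    {
        for (const auto& node : nodes)
        {
            bool allParams = true;
            if (node->inputs.empty())
                allParams = node->isParameter;      // leaf: parameter vs. input/feature
                // (in the real code, input/parameter/precompute leaves are already
                // marked non-sharable separately)
            else
                for (const auto& in : node->inputs) // interior node: AND over all inputs
                    allParams = allParams && in->allLeavesAreParameters;
            node->allLeavesAreParameters = allParams;
            // a value induced purely by parameters stays constant across minibatches,
            // so its buffer must not be recycled through the matrix pool
            node->valueSharable = !allParams;
        }
    }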
--- .../ComputationNetworkEvaluation.cpp | 47 ++++++++++++++++--- .../ComputationNetworkLib/ComputationNode.h | 24 +++++----- .../InputAndParamNodes.h | 2 + 3 files changed, 54 insertions(+), 19 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index 5baae8553fb4..b983cd489825 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -15,6 +15,7 @@ #include #include #include +#include using namespace std; @@ -689,20 +690,52 @@ namespace Microsoft { namespace MSR { namespace CNTK { void ComputationNetwork::MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode) { const auto & nodes = GetEvalOrder(rootNode); + std::map allLeafDescendentsAreParameters; for (auto& node : nodes) { auto children = node->GetInputs(); - bool allChildrenNonSharable = true; - for (auto& child : children) + wstring myname = node->NodeName(); + bool allParameters = true; + + if (children.size()) // we don't do the check for leaf node, cause all the possible leaf nodes (input/parameters/precompute node) are marked as non-sharable already { - if (child->isValueSharable()) + for (auto child : children) { - allChildrenNonSharable = false; - break; + wstring ChildName = child->NodeName(); + if (allLeafDescendentsAreParameters.find(ChildName) == allLeafDescendentsAreParameters.end()) + { + // not found, means it is a leaf node (we are at eval order ) + assert(child->IsLeaf()); + if (node->isLearnableParameter()) + { + allLeafDescendentsAreParameters[ChildName] = true; + } + else + { + allParameters = false; + allLeafDescendentsAreParameters[ChildName] = false; + break; + } + } + else + { + if (allLeafDescendentsAreParameters[ChildName] == false) + { + allParameters = false; + break; + } + } + } + allLeafDescendentsAreParameters[myname] = allParameters; + if (allParameters) + { + node->MarkValueNonSharable(); + } + else + { + node->MarkValueSharable(); } } - if (allChildrenNonSharable) - node->MarkValueNonSharable(); } } diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 31154b634f4e..81d1b640a5b8 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -148,7 +148,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { friend class ComputationNetwork; ComputationNetworkOwnedNodeState() : - m_needsGradient(false) + m_needsGradient(false), m_valueSharable(true) { PurgeStateForFormingRecurrentLoops(); m_isPartOfLoop = false; @@ -163,10 +163,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool IsPartOfLoop() const { return m_isPartOfLoop; } + virtual void MarkValueNonSharable(){ m_valueSharable = false; } + virtual void MarkValueSharable() { m_valueSharable = true; } + bool isValueSharable() { return m_valueSharable; } + protected: // TODO: should be fully encapsulated here bool m_needsGradient; // true if this node or any children need a gradient to be computed (for own consumption or propagation to somewhere in the child tree) + bool m_valueSharable; // a flag is needed for memory share. 
+ // If it is false (e.g., learnableParameters/InputValue and those nodes are solely induced by learnableParameters), + // it will never be released to memory pool private: bool m_isPartOfLoop; // true if this loop is part of a recurrent loop @@ -247,7 +254,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_deviceId(deviceId), m_outputNeededDuringBackprop(true), m_parameterUpdateRequired(false), m_gradientInitialized(false), m_nodeName(name == L"" ? CreateUniqNodeName() : name), - m_numRows(0), m_numCols(0), m_valueSharable(true) + m_numRows(0), m_numCols(0) { } virtual ~ComputationNodeBase(){} @@ -429,14 +436,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("VerifyNumParallelSequences: value inconsistent with MB layout"); } - bool isValueSharable() - { - return m_valueSharable; - } - virtual void MarkValueNonSharable() - { - m_valueSharable = false; - } + // sometimes, it is necessary to know whether it is a particular node (e.g., learnable parameter) + virtual bool isLearnableParameter() const { return false; } + protected: public: // ...the following should be protected, but nodes inquire about their children, requiring public access @@ -770,8 +772,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool m_gradientInitialized; // indicates whether the gradient matrix has been resized and initialized to 0 bool m_outputNeededDuringBackprop; // indicates whether the output value of the node is needed during backprop - // flags related with sharable values - bool m_valueSharable; // whether value is sharable }; typedef ComputationNodeBase::ComputationNodeBasePtr ComputationNodeBasePtr; diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index dab53fb0a86c..d2e0f8039763 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -180,6 +180,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { PrintNodeValuesToFile(printValues, fstream); } + + virtual bool isLearnableParameter()const override{ return true; } }; #if 0 From 8eae46b59df4973d508f5ac3eaa8dd434b6f0a35 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Thu, 17 Dec 2015 17:44:22 -0800 Subject: [PATCH 03/49] Fix MarkValueNotSharableNodes --- .../ComputationNetwork.h | 2 +- .../ComputationNetworkEvaluation.cpp | 90 +++++++++---------- .../ComputationNetworkLib/ComputationNode.h | 6 +- .../InputAndParamNodes.h | 1 - 4 files changed, 48 insertions(+), 51 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index cc202c83e35b..5b3a0f16adf5 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -159,7 +159,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb private: void ValidateNodes(list nodes, bool isFinalValidationPass, size_t & todo); void ValidateSubNetwork(const ComputationNodeBasePtr& rootNode); - void MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode); + void MarkValueNonSharableNodes(); private: void DetermineSetOfAllRoots(); void CollectInputAndLearnableParameters(const ComputationNodeBasePtr& rootNode); diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index b983cd489825..f8b0b0cabd8c 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ 
b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -10,6 +10,7 @@ #include "ComputationNode.h" #include "ComputationNetwork.h" #include "RecurrentNodes.h" +#include "InputAndParamNodes.h" #include #include #include @@ -414,13 +415,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (auto & node : m_allRoots) ValidateSubNetwork(node); - // STEP: mark non-sharable function values - // if all the descendants of a particular node are learnable parameters, - // its function value is not sharable - for (auto & node : m_allRoots) - MarkValueNonSharableNodes(node); - - // STEP: Optimize the network. // :) @@ -686,11 +680,48 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } +#if 0 + // prepare to compute with the subnetwork that this rootNode depends on, including + // - auto-detecting recurrent loops + // - collect input and learnable nodes + // - calling Validate() on all nodes lazily, which sizes all matrices (column dimensions get updated to MB size) + // Done lazily, called for every minibatch's invocation of EvaluateNode(), but memoizing which nodes were done already. + // BUGBUG? Lazy triggers on the root node. I.e. for two different root nodes (training, eval), it validates twice. + void ComputationNetwork::BuildAndValidateSubNetwork(const ComputationNodeBasePtr rootNode) + { + bool inserted = m_built.insert(rootNode).second; // remember we built it + if (!inserted) + return; // already done + + // detect recurrent loops for this root node + // TODO: not nice--why not always call this in ValidateSubNetwork() only? + FormRecurrentLoops(rootNode); + + // for the m_inputValues and m_learnableParameters sets for this rootNode + CollectInputAndLearnableParameters(rootNode); + + // validate the rootNode and all nodes it depends on, in evaluation order + ValidateSubNetwork(rootNode); + } + + // tests whether BuildAndValidateSubNetwork() was called + bool ComputationNetwork::BuiltAndValidatedSubNetwork(const ComputationNodeBasePtr & rootNode) + { + return m_built.find(rootNode) != m_built.end(); + } +#endif + + // ----------------------------------------------------------------------- + // memory allocation + // ----------------------------------------------------------------------- // mark nodes that are purely induced by parameters as non-sharable and create space for value if null - void ComputationNetwork::MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode) + void ComputationNetwork::MarkValueNonSharableNodes() { - const auto & nodes = GetEvalOrder(rootNode); + const auto & nodes = GetEvalOrder(nullptr); std::map allLeafDescendentsAreParameters; + std::list allLearnableParameters = GetNodesWithType(OperationNameOf(LearnableParameter)); + // note that: we cannot use m_learnableParameters because we need all parameters node, regardless whether it requires update or not + for (auto& node : nodes) { auto children = node->GetInputs(); @@ -706,7 +737,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { // not found, means it is a leaf node (we are at eval order ) assert(child->IsLeaf()); - if (node->isLearnableParameter()) + if (std::find(allLearnableParameters.begin(), allLearnableParameters.end(), child)!= allLearnableParameters.end()) { allLeafDescendentsAreParameters[ChildName] = true; } @@ -740,40 +771,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { } -#if 0 - // prepare to compute with the subnetwork that this rootNode depends on, including - // - auto-detecting recurrent loops - // - collect input and learnable nodes - 
// - calling Validate() on all nodes lazily, which sizes all matrices (column dimensions get updated to MB size) - // Done lazily, called for every minibatch's invocation of EvaluateNode(), but memoizing which nodes were done already. - // BUGBUG? Lazy triggers on the root node. I.e. for two different root nodes (training, eval), it validates twice. - void ComputationNetwork::BuildAndValidateSubNetwork(const ComputationNodeBasePtr rootNode) - { - bool inserted = m_built.insert(rootNode).second; // remember we built it - if (!inserted) - return; // already done - - // detect recurrent loops for this root node - // TODO: not nice--why not always call this in ValidateSubNetwork() only? - FormRecurrentLoops(rootNode); - - // for the m_inputValues and m_learnableParameters sets for this rootNode - CollectInputAndLearnableParameters(rootNode); - - // validate the rootNode and all nodes it depends on, in evaluation order - ValidateSubNetwork(rootNode); - } - - // tests whether BuildAndValidateSubNetwork() was called - bool ComputationNetwork::BuiltAndValidatedSubNetwork(const ComputationNodeBasePtr & rootNode) - { - return m_built.find(rootNode) != m_built.end(); - } -#endif - - // ----------------------------------------------------------------------- - // memory allocation - // ----------------------------------------------------------------------- // this function will need to be called before actual validation and execution to // predetermine how to share matrices to reduce memory usage. @@ -788,9 +785,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { VerifyIsCompiled("AllocateAllMatrices"); + // Due to special topology, if a node is solely induced by parameters, its function value should not be shared + MarkValueNonSharableNodes(); + bool performingBackPropagation = (trainRootNode != nullptr); - // Create a composite Eval order with the specfied nodes as roots + // Create a composite Eval order with the specified nodes as roots std::vector forwardPropRoots; forwardPropRoots.insert(forwardPropRoots.end(), evalRootNodes.begin(), evalRootNodes.end()); forwardPropRoots.insert(forwardPropRoots.end(), outValueRootNodes.begin(), outValueRootNodes.end()); diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 81d1b640a5b8..45ceacb7ffd3 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -165,7 +165,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void MarkValueNonSharable(){ m_valueSharable = false; } virtual void MarkValueSharable() { m_valueSharable = true; } - bool isValueSharable() { return m_valueSharable; } + bool isValueSharable() const { return m_valueSharable; } protected: // TODO: should be fully encapsulated here @@ -436,8 +436,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("VerifyNumParallelSequences: value inconsistent with MB layout"); } - // sometimes, it is necessary to know whether it is a particular node (e.g., learnable parameter) - virtual bool isLearnableParameter() const { return false; } protected: public: // ...the following should be protected, but nodes inquire about their children, requiring public access @@ -518,7 +516,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void SetOutputNeededDuringBackprop(bool f) { m_outputNeededDuringBackprop = f; } bool IsOutputNeededDuringBackprop() const { - return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop; + return 
!g_shareNodeValueMatrices || m_outputNeededDuringBackprop || !isValueSharable(); } virtual void /*IComputationNode::*/InferImageDimsFromInputs() diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index d2e0f8039763..eb4b2f1bf3fe 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -181,7 +181,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { PrintNodeValuesToFile(printValues, fstream); } - virtual bool isLearnableParameter()const override{ return true; } }; #if 0 From 88be36aaf0ae753afdba65346c69c8f237b7aee1 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Fri, 18 Dec 2015 14:21:21 -0800 Subject: [PATCH 04/49] Revise the condition of ReleaseMatricesAfterForwardProp: only ValueSharable nodes can be released after forwardprop --- Source/ComputationNetworkLib/ComputationNode.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 45ceacb7ffd3..9314bc88abea 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -516,7 +516,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void SetOutputNeededDuringBackprop(bool f) { m_outputNeededDuringBackprop = f; } bool IsOutputNeededDuringBackprop() const { - return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop || !isValueSharable(); + return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop ; } virtual void /*IComputationNode::*/InferImageDimsFromInputs() @@ -905,7 +905,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { //don't release matrices that need to be used in the gradient computation virtual void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool) { - if (!IsOutputNeededDuringBackprop() && (m_value->GetMatrixType() != SPARSE)) + if (!IsOutputNeededDuringBackprop() && (m_value->GetMatrixType() != SPARSE) && isValueSharable()) ReleaseMatrixToPool(m_value, matrixPool); } From 63eca285983e016fda12131e8a05854a35dec497 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Fri, 18 Dec 2015 23:32:59 -0800 Subject: [PATCH 05/49] Fix a bug in MarkValueSharableNode --- Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp | 2 +- Source/ComputationNetworkLib/ComputationNode.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index f8b0b0cabd8c..3b8f95064282 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -736,7 +736,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (allLeafDescendentsAreParameters.find(ChildName) == allLeafDescendentsAreParameters.end()) { // not found, means it is a leaf node (we are at eval order ) - assert(child->IsLeaf()); + assert(child->IsLeaf() || child->IsPartOfLoop()); if (std::find(allLearnableParameters.begin(), allLearnableParameters.end(), child)!= allLearnableParameters.end()) { allLeafDescendentsAreParameters[ChildName] = true; diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 9314bc88abea..f2eb7747ed0f 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -934,7 +934,7 @@ namespace 
Microsoft { namespace MSR { namespace CNTK { // Release the Value matrix only if the output value is needed during backprop // since in the case it isn't used, we release it during forward prop itself - if (IsOutputNeededDuringBackprop() && m_value->GetMatrixType() != SPARSE) + if (IsOutputNeededDuringBackprop() && m_value->GetMatrixType() != SPARSE && isValueSharable()) ReleaseMatrixToPool(m_value, matrixPool); } } From 0005c81a76dfc8d428d31dbb8f10ec3267ec8014 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Sat, 19 Dec 2015 00:18:13 -0800 Subject: [PATCH 06/49] Add an alternate option "numSubminibatches" for users to indicate how to split minibatches into subminibatches. --- Source/SGDLib/SGD.cpp | 25 ++++++++++++++++++------- Source/SGDLib/SGD.h | 6 +++++- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index 27998d7d6bd0..e574c93ec51c 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -764,13 +764,20 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Sub-minibatching is used if a single minibatch is too large to fit into GPU RAM. DataReaderHelpers::SubminibatchDispatcher smbDispatcher; size_t numSubminibatchesNeeded = 0; - if (m_maxSamplesInRAM < SIZE_MAX) // user-specified maximum number of samples that fit into GPU RAM; or 0 if not enabled + if (m_maxSamplesInRAM < SIZE_MAX || m_numSubminiBatches > 1) // user-specified maximum number of samples that fit into GPU RAM; or 0 if not enabled { - // into how many pieces would we need to break the minibatch? - // TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed. - size_t numParallelSequences = trainSetDataReader->GetNumParallelSequences(); - size_t estimatedMBSize = tunedMBSize * numParallelSequences; - numSubminibatchesNeeded = (size_t)std::ceil((float)estimatedMBSize / m_maxSamplesInRAM); + if (m_maxSamplesInRAM < SIZE_MAX) + { + // into how many pieces would we need to break the minibatch? + // TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed. + size_t numParallelSequences = trainSetDataReader->GetNumParallelSequences(); + size_t estimatedMBSize = tunedMBSize * numParallelSequences; + numSubminibatchesNeeded = (size_t)std::ceil((float)estimatedMBSize / m_maxSamplesInRAM); + } + if (m_numSubminiBatches > 1) + { + numSubminibatchesNeeded = m_numSubminiBatches; + } } // this is non-trivial, we need a manager object to handle this if (numSubminibatchesNeeded > 1) @@ -800,7 +807,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { } if (numSubminibatchesNeeded > 1) { - fprintf(stderr, ", with maximum %d samples in RAM", (int)m_maxSamplesInRAM); + if (m_maxSamplesInRAM < SIZE_MAX) + fprintf(stderr, ", with maximum %d samples in RAM", (int)m_maxSamplesInRAM); + else + fprintf(stderr, ", with %d subminibatch", (int)numSubminibatchesNeeded); } fprintf(stderr, ".\n"); @@ -2484,6 +2494,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_mbSize = configSGD(L"minibatchSize", ConfigRecordType::Array(intargvector(vector{ 256 }))); m_truncated = configSGD(L"truncated", false); m_maxSamplesInRAM = configSGD(L"maxSamplesInRAM", (size_t)SIZE_MAX); + m_numSubminiBatches = configSGD(L"numSubminibatches", (size_t)1); // the number of samples in each epoch (0 means, use all the samples in each epoch). 
    m_epochSize = configSGD(L"epochSize", (size_t)0);
diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h
index 15143dfa0208..b99608b500f8 100644
--- a/Source/SGDLib/SGD.h
+++ b/Source/SGDLib/SGD.h
@@ -157,7 +157,11 @@ struct SGDParams : public ScriptableObjects::Object
     // To mitigate this issue, we adopt the sub-minibatch implementation, where
     // each m_mbSize[epoch] is divided by a few sub-minibatch of which size will be no more than m_maxSamplesInRAM
     // a forward-backward is performed for each sub-minibathch; a model update is performed after each minibatch
-
+    size_t m_numSubminiBatches;
+    // alternative method to specify how to split minibatches into subminibatches
+    // default is 1, which means no subminibatch is used
+    // if m_maxTempMemSizeInSamples = SIZE_MAX (which means users do not specify the option) and m_numSubminiBatches > 1
+    // we divide one minibatch into m_numSubminiBatches subMinibatches
     // the number of samples in each epoch (0 means, use all the samples in each epoch).
     size_t m_epochSize;

From cba311ed72cd0876c9dca31cdc2880e1b6e79d1f Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Mon, 21 Dec 2015 21:56:33 -0800
Subject: [PATCH 07/49] Display CUB and CUDNN paths (if defined) in BuildInfo

Print BuildInfo at the very beginning of the program; this is convenient
for checking the build type.
---
 Source/CNTK/CNTK.cpp      |  8 ++++++--
 Source/CNTK/prebuild.bat  | 10 ++++++++++
 Tools/generate_build_info |  3 +++
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/Source/CNTK/CNTK.cpp b/Source/CNTK/CNTK.cpp
index f2bd706bc773..d75d956e9c60 100644
--- a/Source/CNTK/CNTK.cpp
+++ b/Source/CNTK/CNTK.cpp
@@ -1667,6 +1667,9 @@ void PrintBuiltInfo()
 #ifdef _CUB_PATH_
     fprintf(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
 #endif
+#ifdef _CUDNN_PATH_
+    fprintf(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
+#endif
 #ifdef _GIT_EXIST
     fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
     fprintf(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
@@ -1885,7 +1888,7 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])   // called from wmain which i
         RedirectStdErr(logpath);
     }
 
-    PrintBuiltInfo();
+    PrintBuiltInfo(); // this one goes to log file
     std::string timestamp = TimeDateStamp();
 
     //dump config info
@@ -1960,10 +1963,11 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])   // called from wmain which i
 // main wrapper that catches C++ exceptions and prints them
 // ---------------------------------------------------------------------------
 
-int wmain1(int argc, wchar_t* argv[])   // called from wmain which is a wrapper that catches & repots Win32 exceptions
+int wmain1(int argc, wchar_t* argv[])   // called from wmain which is a wrapper that catches & reports Win32 exceptions
 {
     try
     {
+        PrintBuiltInfo(); // print build info directly in case that user provides zero argument (convenient for checking build type)
         if (argc <= 1)
             InvalidArgument("No command-line argument given.");
         // detect legacy CNTK configuration
diff --git a/Source/CNTK/prebuild.bat b/Source/CNTK/prebuild.bat
index 9f841d104da7..12631cf52e37 100644
--- a/Source/CNTK/prebuild.bat
+++ b/Source/CNTK/prebuild.bat
@@ -33,6 +33,16 @@ if "%cuda_path%" == "" (
     echo #define _CUDA_PATH_ "%cuda_path:\=\\%" >> buildinfo.h$$
     )
 
+if not "%cudnn_path%" == "" (
+    echo #define _CUDNN_PATH_ "%cudnn_path:\=\\%" >> buildinfo.h$$
+    )
+
+if not "%cub_path%" == "" (
+    echo #define _CUB_PATH_ "%cub_path:\=\\%" >> buildinfo.h$$
+    )
+
+
+
 echo #endif >> buildinfo.h$$
 
 ::: update file only if it changed (otherwise CNTK.cpp will get rebuilt each time)
diff --git a/Tools/generate_build_info b/Tools/generate_build_info
index a155fc84e792..62686222ef33 100755
--- a/Tools/generate_build_info
+++ b/Tools/generate_build_info
@@ -56,6 +56,9 @@ makebuildinfo()
     if [ ! -z "$CUB_PATH" ]; then
         printf "#define _CUB_PATH_ \"%s\"\n" $CUB_PATH >> $target
     fi
+    if [ ! -z "$CUDNN_PATH" ]; then
+        printf "#define _CUDNN_PATH_ \"%s\"\n" $CUDNN_PATH >> $target
+    fi
     printf "#define _BUILDTYPE_ \"%s\"\n" $BUILDTYPE >> $target
     printf "#endif\n" >> $target
 }

From 762a5dd80f780037c47245167e23021d7b07d807 Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Mon, 21 Dec 2015 23:20:13 -0800
Subject: [PATCH 08/49] Move MarkValueNonSharable out of constructors (make gcc happy)
---
 Source/ComputationNetworkLib/CompositeComputationNodes.h | 6 ++----
 Source/ComputationNetworkLib/ComputationNode.h           | 3 +--
 Source/ComputationNetworkLib/EsotericNodes.h             | 6 ++----
 Source/ComputationNetworkLib/InputAndParamNodes.h        | 4 +---
 Source/ComputationNetworkLib/RecurrentNodes.h            | 3 +--
 5 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/Source/ComputationNetworkLib/CompositeComputationNodes.h b/Source/ComputationNetworkLib/CompositeComputationNodes.h
index 8b11e37233e0..6d983a9784fe 100644
--- a/Source/ComputationNetworkLib/CompositeComputationNodes.h
+++ b/Source/ComputationNetworkLib/CompositeComputationNodes.h
@@ -233,8 +233,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         virtual void MarkComputed(const bool hasComputed)
         {
             m_hasComputed = hasComputed;
-            // CreateMatrixIfNull(m_value);
-            MarkValueNonSharable();
+            CreateMatrixIfNull(m_value);
         }
 
         virtual bool RequiresPreCompute() const override { return true; }
@@ -292,8 +292,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // this is for the special case: convertDBN needs this; because we initialize values directly from another well-trained model
         virtual void SideLoadFromMatrix(const Matrix& value)
         {
-            //CreateMatrixIfNull(m_value);
-            MarkValueNonSharable();
+            CreateMatrixIfNull(m_value);
             m_value->SetValue(value);
             m_hasComputed = true;
         }
diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h
index 4e59773afbbf..62cc38bb56f9 100644
--- a/Source/ComputationNetworkLib/ComputationNode.h
+++ b/Source/ComputationNetworkLib/ComputationNode.h
@@ -804,8 +804,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // Since the dimensions are read as well, this function also updates m_numRows/m_numCols.
void LoadValue(File& fstream) { - // CreateMatrixIfNull(m_value); - MarkValueNonSharable(); + CreateMatrixIfNull(m_value); fstream >> Value(); // above reads dimensions, so we must update our own m_numRows/m_numCols m_numRows = Value().GetNumRows(); diff --git a/Source/ComputationNetworkLib/EsotericNodes.h b/Source/ComputationNetworkLib/EsotericNodes.h index 9e7c7517dc34..85b6ca8da8e4 100644 --- a/Source/ComputationNetworkLib/EsotericNodes.h +++ b/Source/ComputationNetworkLib/EsotericNodes.h @@ -653,8 +653,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void Init(size_t row_size, size_t col_size) { - // CreateMatrixIfNull(m_value); - MarkValueNonSharable(); + CreateMatrixIfNull(m_value); SetDims(row_size, col_size); UpdateFunctionValuesSize(); } @@ -664,8 +663,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { Base(deviceId, name) { Init(row_size, col_size); - //CreateMatrixIfNull(m_gradient); - MarkValueNonSharable(); + CreateMatrixIfNull(m_gradient); m_gradient->Resize(row_size, col_size); m_gradient->SetValue(0.0f); } diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index eb4b2f1bf3fe..778f68a28924 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -48,8 +48,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_parameterUpdateRequired = true; m_sampleLayout = ImageLayoutWHC(1, rows, 1); // TODO: Is ^^ this a wise choice? These are often weight matrices, where rows, not columns, are multiplied with input vectors. - //CreateMatrixIfNull(m_value); - MarkValueNonSharable(); + CreateMatrixIfNull(m_value); SetDims(rows, cols); UpdateFunctionValuesSize(); // this allocates the matrix Value().SetValue(0); @@ -237,7 +236,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { void Init(size_t rows, size_t cols, bool isSparse) { m_isSparse = isSparse; - //CreateMatrixIfNull(m_value); MarkValueNonSharable(); if (isSparse) ConvertToSparseMatrix(); diff --git a/Source/ComputationNetworkLib/RecurrentNodes.h b/Source/ComputationNetworkLib/RecurrentNodes.h index e97a79e70c68..844bf237e7ff 100644 --- a/Source/ComputationNetworkLib/RecurrentNodes.h +++ b/Source/ComputationNetworkLib/RecurrentNodes.h @@ -90,8 +90,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { m_initialActivationValue = initialActivationValue; m_timeStep = 1; - // CreateMatrixIfNull(m_value); - MarkValueNonSharable(); + CreateMatrixIfNull(m_value); SetDims(row_size, col_size); m_isHistoryCarryOverManagedExternally = false; // used for PairNetworkNode/PastValueNode combination } From 1ca02625785add9f621463620424029c16cb50d3 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Tue, 22 Dec 2015 00:19:22 -0800 Subject: [PATCH 09/49] (further remove MarkValueNotSharable out of constructor) --- Source/ComputationNetworkLib/InputAndParamNodes.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 778f68a28924..8948d8b36d03 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -236,7 +236,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { void Init(size_t rows, size_t cols, bool isSparse) { m_isSparse = isSparse; - MarkValueNonSharable(); if (isSparse) ConvertToSparseMatrix(); From 067bc561d72c49eea9ce6b7ef652763bbbc041b3 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Tue, 22 Dec 2015 
12:18:21 -0800
Subject: [PATCH 10/49] (Fix a bug in MarkValueSharable)
---
 Source/ComputationNetworkLib/InputAndParamNodes.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h
index 8948d8b36d03..125e572a22fc 100644
--- a/Source/ComputationNetworkLib/InputAndParamNodes.h
+++ b/Source/ComputationNetworkLib/InputAndParamNodes.h
@@ -40,12 +40,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             Base(deviceId, name)
         {
             m_parameterUpdateRequired = true;
+            m_valueSharable = false;
             m_sampleLayout = ImageLayoutWHC(1, SIZE_MAX, 1);
         }
         LearnableParameter(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols) :
             Base(deviceId, name)
         {
             m_parameterUpdateRequired = true;
+            m_valueSharable = false;
             m_sampleLayout = ImageLayoutWHC(1, rows, 1);
             // TODO: Is ^^ this a wise choice? These are often weight matrices, where rows, not columns, are multiplied with input vectors.
             CreateMatrixIfNull(m_value);
@@ -236,12 +238,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         void Init(size_t rows, size_t cols, bool isSparse)
         {
             m_isSparse = isSparse;
+            CreateMatrixIfNull(m_value);
             if (isSparse)
                 ConvertToSparseMatrix();
 
             SetDims(rows, cols);
             UpdateFunctionValuesSize();         // we must allocate the matrix so that the readers get objects with valid row dimensions (some readers expect that)
             m_parameterUpdateRequired = false;
+            m_valueSharable = false;
         }
     protected:
         InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, bool isSparse) :

From 3f987c03fd5342dee38a3b728bf466bec0b193b7 Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Mon, 28 Dec 2015 15:51:08 -0800
Subject: [PATCH 11/49] Add an option "prefixPathInToc" in HTKMLFReader

This option allows specifying data paths relative to those in the TOC
files.
---
 Source/Common/Include/latticearchive.h       | 23 +++++++++++++-----
 Source/Common/Include/latticesource.h        | 11 ++++++++--
 Source/Readers/HTKMLFReader/HTKMLFReader.cpp |  7 ++++--
 3 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/Source/Common/Include/latticearchive.h b/Source/Common/Include/latticearchive.h
index d1411396d372..f034c11ced9d 100644
--- a/Source/Common/Include/latticearchive.h
+++ b/Source/Common/Include/latticearchive.h
@@ -1016,6 +1016,8 @@ class archive
     // set of lattice archive files referenced
     // Note that .toc files can be concatenated, i.e. one .toc file can reference multiple archive files.
std::vector archivepaths; // [archiveindex] -> archive path + std::wstring prefixPathInToc; // prefix path in a toc; using this to avoid pushd some path before start training + mutable int verbosity; size_t getarchiveindex (const std::wstring & path) // get index of a path in archivepaths[]; create new entry if needed { auto iter = std::find (archivepaths.begin(), archivepaths.end(), path); @@ -1042,7 +1044,8 @@ class archive { // need to read the map and establish the mapping // get the symlist file const std::wstring symlistpath = archivepaths[archiveindex] + L".symlist"; - fprintf (stderr, "getcachedidmap: reading '%S'\n", symlistpath.c_str()); + if (verbosity>0) + fprintf (stderr, "getcachedidmap: reading '%S'\n", symlistpath.c_str()); std::vector textbuffer; auto lines = msra::files::fgetfilelines (symlistpath, textbuffer); // establish mapping of each entry to the corresponding id in 'symmap'; this should fail if the symbol is not found @@ -1092,19 +1095,25 @@ class archive public: // construct = open the archive //archive() : currentarchiveindex (SIZE_MAX) {} - + void setverbosity(int veb) const + { + verbosity = veb; + } // test if this object is loaded with anything (if not, an empty set of TOC paths was passed--meaning disable lattice mode) bool empty() const { return archivepaths.empty(); } // construct from a list of TOC files - archive (const std::vector & tocpaths, const std::unordered_map & modelsymmap) : currentarchiveindex (SIZE_MAX), modelsymmap (modelsymmap) + archive (const std::vector & tocpaths, const std::unordered_map & modelsymmap, const std::wstring prefixPath=L"") + : currentarchiveindex(SIZE_MAX), modelsymmap(modelsymmap), prefixPathInToc(prefixPath), verbosity(0) { if (tocpaths.empty()) // nothing to read--keep silent return; fprintf (stderr, "archive: opening %d lattice-archive TOC files ('%S' etc.)..", (int)tocpaths.size(), tocpaths[0].c_str()); + size_t onepercentage = tocpaths.size() / 100 ? tocpaths.size()/100 : 1; foreach_index (i, tocpaths) { - fprintf (stderr, "."); + if ( (i % onepercentage) == 0) + fprintf (stderr, "."); open (tocpaths[i]); } fprintf (stderr, " %d total lattices referenced in %d archive files\n", (int)toc.size(), (int)archivepaths.size()); @@ -1135,7 +1144,11 @@ class archive RuntimeError("open: invalid TOC line (no [): %s", line); if (q != p) { - const std::wstring archivepath = msra::strfun::utf16 (std::string (p, q - p)); + std::wstring archivepath = msra::strfun::utf16 (std::string (p, q - p)); + if (!prefixPathInToc.empty()) + { + archivepath = prefixPathInToc + L"/" + archivepath; + } // TODO: should we allow paths relative to TOC file? 
archiveindex = getarchiveindex (archivepath); } diff --git a/Source/Common/Include/latticesource.h b/Source/Common/Include/latticesource.h index fcf046b68908..0ec12508e9ca 100644 --- a/Source/Common/Include/latticesource.h +++ b/Source/Common/Include/latticesource.h @@ -23,10 +23,11 @@ class latticepair : public std::pair,std::vector> latticetocs, const std::unordered_map & modelsymmap) - : numlattices (latticetocs.first, modelsymmap), denlattices (latticetocs.second, modelsymmap) {} + latticesource (std::pair,std::vector> latticetocs, const std::unordered_map & modelsymmap, std::wstring RootPathInToc) + : numlattices (latticetocs.first, modelsymmap, RootPathInToc), denlattices (latticetocs.second, modelsymmap, RootPathInToc), verbosity(0) {} bool empty() const { @@ -52,6 +53,12 @@ class latticesource denlattices.getlattice (key, LP->second, expectedframes); // this loads the lattice from disk, using the existing L.second object L = LP; } + + void setverbosity(int veb) + { + verbosity = veb; + numlattices.setverbosity(veb); denlattices.setverbosity(veb); + } }; }} \ No newline at end of file diff --git a/Source/Readers/HTKMLFReader/HTKMLFReader.cpp b/Source/Readers/HTKMLFReader/HTKMLFReader.cpp index 0db717a998f2..344883db85be 100644 --- a/Source/Readers/HTKMLFReader/HTKMLFReader.cpp +++ b/Source/Readers/HTKMLFReader/HTKMLFReader.cpp @@ -100,6 +100,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { vector scriptpaths; vector RootPathInScripts; + wstring RootPathInLatticeTocs; vector mlfpaths; vector>mlfpathsmulti; size_t firstfilesonly = SIZE_MAX; // set to a lower value for testing @@ -263,7 +264,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { expand_wildcards(thisLattice(L"numLatTocFile"), paths); latticetocs.first.insert(latticetocs.first.end(), paths.begin(), paths.end()); } - + RootPathInLatticeTocs = thisLattice(L"prefixPathInToc",L""); } //get HMM related file names @@ -448,7 +449,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (!_wcsicmp(readMethod.c_str(), L"blockRandomize")) { // construct all the parameters we don't need, but need to be passed to the constructor... - m_lattices.reset(new msra::dbn::latticesource(latticetocs, m_hset.getsymmap())); + + m_lattices.reset(new msra::dbn::latticesource(latticetocs, m_hset.getsymmap(), RootPathInLatticeTocs)); + m_lattices->setverbosity(m_verbosity); // now get the frame source. 
This has better randomization and doesn't create temp files m_frameSource.reset(new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, *m_lattices, m_latticeMap, m_frameMode)); From edae2da54d99542657ea7f948d48c5b6bd8f85e6 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Mon, 28 Dec 2015 16:44:23 -0800 Subject: [PATCH 12/49] Make lattice stats printf controlled by trace in READER --- Source/Common/Include/latticearchive.h | 30 +++++++++++++------ .../parallelforwardbackward.cpp | 4 +-- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/Source/Common/Include/latticearchive.h b/Source/Common/Include/latticearchive.h index f034c11ced9d..ca489ad5cd35 100644 --- a/Source/Common/Include/latticearchive.h +++ b/Source/Common/Include/latticearchive.h @@ -51,6 +51,7 @@ enum mbrclassdefinition // used to identify definition of class in minimum b // =========================================================================== class lattice { + mutable int verbosity; struct header_v1_v2 { size_t numnodes : 32; @@ -567,11 +568,13 @@ class lattice std::vector backptroffsets; // TODO: we could change this to 'unsigned int' to save some transfer time std::vector backptrstorage; // CPU-side versions use this as the traceback buffer; CUDA code has its CUDA-side buffer size_t numofstates; // per sil hmm + int verbosity; public: - backpointers (const lattice & L, const msra::asr::simplesenonehmm & hset) : numofstates(0) + backpointers (const lattice & L, const msra::asr::simplesenonehmm & hset, int verbosity=0) : numofstates(0) { size_t edgeswithsilence = 0; // (diagnostics only: number of edges with at least one /sil/) size_t backptrbufsize = 0; // number of entries in buffer for silence backpointer array, used as cursor as we build it + backptroffsets.resize (L.edges.size() + 1); // +1, so that the final entry determines the overall size of the allocated buffer const size_t silUnitId = hset.gethmmid ("sil"); numofstates = hset.gethmm (silUnitId).getnumstates(); @@ -595,15 +598,18 @@ class lattice #if 1 // multiple /sil/ -> log this (as we are not sure whether this is actually proper--probably it is) if (numsilunits > 1) { - fprintf (stderr, "backpointers: lattice '%S', edge %d has %d /sil/ phonemes\n", L.getkey(), j, (int)numsilunits); - fprintf (stderr, "alignments: :"); - foreach_index (a, aligntokens) + if (verbosity) { - const auto & unit = aligntokens[a]; - const auto & hmm = hset.gethmm (unit.unit); - fprintf (stderr, "%s,%.2f:", hmm.getname(), unit.frames / 100.0f); + fprintf(stderr, "backpointers: lattice '%S', edge %d has %d /sil/ phonemes\n", L.getkey(), j, (int)numsilunits); + fprintf(stderr, "alignments: :"); + foreach_index(a, aligntokens) + { + const auto & unit = aligntokens[a]; + const auto & hmm = hset.gethmm(unit.unit); + fprintf(stderr, "%s,%.2f:", hmm.getname(), unit.frames / 100.0f); + } + fprintf(stderr, "\n"); } - fprintf (stderr, "\n"); } #endif if (numsilunits > 0) @@ -611,7 +617,8 @@ class lattice backptrbufsize += maxsilframes * numofstates; } backptroffsets[L.edges.size()] = backptrbufsize; // (TODO: remove if not actually needed) - fprintf (stderr, "backpointers: %.1f%% edges have at least one /sil/ unit inside\n", 100.0f * ((float) edgeswithsilence / L.edges.size())); + if (verbosity) + fprintf (stderr, "backpointers: %.1f%% edges have at least one /sil/ unit inside\n", 100.0f * ((float) edgeswithsilence / L.edges.size())); } // CUDA support const std::vector & getbackptroffsets() 
const { return backptroffsets; }
@@ -1002,6 +1009,10 @@ class lattice
     std::wstring key;           // (keep our own name (key) so we can identify ourselves for diagnostics messages)
     const wchar_t * getkey() const { return key.c_str(); }
+
+    void setverbosity(int veb) const{
+        verbosity = veb;
+    }
 };
 
 // ===========================================================================
@@ -1220,6 +1231,7 @@ class archive
         fsetpos (f, offset);
         // get it
         L.fread (f, idmap, spunit);
+        L.setverbosity(verbosity);
 #ifdef HACK_IN_SILENCE  // hack to simulate DEL in the lattice
         const size_t silunit = getid (modelsymmap, "sil");
         const bool addsp = true;
diff --git a/Source/SequenceTrainingLib/parallelforwardbackward.cpp b/Source/SequenceTrainingLib/parallelforwardbackward.cpp
index 3fb27b59fba1..bc4baaad9d8f 100644
--- a/Source/SequenceTrainingLib/parallelforwardbackward.cpp
+++ b/Source/SequenceTrainingLib/parallelforwardbackward.cpp
@@ -743,8 +743,8 @@ namespace msra { namespace lattices {
         double totalfwscore = 0.0f;
         if (!parallelstate->emulation)
         {
-
-            fprintf(stderr, "parallelforwardbackwardlattice: %d launches for forward, %d launches for backward\n", (int)batchsizeforward.size(), (int)batchsizebackward.size());
+            if (verbosity>=2)
+                fprintf(stderr, "parallelforwardbackwardlattice: %d launches for forward, %d launches for backward\n", (int)batchsizeforward.size(), (int)batchsizebackward.size());
 
             const bool allocateframescorrect = (returnEframescorrect || boostingfactor != 0.0f);
             const bool copyuids = (returnEframescorrect || boostingfactor != 0.0f);

From e3757f0054b59a907bdace1fe8140a8ee0749c21 Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Mon, 4 Jan 2016 12:26:53 -0800
Subject: [PATCH 13/49] A stopgap to prevent the reader from loading matrices inconsistent with lattices

Will be removed once the bug is fixed.
---
 Source/Readers/HTKMLFReader/HTKMLFReader.cpp | 94 ++++++++++++--------
 Source/Readers/HTKMLFReader/HTKMLFReader.h   |  3 +
 2 files changed, 59 insertions(+), 38 deletions(-)

diff --git a/Source/Readers/HTKMLFReader/HTKMLFReader.cpp b/Source/Readers/HTKMLFReader/HTKMLFReader.cpp
index 344883db85be..239409bb6528 100644
--- a/Source/Readers/HTKMLFReader/HTKMLFReader.cpp
+++ b/Source/Readers/HTKMLFReader/HTKMLFReader.cpp
@@ -944,6 +944,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         {
             if (!skip)
             {
+                // a stopgap
+                if (m_numFramesToProcess[i] > 0 && m_latticeBufferMultiUtt[i] && m_latticeBufferMultiUtt[i]->getnumframes() != m_numFramesToProcess[i])
+                {
+                    // BUGBUG: we just found that (due to some bugs yet to be tracked down),
+                    // the filled number of frames is inconsistent with the number frames in lattices (though it rarely occurs)
+                    // This is just a stopgap, to be removed after the bugs are found and fixed
+                    bool needRenew = true;
+                    while (needRenew)
+                    {
+                        size_t framenum = m_numFramesToProcess[i];
+                        fprintf(stderr, "WARNING: mismatched number of frames filled in the reader: %d in data vs %d in lattices. Ignoring this utterance %ls\n",
+                            framenum, m_latticeBufferMultiUtt[i]->getnumframes(), m_latticeBufferMultiUtt[i]->getkey().c_str());
+                        ReNewBufferForMultiIO(i);
+                        needRenew = m_numFramesToProcess[i] > 0 && m_latticeBufferMultiUtt[i] && m_latticeBufferMultiUtt[i]->getnumframes() != m_numFramesToProcess[i];
+                    }
+
+                }
                 m_numValidFrames[i] = m_numFramesToProcess[i];
                 if (m_numValidFrames[i] > 0)
                 {
@@ -975,49 +992,50 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             m_extraNumSeqs = 0;
             if (!m_frameMode)
             {
-                // insert extra utterances to parallel sequences that have enough space left
-                // As long as there is a gap at the end of any parallel sequence that is large enough for another utterance, fill it in.
-                size_t nextMinibatchUttnum = 0;
-                bool inserted;
-                // The next utterances have already been prepared under parallel-sequence indices [i], in prep for the next MB.
-                // For each, we will go through all parallel sequences [j] to see whether the entry currently held for the next [i] fits into [j].
-                for (size_t i = 0; i < m_numSeqsPerMB; i++)
+                for (size_t src = 0; src < m_numSeqsPerMB; )
                 {
-                    while (nextMinibatchUttnum <= i)
+                    size_t framenum = m_numFramesToProcess[src];
+                    if (framenum == 0)
                     {
-                        size_t framenum = m_numFramesToProcess[i];
-                        inserted = false;
-                        if (framenum > 0)   // non-empty entry: see were it fits
-                        {
-                            // greedily search for a parallel sequence with enough space at the end to insert this utterance
-                            for (size_t j = 0; j < m_numSeqsPerMB; j++)
+                        src++;
+                        continue;
+                    }
+                    if (m_latticeBufferMultiUtt[src]!=nullptr && m_latticeBufferMultiUtt[src]->getnumframes()!=framenum)
+                    {
+                        // BUGBUG: we just found that (due to some bugs yet to be tracked down),
+                        // the filled number of frames is inconsistent with the number frames in lattices (though it rarely occurs)
+                        // This is just a stopgap, to be removed after the bugs are found and fixed
+                        fprintf(stderr, "WARNING: mismatched number of frames filled in the reader: %d in data vs %d in lattices. Ignoring this utterance %ls\n",
+                            framenum, m_latticeBufferMultiUtt[src]->getnumframes(), m_latticeBufferMultiUtt[src]->getkey().c_str());
+                        src++;
+                        continue;
+                    }
+
+                    bool slotFound = false;
+                    for (size_t des = 0; des < m_numSeqsPerMB; des++)   // try to find a slot
+                    {
+                        if (framenum + m_numValidFrames[des] < m_mbNumTimeSteps)
+                        {   // found!
+                            m_extraSeqsPerMB.push_back(des);
+                            if (m_latticeBufferMultiUtt[src] != nullptr)
                             {
-                                if (framenum + m_numValidFrames[j] < m_mbNumTimeSteps)
-                                {
-                                    // enough space: insert it as parallel sequence [j] (instead of [i] in the next MB)
-                                    m_extraSeqsPerMB.push_back(j);
-                                    if (m_latticeBufferMultiUtt[i] != nullptr)
-                                    {
-                                        m_extraLatticeBufferMultiUtt.push_back(m_latticeBufferMultiUtt[i]);
-                                        m_extraLabelsIDBufferMultiUtt.push_back(m_labelsIDBufferMultiUtt[i]);
-                                        m_extraPhoneboundaryIDBufferMultiUtt.push_back(m_phoneboundaryIDBufferMultiUtt[i]);
-                                    }
-                                    fillOneUttDataforParallelmode(matrices, m_numValidFrames[j], framenum, j, i);
-                                    m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, j, m_numValidFrames[j], m_numValidFrames[j] + framenum);
-
-                                    // consume it
-                                    ReNewBufferForMultiIO(i);   // replace current [i] with a new one; then try again with this new one at [i]
-                                    m_numValidFrames[j] += framenum;
-                                    m_extraNumSeqs++;
-                                    inserted = true;
-                                    break;
-                                }
+                                m_extraLatticeBufferMultiUtt.push_back(m_latticeBufferMultiUtt[src]);
+                                m_extraLabelsIDBufferMultiUtt.push_back(m_labelsIDBufferMultiUtt[src]);
+                                m_extraPhoneboundaryIDBufferMultiUtt.push_back(m_phoneboundaryIDBufferMultiUtt[src]);
                             }
+                            fillOneUttDataforParallelmode(matrices, m_numValidFrames[des], framenum, des, src);
+                            m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, des, m_numValidFrames[des], m_numValidFrames[des] + framenum);
+
+                            ReNewBufferForMultiIO(src);
+                            m_numValidFrames[des] += framenum;
+                            m_extraNumSeqs++;
+                            slotFound = true;
+                            break;
                         }
-                        if (!inserted)
-                        {
-                            nextMinibatchUttnum++;  // didn't fit anywhere: done with entry [i]
-                        }
+                    }
+                    if (!slotFound)
+                    {
+                        src++;  // done with this source; try next source
                     }
                 }
 
diff --git a/Source/Readers/HTKMLFReader/HTKMLFReader.h b/Source/Readers/HTKMLFReader/HTKMLFReader.h
index fd6015c28383..7e64ee3e8f5d 100644
--- a/Source/Readers/HTKMLFReader/HTKMLFReader.h
+++ b/Source/Readers/HTKMLFReader/HTKMLFReader.h
@@ -32,6 +32,9 @@ class HTKMLFReader : public IDataReader
     intargvector m_numSeqsPerMBForAllEpochs;
     size_t m_numSeqsPerMB;                  // requested number of parallel sequences
     size_t m_mbNumTimeSteps;                // number of time steps to fill/filled (note: for frame randomization, this the #frames, and not 1 as later reported)
+    size_t m_mbMaxNumTimeSteps;             // max time steps we take in a MB layout; any sentence longer than this max will be discarded (and a warning will be issued)
+    // this is used to prevent CUDA out-of-memory errors
+
     vector m_numFramesToProcess;            // [seq index] number of frames available (left to return) in each parallel sequence
     vector m_switchFrame;                   /// TODO: something like the position where a new sequence starts; still supported?
     vector m_numValidFrames;                // [seq index] valid #frames in each parallel sequence. Frames (s, t) with t >= m_numValidFrames[s] are NoInput.
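The rewritten loop in this patch is a greedy first-fit packer: each pending utterance is appended to the first parallel sequence with enough free frames left, and sources that do not fit (or whose lattices mismatch) are skipped. A minimal sketch of just the placement policy, with plain integer lengths standing in for the reader's per-stream buffers (all names here are illustrative, not the reader's actual interface):

    #include <cstdio>
    #include <vector>

    // Greedily pack utterance lengths into 'numStreams' parallel sequences of
    // capacity 'maxT' frames; returns the lengths assigned to each stream.
    std::vector<std::vector<int>> PackUtterances(const std::vector<int>& lengths,
                                                 int numStreams, int maxT)
    {
        std::vector<std::vector<int>> streams(numStreams);
        std::vector<int> used(numStreams, 0);
        for (int len : lengths)
        {
            for (int s = 0; s < numStreams; s++)
            {
                if (used[s] + len < maxT) // strict '<', matching the check above
                {
                    streams[s].push_back(len);
                    used[s] += len;
                    break; // first fit wins; an unplaced utterance is simply left
                           // out here (the reader keeps it for the next minibatch)
                }
            }
        }
        return streams;
    }

    int main()
    {
        auto packed = PackUtterances({ 30, 50, 20, 40 }, /*numStreams=*/2, /*maxT=*/100);
        for (size_t s = 0; s < packed.size(); s++)
            for (int len : packed[s])
                std::printf("stream %d: utterance of %d frames\n", (int)s, len);
    }

First-fit is a deliberately simple heuristic: it does not minimize padding, but it never reorders utterances, which keeps the streaming reader's buffers consistent.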
From 8f7f19333c7fdb2c38429fb94c1ab6f951e21866 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Fri, 8 Jan 2016 14:23:36 -0800 Subject: [PATCH 14/49] (make gcc happy) --- Source/Common/Include/Sequences.h | 2 +- Source/ComputationNetworkLib/InputAndParamNodes.h | 6 +++--- Source/Readers/HTKMLFReader/HTKMLFReader.cpp | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index 0b9824e84233..9b92e71d61bf 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -90,7 +90,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // construction // ------------------------------------------------------------------- - MBLayout(size_t numParallelSequences, size_t numTimeSteps) : m_distanceToStart(CPUDEVICE), m_distanceToEnd(CPUDEVICE) { Init(numParallelSequences, numTimeSteps); } + MBLayout(size_t numParallelSequences, size_t numTimeSteps) : m_columnsValidityMask(CPUDEVICE), m_distanceToStart(CPUDEVICE), m_distanceToEnd(CPUDEVICE) { Init(numParallelSequences, numTimeSteps); } MBLayout() : MBLayout(1, 0) { } // copy the content of another MBLayoutPtr over diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 125e572a22fc..4e86ebbb72e9 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -40,14 +40,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { Base(deviceId, name) { m_parameterUpdateRequired = true; - m_valueSharable = false; + this->m_valueSharable = false; m_sampleLayout = ImageLayoutWHC(1, SIZE_MAX, 1); } LearnableParameter(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols) : Base(deviceId, name) { m_parameterUpdateRequired = true; - m_valueSharable = false; + this->m_valueSharable = false; m_sampleLayout = ImageLayoutWHC(1, rows, 1); // TODO: Is ^^ this a wise choice? These are often weight matrices, where rows, not columns, are multiplied with input vectors. CreateMatrixIfNull(m_value); @@ -245,7 +245,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { SetDims(rows, cols); UpdateFunctionValuesSize(); // we must allocate the matrix so that the readers get objects with valid row dimensions (some readers expect that) m_parameterUpdateRequired = false; - m_valueSharable = false; + this->m_valueSharable = false; } protected: InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, bool isSparse) : diff --git a/Source/Readers/HTKMLFReader/HTKMLFReader.cpp b/Source/Readers/HTKMLFReader/HTKMLFReader.cpp index 239409bb6528..ecc6283f615d 100644 --- a/Source/Readers/HTKMLFReader/HTKMLFReader.cpp +++ b/Source/Readers/HTKMLFReader/HTKMLFReader.cpp @@ -264,7 +264,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { expand_wildcards(thisLattice(L"numLatTocFile"), paths); latticetocs.first.insert(latticetocs.first.end(), paths.begin(), paths.end()); } - RootPathInLatticeTocs = thisLattice(L"prefixPathInToc",L""); + RootPathInLatticeTocs =(wstring) thisLattice(L"prefixPathInToc",L""); } //get HMM related file names @@ -955,7 +955,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { size_t framenum = m_numFramesToProcess[i]; fprintf(stderr, "WARNING: mismatched number of frames filled in the reader: %d in data vs %d in lattices. 
Ignoring this utterance %ls\n", - framenum, m_latticeBufferMultiUtt[i]->getnumframes(), m_latticeBufferMultiUtt[i]->getkey().c_str()); + (int)framenum, (int)m_latticeBufferMultiUtt[i]->getnumframes(), m_latticeBufferMultiUtt[i]->getkey().c_str()); ReNewBufferForMultiIO(i); needRenew = m_numFramesToProcess[i] > 0 && m_latticeBufferMultiUtt[i] && m_latticeBufferMultiUtt[i]->getnumframes() != m_numFramesToProcess[i]; } @@ -1006,7 +1006,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // the filled number of frames is inconsistent with the number frames in lattices (though it rarely occurs) // This is just a stopgap, to be removed after the bugs are found and fixed fprintf(stderr, "WARNING: mismatched number of frames filled in the reader: %d in data vs %d in lattices. Ignoring this utterance %ls\n", - framenum, m_latticeBufferMultiUtt[src]->getnumframes(), m_latticeBufferMultiUtt[src]->getkey().c_str()); + (int)framenum, (int)m_latticeBufferMultiUtt[src]->getnumframes(), m_latticeBufferMultiUtt[src]->getkey().c_str()); src++; continue; } From fc3361438fc81add21110cf38182241073c302a6 Mon Sep 17 00:00:00 2001 From: RuiZhao Date: Fri, 18 Dec 2015 15:17:48 -0800 Subject: [PATCH 15/49] frameskip SE --- Source/Math/latticefunctionskernels.h | 25 +++++++++++++++++-- Source/SequenceTrainingLib/gammacalculation.h | 4 +-- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/Source/Math/latticefunctionskernels.h b/Source/Math/latticefunctionskernels.h index 876e3c6a867e..5053b271690b 100644 --- a/Source/Math/latticefunctionskernels.h +++ b/Source/Math/latticefunctionskernels.h @@ -358,6 +358,7 @@ struct latticefunctionskernels size_t state1step0to1 = te; // inflection point from state 0 to 1, record in state 1 size_t state2step0to1 = te; // inflection point from state 0 to 1, record in state 2 size_t state2step1to2 = te; // inflection point from state 1 to 2, record in state 2 + size_t state2step0to2 = te; //now we only support transition from -1 to 0 or 2 for sil float pathscore0 = fwscore ; // log pp in state 0 @@ -400,16 +401,20 @@ struct latticefunctionskernels pathscore2 = pathscore12; state2step0to1 = state1step0to1; // record the inflection point state2step1to2 = t; // record the inflection point + state2step0to2 = te; if (isSil) backptrmatrix (2, t-ts-1) = 1; } - if (isSil) // only silence have path from 0 to 2 + //if (isSil) // only silence have path from 0 to 2 { const float pathscore02 = pathscore0 + getlogtransp(transP,0,2); // log pp from state 0 to 2 if (pathscore02 >= pathscore2) // if state 0->2 { pathscore2 = pathscore02; - backptrmatrix (2, t-ts-1) = 0; + if (isSil) + backptrmatrix (2, t-ts-1) = 0; + state2step0to2 = t; + state2step1to2 = te; } } @@ -494,6 +499,21 @@ struct latticefunctionskernels // emit alignment if (!isSil) + { + if (state2step0to2 < te) + { + state2step0to2 += alignindex - ts; + for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment + { + size_t senoneid; + if (t < state2step0to2) // in state 0 + senoneid = senoneid0; + else // in state 2 + senoneid = senoneid2; + alignresult[t] = (unsigned short)senoneid; + } + } + else { state2step0to1 += alignindex - ts; // convert to align measure state2step1to2 += alignindex - ts; @@ -509,6 +529,7 @@ struct latticefunctionskernels alignresult[t] = (unsigned short) senoneid; } } + } else // for silence { size_t lastpointer = 2; diff --git a/Source/SequenceTrainingLib/gammacalculation.h b/Source/SequenceTrainingLib/gammacalculation.h index 4ad7d8f46c63..f8a60e5bd478 100644 
--- a/Source/SequenceTrainingLib/gammacalculation.h +++ b/Source/SequenceTrainingLib/gammacalculation.h @@ -19,9 +19,9 @@ namespace msra { namespace lattices { GammaCalculation() : cpumode(false) { initialmark = false; - lmf = 14.0f; // Note that 9 was best for Fisher --these should best be configurable + lmf = 7.0f; // Note that 9 was best for Fisher --these should best be configurable wp = 0.0f; - amf = 14.0f; + amf = 7.0f; boostmmifactor = 0.0f; seqsMBRmode = false; } From ef84011f1619a8b0be24a3debc7d43d312c74100 Mon Sep 17 00:00:00 2001 From: RuiZhao Date: Wed, 23 Dec 2015 11:39:39 -0800 Subject: [PATCH 16/49] SE frameskip V2 temp --- Source/Math/latticefunctionskernels.h | 65 ++++++++++++------- .../latticeforwardbackward.cpp | 3 +- 2 files changed, 42 insertions(+), 26 deletions(-) diff --git a/Source/Math/latticefunctionskernels.h b/Source/Math/latticefunctionskernels.h index 5053b271690b..ec164960e1b5 100644 --- a/Source/Math/latticefunctionskernels.h +++ b/Source/Math/latticefunctionskernels.h @@ -356,27 +356,39 @@ struct latticefunctionskernels const size_t te = ts + numframes; // end time of current unit size_t state1step0to1 = te; // inflection point from state 0 to 1, record in state 1 + size_t state1stepm1to1 = te; size_t state2step0to1 = te; // inflection point from state 0 to 1, record in state 2 + size_t state2stepm1to1 = te; // inflection point from state 0 to 1, record in state 2 size_t state2step1to2 = te; // inflection point from state 1 to 2, record in state 2 size_t state2step0to2 = te; //now we only support transition from -1 to 0 or 2 for sil - float pathscore0 = fwscore ; // log pp in state 0 - float pathscore1 = LOGZERO; // log pp in state 1 - float pathscore2 = LOGZERO; // log pp in state 2 - if(isSil) - pathscore2 = fwscore; + float pathscore0 = fwscore; // log pp in state 0 + float pathscore1 = fwscore; // log pp in state 1 + float pathscore2 = fwscore; // log pp in state 2 + + // first frame if (ts != te) // for t = ts, initialization { - if (isSil) //for sil, -1 to 2 and -1 to 0 is permitted + /* if (isSil) //for sil, -1 to 2 and -1 to 0 is permitted { pathscore0 += getlogtransp(transP,-1,0) + logLLs(senoneid0,ts); pathscore2 += getlogtransp(transP,-1,2) + logLLs(senoneid2,ts); } - else //for others, only -1 to 0 is permitted - pathscore0 += logLLs(senoneid0,ts); // Note: no need to incorporate LLs for state [1] and [2] because the path log LLs are LOGZERO anyway + else //for others, only -1 to 0 is permitted + { + pathscore0 += getlogtransp(transP, -1, 0) + logLLs(senoneid0, ts); + pathscore1 += getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts); + + }*/ + pathscore2 = getlogtransp(transP, -1, 2) + logLLs(senoneid2, ts); + pathscore1 = getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts); + state1stepm1to1 = ts; + pathscore0 = getlogtransp(transP, -1, 0) + logLLs(senoneid0, ts); + + } @@ -400,6 +412,7 @@ struct latticefunctionskernels { pathscore2 = pathscore12; state2step0to1 = state1step0to1; // record the inflection point + state2stepm1to1 = state1stepm1to1; state2step1to2 = t; // record the inflection point state2step0to2 = te; if (isSil) @@ -427,9 +440,11 @@ struct latticefunctionskernels { pathscore1 = pathscore01; state1step0to1 = t; // record the inflection point + state1stepm1to1 = te; if (isSil) backptrmatrix (1, t-ts-1) = 0; } + if (isSil) // only silence have path from 2 to 1 { const float pathscore21 = pathscore2last + getlogtransp(transP,2,1); @@ -500,7 +515,7 @@ struct latticefunctionskernels if (!isSil) { - if (state2step0to2 < te) + if 
(state2step0to2 < te) //from 0 to 2 { state2step0to2 += alignindex - ts; for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment @@ -513,22 +528,22 @@ struct latticefunctionskernels alignresult[t] = (unsigned short)senoneid; } } - else - { - state2step0to1 += alignindex - ts; // convert to align measure - state2step1to2 += alignindex - ts; - for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment - { - size_t senoneid; - if (t < state2step0to1) // in state 0 - senoneid = senoneid0; - else if(t < state2step1to2) // in state 1 - senoneid = senoneid1; - else // in state 2 - senoneid = senoneid2; - alignresult[t] = (unsigned short) senoneid; - } - } + else //from 1 to 2 + { + state2step0to1 += alignindex - ts; // convert to align measure + state2step1to2 += alignindex - ts; + for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment + { + size_t senoneid; + if (state2step0to1 < te && t < state2step0to1) + senoneid = senoneid0; + else if(t < state2step1to2) // in state 1 + senoneid = senoneid1; + else // in state 2 + senoneid = senoneid2; + alignresult[t] = (unsigned short) senoneid; + } + } } else // for silence { diff --git a/Source/SequenceTrainingLib/latticeforwardbackward.cpp b/Source/SequenceTrainingLib/latticeforwardbackward.cpp index 0f55d5ba0ff9..942f6adffa8d 100644 --- a/Source/SequenceTrainingLib/latticeforwardbackward.cpp +++ b/Source/SequenceTrainingLib/latticeforwardbackward.cpp @@ -438,6 +438,7 @@ template static bool islogzero (FLOAT v) { return v < LOGZERO/2; LogicError("invalid backpointer resulting in state index out of range"); int bp = (int) backpointers(j,t); // save the backpointer before overwriting it (gammas and backpointers are aliases of each other) + thisedgealignmentsj[t] = (unsigned short)hmm.getsenoneid(j - js); if (!returnsenoneids) // return binary gammas (for MMI; this mode is compatible with softalignmode) for (size_t i = js; i < je; i++) loggammas(i,t) = ((int) i == j) ? 
0.0f : LOGZERO; @@ -784,7 +785,7 @@ void lattice::forwardbackwardalign (parallelstate & parallelstate, // - per-edge acoustic scores const size_t silunitid = hset.gethmmid("sil"); // shall be the same as parallelstate.getsilunitid() bool parallelsil = true; - bool cpuverification = false; + bool cpuverification = true; #ifndef PARALLEL_SIL // we use a define to make this marked parallelsil = false; From 1ad6c2d4192d702309b982d858f7f91d86be9a04 Mon Sep 17 00:00:00 2001 From: RuiZhao Date: Thu, 24 Dec 2015 13:48:21 -0800 Subject: [PATCH 17/49] frameskipv2 --- Source/Math/latticefunctionskernels.h | 8 ++++---- Source/SequenceTrainingLib/latticeforwardbackward.cpp | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Source/Math/latticefunctionskernels.h b/Source/Math/latticefunctionskernels.h index ec164960e1b5..e30f35582236 100644 --- a/Source/Math/latticefunctionskernels.h +++ b/Source/Math/latticefunctionskernels.h @@ -383,10 +383,10 @@ struct latticefunctionskernels pathscore1 += getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts); }*/ - pathscore2 = getlogtransp(transP, -1, 2) + logLLs(senoneid2, ts); - pathscore1 = getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts); + pathscore2 += getlogtransp(transP, -1, 2) + logLLs(senoneid2, ts); + pathscore1 += getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts); state1stepm1to1 = ts; - pathscore0 = getlogtransp(transP, -1, 0) + logLLs(senoneid0, ts); + pathscore0 += getlogtransp(transP, -1, 0) + logLLs(senoneid0, ts); } @@ -535,7 +535,7 @@ struct latticefunctionskernels for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment { size_t senoneid; - if (state2step0to1 < te && t < state2step0to1) + if (state2step0to1 static bool islogzero (FLOAT v) { return v < LOGZERO/2; LogicError("invalid backpointer resulting in state index out of range"); int bp = (int) backpointers(j,t); // save the backpointer before overwriting it (gammas and backpointers are aliases of each other) - thisedgealignmentsj[t] = (unsigned short)hmm.getsenoneid(j - js); + //thisedgealignmentsj[t] = (unsigned short)hmm.getsenoneid(j - js); if (!returnsenoneids) // return binary gammas (for MMI; this mode is compatible with softalignmode) for (size_t i = js; i < je; i++) loggammas(i,t) = ((int) i == j) ? 
0.0f : LOGZERO;
@@ -785,7 +785,7 @@ void lattice::forwardbackwardalign (parallelstate & parallelstate,
     // - per-edge acoustic scores
     const size_t silunitid = hset.gethmmid("sil");    // shall be the same as parallelstate.getsilunitid()
     bool parallelsil = true;
-    bool cpuverification = true;
+    bool cpuverification = false;

 #ifndef PARALLEL_SIL    // we use a define to make this marked
     parallelsil = false;

From a1427cc3cfc026c1171bc4020896ca1e8279411b Mon Sep 17 00:00:00 2001
From: RuiZhao
Date: Tue, 29 Dec 2015 15:00:09 -0800
Subject: [PATCH 18/49] release temp matrix in SE

---
 .../TrainingCriterionNodes.h | 17 +++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/Source/ComputationNetworkLib/TrainingCriterionNodes.h b/Source/ComputationNetworkLib/TrainingCriterionNodes.h
index a34a92c66c04..361e57472f4b 100644
--- a/Source/ComputationNetworkLib/TrainingCriterionNodes.h
+++ b/Source/ComputationNetworkLib/TrainingCriterionNodes.h
@@ -1292,8 +1292,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             }
             else if (inputIndex == 1)
             {
-                BackpropToRight(*m_softmaxOfRight, Input(0)->Value(), Input(inputIndex)->Gradient(),
-                                Gradient(), *m_gammaFromLattice, m_fsSmoothingWeight, m_frameDropThreshold);
+                FrameRange fr(Input(0)->GetMBLayout());
+                BackpropToRight(*m_softmaxOfRight, Input(0)->Value(), Input(inputIndex)->Gradient(),
+                                Gradient(), *m_gammaFromLattice, m_fsSmoothingWeight, m_frameDropThreshold);
+                MaskMissingColumnsToZero(Input(inputIndex)->Gradient(), Input(0)->GetMBLayout(), fr);
+
 #ifdef _DEBUG
                 Input(inputIndex)->InvalidateMissingGradientColumns(FrameRange(Input(inputIndex)->GetMBLayout()));
 #endif
@@ -1433,6 +1436,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             RequestMatrixFromPool(m_softmaxOfRight, matrixPool);
             RequestMatrixFromPool(m_gammaFromLattice, matrixPool);
         }
+
+        // release the temporary matrices used for the gamma calculation
+        virtual void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool)
+        {
+            Base::ReleaseMatricesAfterForwardProp(matrixPool);
+            ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool);
+            ReleaseMatrixToPool(m_softmaxOfRight, matrixPool);
+            ReleaseMatrixToPool(m_gammaFromLattice, matrixPool);
+        }
+
         // TODO: method names should be CamelCase
         std::vector> * getLatticePtr()
         {

From 5499741fde4789a11bd93f148b2f37b955f52b48 Mon Sep 17 00:00:00 2001
From: RuiZhao
Date: Wed, 6 Jan 2016 16:48:10 -0800
Subject: [PATCH 19/49] release after BP

---
 Source/ComputationNetworkLib/TrainingCriterionNodes.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Source/ComputationNetworkLib/TrainingCriterionNodes.h b/Source/ComputationNetworkLib/TrainingCriterionNodes.h
index 361e57472f4b..07190640cb70 100644
--- a/Source/ComputationNetworkLib/TrainingCriterionNodes.h
+++ b/Source/ComputationNetworkLib/TrainingCriterionNodes.h
@@ -1438,7 +1438,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }

         // release the temporary matrices used for the gamma calculation
-        virtual void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool)
+        virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
         {
-            Base::ReleaseMatricesAfterForwardProp(matrixPool);
+            Base::ReleaseMatricesAfterBackprop(matrixPool);
             ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool);

From c41eafa4f8c09ec6d9dad91be350dd65d7598464 Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Sat, 9 Jan 2016 23:16:45 -0800
Subject: [PATCH 20/49] Move all data members in MBLayout to CPU

---
 Source/Common/Include/Sequences.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git
a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index 0b9824e84233..9b92e71d61bf 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -90,7 +90,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // construction // ------------------------------------------------------------------- - MBLayout(size_t numParallelSequences, size_t numTimeSteps) : m_distanceToStart(CPUDEVICE), m_distanceToEnd(CPUDEVICE) { Init(numParallelSequences, numTimeSteps); } + MBLayout(size_t numParallelSequences, size_t numTimeSteps) : m_columnsValidityMask(CPUDEVICE), m_distanceToStart(CPUDEVICE), m_distanceToEnd(CPUDEVICE) { Init(numParallelSequences, numTimeSteps); } MBLayout() : MBLayout(1, 0) { } // copy the content of another MBLayoutPtr over From 05e0262bf1f305d9fb19da9082acdc8ba8307f08 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Sat, 9 Jan 2016 23:17:51 -0800 Subject: [PATCH 21/49] Bug fix for ConvertDBN command --- Source/CNTK/SimpleNetworkBuilder.cpp | 8 ++++---- Source/ComputationNetworkLib/CompositeComputationNodes.h | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Source/CNTK/SimpleNetworkBuilder.cpp b/Source/CNTK/SimpleNetworkBuilder.cpp index 726c4a86220d..4e1ff488722f 100644 --- a/Source/CNTK/SimpleNetworkBuilder.cpp +++ b/Source/CNTK/SimpleNetworkBuilder.cpp @@ -2419,9 +2419,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix priorVals = ReadMatrixFromDbnFile(fstream, std::string("Pu")); assert(priorVals.GetNumCols() == 1 && priorVals.GetNumRows() == m_outputLayerSize); - w = builder.Mean(label, L"Prior"); - static_pointer_cast>(w)->SideLoadFromMatrix(priorVals); - w->SetParameterUpdateRequired(false); + prior = builder.Mean(label, L"Prior"); + static_pointer_cast>(prior)->SideLoadFromMatrix(priorVals); + prior->SetParameterUpdateRequired(false); } else // pretrained network - need to add output layer, initalize { @@ -2461,7 +2461,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (layerType == "perceptron" || m_needPrior) { - input = builder.Log(pcNodePtr, L"LogOfPrior"); + input = builder.Log(prior, L"LogOfPrior"); //following two lines is needed only if true probability is needed //output = builder.Softmax(output); diff --git a/Source/ComputationNetworkLib/CompositeComputationNodes.h b/Source/ComputationNetworkLib/CompositeComputationNodes.h index 6d983a9784fe..9bdd6f38ce56 100644 --- a/Source/ComputationNetworkLib/CompositeComputationNodes.h +++ b/Source/ComputationNetworkLib/CompositeComputationNodes.h @@ -295,6 +295,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { CreateMatrixIfNull(m_value); m_value->SetValue(value); m_hasComputed = true; + SetDims(value.GetNumRows(), value.GetNumCols()); } public: bool m_hasComputed; From 569a4d6c21bed92b688910eab853df5daf295f18 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Mon, 11 Jan 2016 16:32:19 -0800 Subject: [PATCH 22/49] Add support for revising batch normalization property in MEL. Now a BatchNormalization node's eval mode can be modified by SetProperty(BNnode, batchNormEvalMode, true); or by SetPropertyForSubTree(rootNode, batchNormEvalMode, true); in which all the BN nodes under rootNode will be changed. 
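Because the network holds type-erased ComputationNodeBase pointers, the implementation below has to probe both element-type instantiations of the node before it can flip the flag. A condensed sketch of that probe-both-instantiations pattern, using standalone toy types (not the actual CNTK class hierarchy):

    #include <memory>
    #include <stdexcept>

    struct NodeBase { virtual ~NodeBase() = default; };

    template <class ElemType>
    struct BatchNormNode : NodeBase
    {
        void SetEvalMode(bool evalMode) { m_eval = evalMode; }
        bool m_eval = false;
    };

    // Try the float instantiation first, then double; anything else is an error.
    void SetBatchNormEvalMode(const std::shared_ptr<NodeBase>& node, bool evalMode)
    {
        if (auto pf = std::dynamic_pointer_cast<BatchNormNode<float>>(node))
            pf->SetEvalMode(evalMode);
        else if (auto pd = std::dynamic_pointer_cast<BatchNormNode<double>>(node))
            pd->SetEvalMode(evalMode);
        else
            throw std::runtime_error("not a batch-normalization node of a known element type");
    }

Dispatching via dynamic_pointer_cast is the price of keeping the node graph untyped at the network level; the patch below repeats this pattern in both the single-node and the subtree variants.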
---
 Source/CNTK/ModelEditLanguage.cpp | 45 ++++++++++++++++++-
 .../ComputationNetwork.h | 3 +-
 .../ComputationNetworkEditing.cpp | 40 +++++++++++++++++
 .../ConvolutionalNodes.h | 4 ++
 Source/Math/latticefunctionskernels.h | 10 ++---
 5 files changed, 95 insertions(+), 7 deletions(-)

diff --git a/Source/CNTK/ModelEditLanguage.cpp b/Source/CNTK/ModelEditLanguage.cpp
index 981b63ffd53f..94a1dc185fb7 100644
--- a/Source/CNTK/ModelEditLanguage.cpp
+++ b/Source/CNTK/ModelEditLanguage.cpp
@@ -9,6 +9,7 @@
 #define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
 #include "ModelEditLanguage.h"
+#include "ConvolutionalNodes.h"
 #include

 namespace Microsoft { namespace MSR { namespace CNTK {

@@ -56,7 +57,8 @@ enum MELProperty
     melPropFinalCriterion,
     melPropEvaluation,
     melPropOutput,
-    melPropRecurrent
+    melPropRecurrent,
+    melPropBatchNormMode,
 };

 // SetProperty - Set the Property on the passed node
@@ -420,6 +422,10 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa
         {
             prop = melPropEvaluation;
         }
+        else if (EqualInsensitive(propName, "batchNormEvalMode"))
+        {
+            prop = melPropBatchNormMode;
+        }
         else if (EqualInsensitive(propName, "output"))
         {
             prop = melPropOutput;
@@ -485,6 +491,33 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa
                 // what to do here?
                 break;
             }
+            case melPropBatchNormMode:
+            {
+                if (node->OperationName() != OperationNameOf(BatchNormalizationNode))
+                {
+                    RuntimeError("Invalid node type: node %ls (type:%ls) is not a %ls node; therefore cannot apply batchNormEvalMode on it.",
+                                 node->NodeName().c_str(),
+                                 node->OperationName().c_str(),
+                                 OperationNameOf(BatchNormalizationNode).c_str());
+                }
+                bool property = params[2];
+                auto pnode = dynamic_pointer_cast<BatchNormalizationNode<float>>(node);
+                if (pnode)
+                {
+                    pnode->SetEvalMode(property);
+                }
+                else
+                {
+                    auto pnode2 = dynamic_pointer_cast<BatchNormalizationNode<double>>(node);
+                    if (pnode2)
+                        pnode2->SetEvalMode(property);
+                    else
+                    {
+                        RuntimeError("Invalid node type: node name=%ls.
We assume either BatchNormalizationNode<float> or BatchNormalizationNode<double>\n", node->NodeName().c_str());
+                    }
+                }
+                break;
+            }
             default:
             {
                 RuntimeError("Invalid property, %s, is not supported", propName.c_str());
@@ -505,6 +538,10 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa
         {
             prop = melPropComputeGradient;
         }
+        else if (EqualInsensitive(propName, "batchNormEvalMode"))
+        {
+            prop = melPropBatchNormMode;
+        }
         else
         {
             RuntimeError("Invalid property, %s, is not supported", propName.c_str());
@@ -527,6 +564,12 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa
             netNdl->cn->SetLearnableNodesBelowNeedGradient(needGradient, node);
             break;
         }
+        case melPropBatchNormMode:
+        {
+            bool evalMode = params[2];
+            netNdl->cn->SetBatchNormlizationNodesBelowEvalMode(evalMode, node);
+            break;
+        }
         default:
         {
             RuntimeError("Invalid property, %s, is not supported", propName.c_str());
diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h
index 5b3a0f16adf5..8d6d5195ec65 100644
--- a/Source/ComputationNetworkLib/ComputationNetwork.h
+++ b/Source/ComputationNetworkLib/ComputationNetwork.h
@@ -346,7 +346,8 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
     void ReplaceFinalCriterionNode(wstring oldNodeName, ComputationNodeBasePtr newNode);
     void AddFeatureNode(ComputationNodeBasePtr featureNode);
     void RemoveFeatureNode(ComputationNodeBasePtr featureNode);
-    void SetLearnableNodesBelowNeedGradient(const bool needGradient, const ComputationNodeBasePtr& rootNode = nullptr);
+    void SetLearnableNodesBelowNeedGradient(const bool needGradient, const ComputationNodeBasePtr& rootNode = nullptr);
+    void SetBatchNormlizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode = nullptr);

     // -----------------------------------------------------------------------
     // node access
diff --git a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp
index bdd1a063114d..a44e268c8907 100644
--- a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp
+++ b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp
@@ -10,6 +10,7 @@
 #include "ComputationNode.h"
 #include "ComputationNetwork.h"
 #include "InputAndParamNodes.h"
+#include "ConvolutionalNodes.h"
 #include
 #include
 #include
@@ -314,4 +315,43 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }
     }

+    void ComputationNetwork::SetBatchNormlizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode /* = nullptr */)
+    {
+        vector<ComputationNodeBasePtr> nodes;
+        if (rootNode == nullptr)
+        {
+            for (auto pair : m_nameToNodeMap)
+            {
+                nodes.push_back(pair.second);
+            }
+        }
+        else
+        {
+            auto allnodes = rootNode->EnumerateNodes(true);
+            for (auto node : allnodes)
+                nodes.push_back(node);
+        }
+
+        for (auto& node : nodes)
+        {
+            if (node->OperationName() == OperationNameOf(BatchNormalizationNode))
+            {
+                auto pNode = dynamic_pointer_cast<BatchNormalizationNode<float>>(node);
+                if (!pNode)
+                {
+                    auto pNode2 = dynamic_pointer_cast<BatchNormalizationNode<double>>(node);
+                    if (!pNode2)
+                    {
+                        RuntimeError("Invalid node type: node name=%ls.
We assume either BatchNormalizationNode<float> or BatchNormalizationNode<double>\n", node->NodeName().c_str());
+                    }
+                    else
+                        pNode2->SetEvalMode(evalMode);
+                }
+                else
+                {
+                    pNode->SetEvalMode(evalMode);
+                }
+            }
+        }
+    }
 }}}
diff --git a/Source/ComputationNetworkLib/ConvolutionalNodes.h b/Source/ComputationNetworkLib/ConvolutionalNodes.h
index 8545f32f5e4d..45031d4722d1 100644
--- a/Source/ComputationNetworkLib/ConvolutionalNodes.h
+++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h
@@ -740,6 +740,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }
     }

+    void SetEvalMode(bool bnEvalMode)
+    {
+        m_eval = bnEvalMode;
+    }
 private:
     struct VersionInfo
     {
diff --git a/Source/Math/latticefunctionskernels.h b/Source/Math/latticefunctionskernels.h
index e30f35582236..b2b7d4b08297 100644
--- a/Source/Math/latticefunctionskernels.h
+++ b/Source/Math/latticefunctionskernels.h
@@ -356,9 +356,9 @@ struct latticefunctionskernels
     const size_t te = ts + numframes;    // end time of current unit
     size_t state1step0to1 = te;    // inflection point from state 0 to 1, record in state 1
-    size_t state1stepm1to1 = te;
+    //size_t state1stepm1to1 = te;
     size_t state2step0to1 = te;    // inflection point from state 0 to 1, record in state 2
-    size_t state2stepm1to1 = te;    // inflection point from state 0 to 1, record in state 2
+    //size_t state2stepm1to1 = te;    // inflection point from state 0 to 1, record in state 2
     size_t state2step1to2 = te;    // inflection point from state 1 to 2, record in state 2
     size_t state2step0to2 = te;
@@ -385,7 +385,7 @@ struct latticefunctionskernels
     }*/
     pathscore2 += getlogtransp(transP, -1, 2) + logLLs(senoneid2, ts);
     pathscore1 += getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts);
-    state1stepm1to1 = ts;
+    //state1stepm1to1 = ts;
     pathscore0 += getlogtransp(transP, -1, 0) + logLLs(senoneid0, ts);
@@ -412,7 +412,7 @@ struct latticefunctionskernels
     {
         pathscore2 = pathscore12;
         state2step0to1 = state1step0to1;    // record the inflection point
-        state2stepm1to1 = state1stepm1to1;
+        //state2stepm1to1 = state1stepm1to1;
         state2step1to2 = t;    // record the inflection point
         state2step0to2 = te;
         if (isSil)
@@ -440,7 +440,7 @@ struct latticefunctionskernels
     {
         pathscore1 = pathscore01;
         state1step0to1 = t;    // record the inflection point
-        state1stepm1to1 = te;
+        //state1stepm1to1 = te;
         if (isSil)
             backptrmatrix (1, t-ts-1) = 0;
     }

From edbb47dc79d6dd8ed844f20ffa13be45c3d374c7 Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Mon, 14 Dec 2015 17:33:58 -0800
Subject: [PATCH 23/49] Replace CreateMatrixIfNull by MarkValueNonSharable()

In the compile stage, we mark nodes whose descendants are all learnable
parameters as non-sharable.
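The rule this patch implements is purely structural: walking the nodes in evaluation order (children before parents), a node's output is marked non-sharable as soon as every one of its inputs is itself non-sharable, so values computed only from parameters are kept out of the shared matrix pool. A condensed sketch over a simplified node type (illustrative names, not the real ComputationNode interface):

    #include <memory>
    #include <vector>

    struct Node
    {
        std::vector<std::shared_ptr<Node>> inputs;
        bool valueSharable = true; // LearnableParameter-like leaves start out false
    };

    // evalOrder lists children before parents, as in GetEvalOrder().
    void MarkValueNonSharableNodes(const std::vector<std::shared_ptr<Node>>& evalOrder)
    {
        for (auto& node : evalOrder)
        {
            if (node->inputs.empty())
                continue; // leaves keep their preset flag
            bool allInputsNonSharable = true;
            for (auto& child : node->inputs)
                if (child->valueSharable) { allInputsNonSharable = false; break; }
            if (allInputsNonSharable)
                node->valueSharable = false;
        }
    }

Since parameters are constant within a minibatch, such derived values are computed once and must survive pool recycling; everything else remains eligible for reuse.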
--- .../CompositeComputationNodes.h | 6 ++-- .../ComputationNetwork.h | 1 + .../ComputationNetworkEvaluation.cpp | 29 +++++++++++++++++++ .../ComputationNetworkLib/ComputationNode.h | 23 +++++++++++++-- Source/ComputationNetworkLib/EsotericNodes.h | 3 +- .../InputAndParamNodes.h | 3 +- 6 files changed, 59 insertions(+), 6 deletions(-) diff --git a/Source/ComputationNetworkLib/CompositeComputationNodes.h b/Source/ComputationNetworkLib/CompositeComputationNodes.h index f8f79dc21642..5027ef6c2bd9 100644 --- a/Source/ComputationNetworkLib/CompositeComputationNodes.h +++ b/Source/ComputationNetworkLib/CompositeComputationNodes.h @@ -234,7 +234,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void MarkComputed(const bool hasComputed) { m_hasComputed = hasComputed; - CreateMatrixIfNull(m_value); + // CreateMatrixIfNull(m_value); + MarkValueNonSharable(); } virtual bool RequiresPreCompute() const override { return true; } @@ -293,7 +294,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // this is for the special case: convertDBN needs this; because we initialize values directly from another well-trained model virtual void SideLoadFromMatrix(const Matrix& value) { - CreateMatrixIfNull(m_value); + //CreateMatrixIfNull(m_value); + MarkValueNonSharable(); m_value->SetValue(value); m_hasComputed = true; } diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 26b78d8be05f..b533602786c5 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -159,6 +159,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb private: void ValidateNodes(list nodes, bool isFinalValidationPass, size_t & todo); void ValidateSubNetwork(const ComputationNodeBasePtr& rootNode); + void MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode); private: void DetermineSetOfAllRoots(); void CollectInputAndLearnableParameters(const ComputationNodeBasePtr& rootNode); diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index 084fe9ce9a69..0cdeec07f56e 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -413,6 +413,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (auto & node : m_allRoots) ValidateSubNetwork(node); + // STEP: mark non-sharable function values + // if all the descendants of a particular node are learnable parameters, + // its function value is not sharable + for (auto & node : m_allRoots) + MarkValueNonSharableNodes(node); + + // STEP: Optimize the network. 
// :) @@ -678,6 +685,28 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } + // mark nodes that are purely induced by parameters as non-sharable and create space for value if null + void ComputationNetwork::MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode) + { + const auto & nodes = GetEvalOrder(rootNode); + for (auto& node : nodes) + { + auto children = node->GetInputs(); + bool allChildrenNonSharable = true; + for (auto& child : children) + { + if (child->isValueSharable()) + { + allChildrenNonSharable = false; + break; + } + } + if (allChildrenNonSharable) + node->MarkValueNonSharable(); + } + + } + #if 0 // prepare to compute with the subnetwork that this rootNode depends on, including // - auto-detecting recurrent loops diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index b4e0725bd7c2..198c1dcb6421 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -250,7 +250,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_deviceId(deviceId), m_outputNeededDuringBackprop(true), m_parameterUpdateRequired(false), m_gradientInitialized(false), m_nodeName(name == L"" ? CreateUniqNodeName() : name), - m_numRows(0), m_numCols(0) + m_numRows(0), m_numCols(0), m_valueSharable(true) { } virtual ~ComputationNodeBase(){} @@ -455,6 +455,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("VerifyNumParallelSequences: value inconsistent with MB layout"); } + bool isValueSharable() + { + return m_valueSharable; + } + virtual void MarkValueNonSharable() + { + m_valueSharable = false; + } protected: public: // ...the following should be protected, but nodes inquire about their children, requiring public access @@ -769,6 +777,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool m_parameterUpdateRequired; // update parameters? Only used for LearnableParameters. --TODO: Should we make this a member of LearnableParameters actually? And require a type cast? Currently it is read out for all leaves. bool m_gradientInitialized; // indicates whether the gradient matrix has been resized and initialized to 0 bool m_outputNeededDuringBackprop; // indicates whether the output value of the node is needed during backprop + + // flags related with sharable values + bool m_valueSharable; // whether value is sharable }; typedef ComputationNodeBase::ComputationNodeBasePtr ComputationNodeBasePtr; @@ -815,7 +826,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Since the dimensions are read as well, this function also updates m_numRows/m_numCols. 
void LoadValue(File& fstream) { - CreateMatrixIfNull(m_value); + // CreateMatrixIfNull(m_value); + MarkValueNonSharable(); fstream >> Value(); // above reads dimensions, so we must update our own m_numRows/m_numCols SetDims(TensorShape(Value().GetNumRows()), Value().GetNumCols()); @@ -1317,6 +1329,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { CreateMatrixIfNull(m_gradient); } + void MarkValueNonSharable() override + { + m_valueSharable = false; + CreateMatrixIfNull(m_value); + } + + protected: // this function is used to create matrices for those needed before matrix pool is available diff --git a/Source/ComputationNetworkLib/EsotericNodes.h b/Source/ComputationNetworkLib/EsotericNodes.h index 3f500421cf1d..b5c6a6f46ee5 100644 --- a/Source/ComputationNetworkLib/EsotericNodes.h +++ b/Source/ComputationNetworkLib/EsotericNodes.h @@ -1550,7 +1550,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { Base(deviceId, name) { Init(row_size, col_size); - CreateMatrixIfNull(m_gradient); + //CreateMatrixIfNull(m_gradient); + MarkValueNonSharable(); m_gradient->Resize(row_size, col_size); m_gradient->SetValue(0.0f); } diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 898d35f9f65f..f7096fa34efa 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -254,7 +254,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { void Init(const TensorShape & sampleLayout, bool isSparse) { m_isSparse = isSparse; - CreateMatrixIfNull(m_value); + //CreateMatrixIfNull(m_value); + MarkValueNonSharable(); if (isSparse) ConvertToSparseMatrix(); From 485a7b8fe6056e4cbd4716bf06fabb9ce64241cb Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Wed, 16 Dec 2015 15:54:08 -0800 Subject: [PATCH 24/49] Revise the implementation of valueNotSharableNode. More to be revised. 
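The revision replaces the one-level check with a memoized bottom-up sweep: each node's "all leaf descendants are parameters" flag is derived from its children's cached flags, so the property propagates through arbitrarily deep graphs in one pass over the evaluation order. A simplified sketch of that memoization (illustrative names; it assumes children are always visited before their parents, as the eval order guarantees):

    #include <map>
    #include <memory>
    #include <string>
    #include <vector>

    struct Node
    {
        std::wstring name;
        bool isParameter = false; // true for LearnableParameter leaves
        std::vector<std::shared_ptr<Node>> inputs;
    };

    // Returns, per node name, whether every leaf under it is a parameter.
    std::map<std::wstring, bool>
    ComputeAllLeafDescendantsAreParameters(const std::vector<std::shared_ptr<Node>>& evalOrder)
    {
        std::map<std::wstring, bool> memo;
        for (auto& node : evalOrder)
        {
            if (node->inputs.empty()) // leaf: the flag is just "is it a parameter?"
            {
                memo[node->name] = node->isParameter;
                continue;
            }
            bool all = true;
            for (auto& child : node->inputs)
                all = all && memo.at(child->name); // children already computed
            memo[node->name] = all;
        }
        return memo;
    }

Keying the memo by node name mirrors the patch below; like the patch, it assumes node names are unique within the graph.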
--- .../ComputationNetworkEvaluation.cpp | 47 ++++++++++++++++--- .../ComputationNetworkLib/ComputationNode.h | 24 +++++----- .../InputAndParamNodes.h | 2 + 3 files changed, 54 insertions(+), 19 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index 0cdeec07f56e..03d085be8d2b 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -15,6 +15,7 @@ #include #include #include +#include using namespace std; @@ -689,20 +690,52 @@ namespace Microsoft { namespace MSR { namespace CNTK { void ComputationNetwork::MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode) { const auto & nodes = GetEvalOrder(rootNode); + std::map allLeafDescendentsAreParameters; for (auto& node : nodes) { auto children = node->GetInputs(); - bool allChildrenNonSharable = true; - for (auto& child : children) + wstring myname = node->NodeName(); + bool allParameters = true; + + if (children.size()) // we don't do the check for leaf node, cause all the possible leaf nodes (input/parameters/precompute node) are marked as non-sharable already { - if (child->isValueSharable()) + for (auto child : children) { - allChildrenNonSharable = false; - break; + wstring ChildName = child->NodeName(); + if (allLeafDescendentsAreParameters.find(ChildName) == allLeafDescendentsAreParameters.end()) + { + // not found, means it is a leaf node (we are at eval order ) + assert(child->IsLeaf()); + if (node->isLearnableParameter()) + { + allLeafDescendentsAreParameters[ChildName] = true; + } + else + { + allParameters = false; + allLeafDescendentsAreParameters[ChildName] = false; + break; + } + } + else + { + if (allLeafDescendentsAreParameters[ChildName] == false) + { + allParameters = false; + break; + } + } + } + allLeafDescendentsAreParameters[myname] = allParameters; + if (allParameters) + { + node->MarkValueNonSharable(); + } + else + { + node->MarkValueSharable(); } } - if (allChildrenNonSharable) - node->MarkValueNonSharable(); } } diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 198c1dcb6421..8064bceb58f0 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -151,7 +151,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { friend class ComputationNetwork; ComputationNetworkOwnedNodeState() : - m_needsGradient(false) + m_needsGradient(false), m_valueSharable(true) { PurgeStateForFormingRecurrentLoops(); m_isPartOfLoop = false; @@ -166,10 +166,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool IsPartOfLoop() const { return m_isPartOfLoop; } + virtual void MarkValueNonSharable(){ m_valueSharable = false; } + virtual void MarkValueSharable() { m_valueSharable = true; } + bool isValueSharable() { return m_valueSharable; } + protected: // TODO: should be fully encapsulated here bool m_needsGradient; // true if this node or any children need a gradient to be computed (for own consumption or propagation to somewhere in the child tree) + bool m_valueSharable; // a flag is needed for memory share. 
+ // If it is false (e.g., learnableParameters/InputValue and those nodes are solely induced by learnableParameters), + // it will never be released to memory pool private: bool m_isPartOfLoop; // true if this loop is part of a recurrent loop @@ -250,7 +257,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_deviceId(deviceId), m_outputNeededDuringBackprop(true), m_parameterUpdateRequired(false), m_gradientInitialized(false), m_nodeName(name == L"" ? CreateUniqNodeName() : name), - m_numRows(0), m_numCols(0), m_valueSharable(true) + m_numRows(0), m_numCols(0) { } virtual ~ComputationNodeBase(){} @@ -455,14 +462,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("VerifyNumParallelSequences: value inconsistent with MB layout"); } - bool isValueSharable() - { - return m_valueSharable; - } - virtual void MarkValueNonSharable() - { - m_valueSharable = false; - } + // sometimes, it is necessary to know whether it is a particular node (e.g., learnable parameter) + virtual bool isLearnableParameter() const { return false; } + protected: public: // ...the following should be protected, but nodes inquire about their children, requiring public access @@ -778,8 +780,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool m_gradientInitialized; // indicates whether the gradient matrix has been resized and initialized to 0 bool m_outputNeededDuringBackprop; // indicates whether the output value of the node is needed during backprop - // flags related with sharable values - bool m_valueSharable; // whether value is sharable }; typedef ComputationNodeBase::ComputationNodeBasePtr ComputationNodeBasePtr; diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index f7096fa34efa..06571ad6bb8f 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -197,6 +197,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { PrintNodeValuesToFile(printValues, fstream); } + + virtual bool isLearnableParameter()const override{ return true; } }; #if 0 From 4b1f8006b3ec4031804fac396f087842d602b02b Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Thu, 17 Dec 2015 17:44:22 -0800 Subject: [PATCH 25/49] Fix MarkValueNotSharableNodes --- .../ComputationNetwork.h | 2 +- .../ComputationNetworkEvaluation.cpp | 90 +++++++++---------- .../ComputationNetworkLib/ComputationNode.h | 6 +- .../InputAndParamNodes.h | 1 - 4 files changed, 48 insertions(+), 51 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index b533602786c5..e8d7ae87fe7e 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -159,7 +159,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb private: void ValidateNodes(list nodes, bool isFinalValidationPass, size_t & todo); void ValidateSubNetwork(const ComputationNodeBasePtr& rootNode); - void MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode); + void MarkValueNonSharableNodes(); private: void DetermineSetOfAllRoots(); void CollectInputAndLearnableParameters(const ComputationNodeBasePtr& rootNode); diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index 03d085be8d2b..208692e4561e 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ 
b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -10,6 +10,7 @@ #include "ComputationNode.h" #include "ComputationNetwork.h" #include "RecurrentNodes.h" +#include "InputAndParamNodes.h" #include #include #include @@ -414,13 +415,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (auto & node : m_allRoots) ValidateSubNetwork(node); - // STEP: mark non-sharable function values - // if all the descendants of a particular node are learnable parameters, - // its function value is not sharable - for (auto & node : m_allRoots) - MarkValueNonSharableNodes(node); - - // STEP: Optimize the network. // :) @@ -686,11 +680,48 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } +#if 0 + // prepare to compute with the subnetwork that this rootNode depends on, including + // - auto-detecting recurrent loops + // - collect input and learnable nodes + // - calling Validate() on all nodes lazily, which sizes all matrices (column dimensions get updated to MB size) + // Done lazily, called for every minibatch's invocation of EvaluateNode(), but memoizing which nodes were done already. + // BUGBUG? Lazy triggers on the root node. I.e. for two different root nodes (training, eval), it validates twice. + void ComputationNetwork::BuildAndValidateSubNetwork(const ComputationNodeBasePtr rootNode) + { + bool inserted = m_built.insert(rootNode).second; // remember we built it + if (!inserted) + return; // already done + + // detect recurrent loops for this root node + // TODO: not nice--why not always call this in ValidateSubNetwork() only? + FormRecurrentLoops(rootNode); + + // for the m_inputValues and m_learnableParameters sets for this rootNode + CollectInputAndLearnableParameters(rootNode); + + // validate the rootNode and all nodes it depends on, in evaluation order + ValidateSubNetwork(rootNode); + } + + // tests whether BuildAndValidateSubNetwork() was called + bool ComputationNetwork::BuiltAndValidatedSubNetwork(const ComputationNodeBasePtr & rootNode) + { + return m_built.find(rootNode) != m_built.end(); + } +#endif + + // ----------------------------------------------------------------------- + // memory allocation + // ----------------------------------------------------------------------- // mark nodes that are purely induced by parameters as non-sharable and create space for value if null - void ComputationNetwork::MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode) + void ComputationNetwork::MarkValueNonSharableNodes() { - const auto & nodes = GetEvalOrder(rootNode); + const auto & nodes = GetEvalOrder(nullptr); std::map allLeafDescendentsAreParameters; + std::list allLearnableParameters = GetNodesWithType(OperationNameOf(LearnableParameter)); + // note that: we cannot use m_learnableParameters because we need all parameters node, regardless whether it requires update or not + for (auto& node : nodes) { auto children = node->GetInputs(); @@ -706,7 +737,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { // not found, means it is a leaf node (we are at eval order ) assert(child->IsLeaf()); - if (node->isLearnableParameter()) + if (std::find(allLearnableParameters.begin(), allLearnableParameters.end(), child)!= allLearnableParameters.end()) { allLeafDescendentsAreParameters[ChildName] = true; } @@ -740,40 +771,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { } -#if 0 - // prepare to compute with the subnetwork that this rootNode depends on, including - // - auto-detecting recurrent loops - // - collect input and learnable nodes - 
// - calling Validate() on all nodes lazily, which sizes all matrices (column dimensions get updated to MB size) - // Done lazily, called for every minibatch's invocation of EvaluateNode(), but memoizing which nodes were done already. - // BUGBUG? Lazy triggers on the root node. I.e. for two different root nodes (training, eval), it validates twice. - void ComputationNetwork::BuildAndValidateSubNetwork(const ComputationNodeBasePtr rootNode) - { - bool inserted = m_built.insert(rootNode).second; // remember we built it - if (!inserted) - return; // already done - - // detect recurrent loops for this root node - // TODO: not nice--why not always call this in ValidateSubNetwork() only? - FormRecurrentLoops(rootNode); - - // for the m_inputValues and m_learnableParameters sets for this rootNode - CollectInputAndLearnableParameters(rootNode); - - // validate the rootNode and all nodes it depends on, in evaluation order - ValidateSubNetwork(rootNode); - } - - // tests whether BuildAndValidateSubNetwork() was called - bool ComputationNetwork::BuiltAndValidatedSubNetwork(const ComputationNodeBasePtr & rootNode) - { - return m_built.find(rootNode) != m_built.end(); - } -#endif - - // ----------------------------------------------------------------------- - // memory allocation - // ----------------------------------------------------------------------- // this function will need to be called before actual validation and execution to // predetermine how to share matrices to reduce memory usage. @@ -788,9 +785,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { VerifyIsCompiled("AllocateAllMatrices"); + // Due to special topology, if a node is solely induced by parameters, its function value should not be shared + MarkValueNonSharableNodes(); + bool performingBackPropagation = (trainRootNode != nullptr); - // Create a composite Eval order with the specfied nodes as roots + // Create a composite Eval order with the specified nodes as roots std::vector forwardPropRoots; forwardPropRoots.insert(forwardPropRoots.end(), evalRootNodes.begin(), evalRootNodes.end()); forwardPropRoots.insert(forwardPropRoots.end(), outValueRootNodes.begin(), outValueRootNodes.end()); diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 8064bceb58f0..5edfdfe9aae9 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -168,7 +168,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void MarkValueNonSharable(){ m_valueSharable = false; } virtual void MarkValueSharable() { m_valueSharable = true; } - bool isValueSharable() { return m_valueSharable; } + bool isValueSharable() const { return m_valueSharable; } protected: // TODO: should be fully encapsulated here @@ -462,8 +462,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("VerifyNumParallelSequences: value inconsistent with MB layout"); } - // sometimes, it is necessary to know whether it is a particular node (e.g., learnable parameter) - virtual bool isLearnableParameter() const { return false; } protected: public: // ...the following should be protected, but nodes inquire about their children, requiring public access @@ -547,7 +545,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void SetOutputNeededDuringBackprop(bool f) { m_outputNeededDuringBackprop = f; } bool IsOutputNeededDuringBackprop() const { - return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop; + return 
!g_shareNodeValueMatrices || m_outputNeededDuringBackprop || !isValueSharable(); } const size_t GetNumInputs() const { return m_inputs.size(); } diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 06571ad6bb8f..2a8a06c2beb4 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -198,7 +198,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { PrintNodeValuesToFile(printValues, fstream); } - virtual bool isLearnableParameter()const override{ return true; } }; #if 0 From 0b8e30ea22d0552f3749cc65a589daedd26f11da Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Fri, 18 Dec 2015 14:21:21 -0800 Subject: [PATCH 26/49] Revise the condition of ReleaseMatricesAfterForwardProp: only ValueSharable nodes can be released after forwardprop --- Source/ComputationNetworkLib/ComputationNode.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 5edfdfe9aae9..ec9da238ac50 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -545,7 +545,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void SetOutputNeededDuringBackprop(bool f) { m_outputNeededDuringBackprop = f; } bool IsOutputNeededDuringBackprop() const { - return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop || !isValueSharable(); + return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop ; } const size_t GetNumInputs() const { return m_inputs.size(); } @@ -912,7 +912,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { //don't release matrices that need to be used in the gradient computation virtual void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool) { - if (!IsOutputNeededDuringBackprop() && (m_value->GetMatrixType() != SPARSE)) + if (!IsOutputNeededDuringBackprop() && (m_value->GetMatrixType() != SPARSE) && isValueSharable()) ReleaseMatrixToPool(m_value, matrixPool); } From 2f51fb24b60d484122e3f1b5948df8c2f97d7d98 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Fri, 18 Dec 2015 23:32:59 -0800 Subject: [PATCH 27/49] Fix a bug in MarkValueSharableNode --- Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp | 2 +- Source/ComputationNetworkLib/ComputationNode.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index 208692e4561e..3da5a5ce51df 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -736,7 +736,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (allLeafDescendentsAreParameters.find(ChildName) == allLeafDescendentsAreParameters.end()) { // not found, means it is a leaf node (we are at eval order ) - assert(child->IsLeaf()); + assert(child->IsLeaf() || child->IsPartOfLoop()); if (std::find(allLearnableParameters.begin(), allLearnableParameters.end(), child)!= allLearnableParameters.end()) { allLeafDescendentsAreParameters[ChildName] = true; diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index ec9da238ac50..fd90a96ac168 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -941,7 +941,7 @@ namespace 
Microsoft { namespace MSR { namespace CNTK { // Release the Value matrix only if the output value is needed during backprop // since in the case it isn't used, we release it during forward prop itself - if (IsOutputNeededDuringBackprop() && m_value->GetMatrixType() != SPARSE) + if (IsOutputNeededDuringBackprop() && m_value->GetMatrixType() != SPARSE && isValueSharable()) ReleaseMatrixToPool(m_value, matrixPool); } } From b54cfccc37dd08eccae9c9e45baa6691fdd0d720 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Sat, 19 Dec 2015 00:18:13 -0800 Subject: [PATCH 28/49] Add an alternate option "numSubminibatches" for users to indicate how to split minibatches into subminibatches. --- Source/SGDLib/SGD.cpp | 25 ++++++++++++++++++------- Source/SGDLib/SGD.h | 6 +++++- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index c816931c1884..bab3d7a896d1 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -764,13 +764,20 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Sub-minibatching is used if a single minibatch is too large to fit into GPU RAM. DataReaderHelpers::SubminibatchDispatcher smbDispatcher; size_t numSubminibatchesNeeded = 0; - if (m_maxSamplesInRAM < SIZE_MAX) // user-specified maximum number of samples that fit into GPU RAM; or 0 if not enabled + if (m_maxSamplesInRAM < SIZE_MAX || m_numSubminiBatches > 1) // user-specified maximum number of samples that fit into GPU RAM; or 0 if not enabled { - // into how many pieces would we need to break the minibatch? - // TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed. - size_t numParallelSequences = trainSetDataReader->GetNumParallelSequences(); - size_t estimatedMBSize = tunedMBSize * numParallelSequences; - numSubminibatchesNeeded = (size_t)std::ceil((float)estimatedMBSize / m_maxSamplesInRAM); + if (m_maxSamplesInRAM < SIZE_MAX) + { + // into how many pieces would we need to break the minibatch? + // TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed. + size_t numParallelSequences = trainSetDataReader->GetNumParallelSequences(); + size_t estimatedMBSize = tunedMBSize * numParallelSequences; + numSubminibatchesNeeded = (size_t)std::ceil((float)estimatedMBSize / m_maxSamplesInRAM); + } + if (m_numSubminiBatches > 1) + { + numSubminibatchesNeeded = m_numSubminiBatches; + } } // this is non-trivial, we need a manager object to handle this if (numSubminibatchesNeeded > 1) @@ -800,7 +807,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { } if (numSubminibatchesNeeded > 1) { - fprintf(stderr, ", with maximum %d samples in RAM", (int)m_maxSamplesInRAM); + if (m_maxSamplesInRAM < SIZE_MAX) + fprintf(stderr, ", with maximum %d samples in RAM", (int)m_maxSamplesInRAM); + else + fprintf(stderr, ", with %d subminibatch", (int)numSubminibatchesNeeded); } fprintf(stderr, ".\n"); @@ -2484,6 +2494,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_mbSize = configSGD(L"minibatchSize", ConfigRecordType::Array(intargvector(vector{ 256 }))); m_truncated = configSGD(L"truncated", false); m_maxSamplesInRAM = configSGD(L"maxSamplesInRAM", (size_t)SIZE_MAX); + m_numSubminiBatches = configSGD(L"numSubminibatches", (size_t)1); // the number of samples in each epoch (0 means, use all the samples in each epoch). 
m_epochSize = configSGD(L"epochSize", (size_t)0); diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h index 15143dfa0208..b99608b500f8 100644 --- a/Source/SGDLib/SGD.h +++ b/Source/SGDLib/SGD.h @@ -157,7 +157,11 @@ struct SGDParams : public ScriptableObjects::Object // To mitigate this issue, we adopt the sub-minibatch implementation, where // each m_mbSize[epoch] is divided by a few sub-minibatch of which size will be no more than m_maxSamplesInRAM // a forward-backward is performed for each sub-minibathch; a model update is performed after each minibatch - + size_t m_numSubminiBatches; + // alternative method to specify how to split minibatches into subminibatches + // default is 1, which means no subminibatch is used + // if m_maxTempMemSizeInSamples = SIZE_MAX (which means users do not specify the option) and m_numSubminiBatches > 1 + // we divide one minibatch to m_numSubminiBatches subMinibatches // the number of samples in each epoch (0 means, use all the samples in each epoch). size_t m_epochSize; From 6777bbe0b757b5204edd8b15a95c1857890e9fd3 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Mon, 21 Dec 2015 21:56:33 -0800 Subject: [PATCH 29/49] Display CUB and CUDNN paths (if defined) in BuildInfo Print BuildInfo at the very begining of the program. convenient for checking build type. --- Source/CNTK/CNTK.cpp | 8 ++++++-- Source/CNTK/prebuild.bat | 10 ++++++++++ Tools/generate_build_info | 3 +++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/Source/CNTK/CNTK.cpp b/Source/CNTK/CNTK.cpp index e7753e2c70b1..e85dd78015f5 100644 --- a/Source/CNTK/CNTK.cpp +++ b/Source/CNTK/CNTK.cpp @@ -345,6 +345,9 @@ void PrintBuiltInfo() #ifdef _CUB_PATH_ fprintf(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_); #endif +#ifdef _CUDNN_PATH_ + fprintf(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_); +#endif #ifdef _GIT_EXIST fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_); fprintf(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_); @@ -568,7 +571,7 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which i RedirectStdErr(logpath); } - PrintBuiltInfo(); + PrintBuiltInfo(); // this one goes to log file std::string timestamp = TimeDateStamp(); //dump config info @@ -643,10 +646,11 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which i // main wrapper that catches C++ exceptions and prints them // --------------------------------------------------------------------------- -int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & repots Win32 exceptions +int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & reports Win32 exceptions { try { + PrintBuiltInfo(); // print build info directly in case that user provides zero argument (convenient for checking build type) if (argc <= 1) InvalidArgument("No command-line argument given."); // detect legacy CNTK configuration diff --git a/Source/CNTK/prebuild.bat b/Source/CNTK/prebuild.bat index 9f841d104da7..12631cf52e37 100644 --- a/Source/CNTK/prebuild.bat +++ b/Source/CNTK/prebuild.bat @@ -33,6 +33,16 @@ if "%cuda_path%" == "" ( echo #define _CUDA_PATH_ "%cuda_path:\=\\%" >> buildinfo.h$$ ) +if not "%cudnn_path%" == "" ( + echo #define _CUDNN_PATH_ "%cudnn_path:\=\\%" >> buildinfo.h$$ + ) + +if not "%cub_path%" == "" ( + echo #define _CUB_PATH_ "%cub_path:\=\\%" >> buildinfo.h$$ + ) + + + echo #endif >> buildinfo.h$$ ::: update file only if it changed (otherwise CNTK.cpp will get rebuilt each time) diff --git 
a/Tools/generate_build_info b/Tools/generate_build_info index a155fc84e792..62686222ef33 100755 --- a/Tools/generate_build_info +++ b/Tools/generate_build_info @@ -56,6 +56,9 @@ makebuildinfo() if [ ! -z "$CUB_PATH" ]; then printf "#define _CUB_PATH_ \"%s\"\n" $CUB_PATH >> $target fi + if [ ! -z "$CUDNN_PATH" ]; then + printf "#define _CUDNN_PATH_ \"%s\"\n" $CUDNN_PATH >> $target + fi printf "#define _BUILDTYPE_ \"%s\"\n" $BUILDTYPE >> $target printf "#endif\n" >> $target } From 60989d7acbf06878c131dfd36aebae22096d54de Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Mon, 21 Dec 2015 23:20:13 -0800 Subject: [PATCH 30/49] Move MarkValueNonSharable out of constructors (make gcc happy) --- Source/ComputationNetworkLib/CompositeComputationNodes.h | 6 ++---- Source/ComputationNetworkLib/ComputationNode.h | 3 +-- Source/ComputationNetworkLib/EsotericNodes.h | 3 +-- Source/ComputationNetworkLib/InputAndParamNodes.h | 1 - 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/Source/ComputationNetworkLib/CompositeComputationNodes.h b/Source/ComputationNetworkLib/CompositeComputationNodes.h index 5027ef6c2bd9..f8f79dc21642 100644 --- a/Source/ComputationNetworkLib/CompositeComputationNodes.h +++ b/Source/ComputationNetworkLib/CompositeComputationNodes.h @@ -234,8 +234,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void MarkComputed(const bool hasComputed) { m_hasComputed = hasComputed; - // CreateMatrixIfNull(m_value); - MarkValueNonSharable(); + CreateMatrixIfNull(m_value); } virtual bool RequiresPreCompute() const override { return true; } @@ -294,8 +293,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // this is for the special case: convertDBN needs this; because we initialize values directly from another well-trained model virtual void SideLoadFromMatrix(const Matrix& value) { - //CreateMatrixIfNull(m_value); - MarkValueNonSharable(); + CreateMatrixIfNull(m_value); m_value->SetValue(value); m_hasComputed = true; } diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index fd90a96ac168..b239cf1b633b 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -824,8 +824,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Since the dimensions are read as well, this function also updates m_numRows/m_numCols.
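// Note: value-sharable marking is now done by the network-level pass in
// ComputationNetworkEvaluation.cpp (see patch 27 above) rather than in node
// constructors; a virtual call such as MarkValueNonSharable() made during
// construction resolves to the base-class override, which may be what the
// "make gcc happy" note refers to. These call sites therefore revert to
// plain CreateMatrixIfNull().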
void LoadValue(File& fstream) { - // CreateMatrixIfNull(m_value); - MarkValueNonSharable(); + CreateMatrixIfNull(m_value); fstream >> Value(); // above reads dimensions, so we must update our own m_numRows/m_numCols SetDims(TensorShape(Value().GetNumRows()), Value().GetNumCols()); diff --git a/Source/ComputationNetworkLib/EsotericNodes.h b/Source/ComputationNetworkLib/EsotericNodes.h index b5c6a6f46ee5..3f500421cf1d 100644 --- a/Source/ComputationNetworkLib/EsotericNodes.h +++ b/Source/ComputationNetworkLib/EsotericNodes.h @@ -1550,8 +1550,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { Base(deviceId, name) { Init(row_size, col_size); - //CreateMatrixIfNull(m_gradient); - MarkValueNonSharable(); + CreateMatrixIfNull(m_gradient); m_gradient->Resize(row_size, col_size); m_gradient->SetValue(0.0f); } diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 2a8a06c2beb4..333ac54053fb 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -255,7 +255,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { void Init(const TensorShape & sampleLayout, bool isSparse) { m_isSparse = isSparse; - //CreateMatrixIfNull(m_value); MarkValueNonSharable(); if (isSparse) ConvertToSparseMatrix(); From 8aa59f700917e34653332d7c874be3860a73f0b7 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Tue, 22 Dec 2015 00:19:22 -0800 Subject: [PATCH 31/49] (further remove MarkValueNotSharable out of constructor) --- Source/ComputationNetworkLib/InputAndParamNodes.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 333ac54053fb..879ca71019ec 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -255,7 +255,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { void Init(const TensorShape & sampleLayout, bool isSparse) { m_isSparse = isSparse; - MarkValueNonSharable(); if (isSparse) ConvertToSparseMatrix(); From 2d7b74e825d827dd780ac7df4e16fe8eeec61e46 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Tue, 22 Dec 2015 12:18:21 -0800 Subject: [PATCH 32/49] (Fix a bug in MarkValueSharable) --- Source/ComputationNetworkLib/InputAndParamNodes.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 879ca71019ec..bf6fe610c035 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -255,12 +255,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { void Init(const TensorShape & sampleLayout, bool isSparse) { m_isSparse = isSparse; + CreateMatrixIfNull(m_value); if (isSparse) ConvertToSparseMatrix(); SetDims(sampleLayout, 0); UpdateFunctionValuesSize(); // we must allocate the matrix so that the readers get objects with valid row dimensions (some readers expect that) m_parameterUpdateRequired = false; + m_valueSharable = false; } protected: InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & sampleLayout, bool isSparse) : From 25fd18bf1f5b342b9c757fe4982dce77ee288fc5 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Mon, 11 Jan 2016 17:26:52 -0800 Subject: [PATCH 33/49] Fix an error in SequenceWithSoftmaxNode::RequestMatricesBeforeForwardProp --- Source/ComputationNetworkLib/TrainingCriterionNodes.h | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Source/ComputationNetworkLib/TrainingCriterionNodes.h b/Source/ComputationNetworkLib/TrainingCriterionNodes.h index d150b5747703..1722f60aa938 100644 --- a/Source/ComputationNetworkLib/TrainingCriterionNodes.h +++ b/Source/ComputationNetworkLib/TrainingCriterionNodes.h @@ -1374,7 +1374,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { //request matrices needed to do node function value evaluation virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) { - Base::ReleaseMatricesAfterForwardProp(matrixPool); + Base::ReleaseMatricesAfterBackprop(matrixPool); ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool); ReleaseMatrixToPool(m_softmaxOfRight, matrixPool); ReleaseMatrixToPool(m_gammaFromLattice, matrixPool); From f76412385d8bd01c449d1c93c8701eb4b3bec859 Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Mon, 11 Jan 2016 22:45:17 -0800 Subject: [PATCH 34/49] Bug workaround: The m_columnsValidityMask matrix in MBLayout type was being default initialized resulting in incorrectly selecting a bad GPU device. --- Source/Common/Include/Sequences.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index 57e7e366ddda..c708623695e3 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -90,7 +90,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // construction // ------------------------------------------------------------------- - MBLayout(size_t numParallelSequences, size_t numTimeSteps) : m_distanceToStart(CPUDEVICE), m_distanceToEnd(CPUDEVICE) { Init(numParallelSequences, numTimeSteps); } + MBLayout(size_t numParallelSequences, size_t numTimeSteps) : m_distanceToStart(CPUDEVICE), m_distanceToEnd(CPUDEVICE), m_columnsValidityMask(CPUDEVICE) { Init(numParallelSequences, numTimeSteps); } MBLayout() : MBLayout(1, 0) { } // copy the content of another MBLayoutPtr over From f52e80cf8fcc192bee007937f4ec426043a70a94 Mon Sep 17 00:00:00 2001 From: Alexey Kamenev Date: Fri, 8 Jan 2016 11:39:52 -0800 Subject: [PATCH 35/49] Added CMA to BN node, updated samples. 
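Here CMA = cumulative moving average: instead of an exponential moving
average with a fixed expAvgFactor, the running mean/InvStdDev become the
plain average of the statistics of all minibatches seen so far. A minimal
sketch of the update rule this amounts to (illustrative variable names,
not the node's actual members; see the m_expAvgFactor/m_mbCount handling
in ConvolutionalNodes.h further down):

    // For the t-th minibatch (t = 0, 1, 2, ...) use factor 1/(1+t):
    //   runMean += (batchMean - runMean) / (1 + t)
    // which keeps runMean equal to the average of all batch means so far.
    double factor = 1.0 / (1.0 + mbCount);
    runMean = (1.0 - factor) * runMean + factor * batchMean;
    ++mbCount;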
--- .../Miscellaneous/CIFAR-10/03_ResNet.config | 4 +-- .../Miscellaneous/CIFAR-10/03_ResNet.ndl | 12 ++++---- .../Image/Miscellaneous/CIFAR-10/Macros.ndl | 15 ++++++---- Source/CNTK/SynchronousExecutionEngine.cpp | 3 +- .../ComputationNetworkBuilder.cpp | 4 +-- .../ComputationNetworkBuilder.h | 2 +- .../ConvolutionalNodes.h | 28 +++++++++++++------ 7 files changed, 43 insertions(+), 25 deletions(-) diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config index 5497dcfab32a..45eb04b156fc 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config @@ -34,7 +34,7 @@ Train=[ minibatchSize=128 learningRatesPerMB=0.1*80:0.01*40:0.001 momentumPerMB=0.9 - maxEpochs=10 + maxEpochs=120 L2RegWeight=0.0001 dropoutRate=0 @@ -57,7 +57,7 @@ Train=[ height=32 channels=3 cropType=Random - cropRatio=1 + cropRatio=0.8 jitterType=UniRatio interpolations=Linear #meanFile= diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl index 5b53f5d14652..cdee45f2f4fe 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl @@ -48,14 +48,14 @@ DNN=[ rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue) rn3_3 = ResNetNode2(rn3_2, cMap3, 576, kW, kH, convWScale, convBValue, scValue) - # pool - poolW = 3 - poolH = 3 - poolhStride = 2 - poolvStride = 2 + # Global average pooling + poolW = 8 + poolH = 8 + poolhStride = 1 + poolvStride = 1 pool = AveragePooling(rn3_3, poolW, poolH, poolhStride, poolvStride, imageLayout = "cudnn") - ol = DnnLastLayer(576, labelDim, pool, fc1WScale, fc1BValue) + ol = DnnLastLayer(cMap3, labelDim, pool, fc1WScale, fc1BValue) CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria) Err = ErrorPrediction(labels, ol, tag = Eval) diff --git a/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl b/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl index 120369a40409..a5edb7ac0d11 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl +++ b/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl @@ -11,7 +11,8 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, { W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + #sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + sc = Parameter(outMap, 1, init = fixedValue, value = 1) m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -24,7 +25,8 @@ ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scScale) { W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b1 = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc1 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + #sc1 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + sc1 = Parameter(outMap, 1, init = fixedValue, value = 1) m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -34,7 +36,8 @@ ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scScale) W2 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b2 = 
Parameter(outMap, 1, init = fixedValue, value = bValue) - sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + #sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + sc2 = Parameter(outMap, 1, init = fixedValue, value = 1) m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -48,7 +51,8 @@ ResNetNode2Inc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scScale, W { W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b1 = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc1 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + #sc1 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + sc1 = Parameter(outMap, 1, init = fixedValue, value = 1) m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -58,7 +62,8 @@ ResNetNode2Inc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scScale, W W2 = Parameter(outMap, wCount, init = Gaussian, initValueScale = wScale) b2 = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + #sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + sc2 = Parameter(outMap, 1, init = fixedValue, value = 1) m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) diff --git a/Source/CNTK/SynchronousExecutionEngine.cpp b/Source/CNTK/SynchronousExecutionEngine.cpp index 5cd5e845b312..40a166d47be6 100644 --- a/Source/CNTK/SynchronousExecutionEngine.cpp +++ b/Source/CNTK/SynchronousExecutionEngine.cpp @@ -452,8 +452,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool eval = node->GetOptionalParameter("eval", "false"); bool spatial = node->GetOptionalParameter("spatial", "false"); double expAvgFactor = node->GetOptionalParameter("expAvgFactor", "1.0"); + ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "CHW")); - nodePtr = builder.BatchNormalization(nullptr, nullptr, nullptr, nullptr, nullptr, eval, spatial, expAvgFactor, name); + nodePtr = builder.BatchNormalization(nullptr, nullptr, nullptr, nullptr, nullptr, eval, spatial, expAvgFactor, imageLayoutKind, name); } } else diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp index cddee2800f30..4c6a94c6d84c 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp @@ -610,9 +610,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { template shared_ptr> ComputationNetworkBuilder::BatchNormalization(const ComputationNodePtr input, const ComputationNodePtr scale, const ComputationNodePtr bias, const ComputationNodePtr runMean, const ComputationNodePtr runInvStdDev, - bool eval, bool spatial, double expAvgFactor, const std::wstring nodeName) + bool eval, bool spatial, double expAvgFactor, ImageLayoutKind imageLayoutKind, const std::wstring nodeName) { - return net.AddNodeToNetAndAttachInputs(New>(net.GetDeviceId(), nodeName, eval, spatial, expAvgFactor), + return net.AddNodeToNetAndAttachInputs(New>(net.GetDeviceId(), nodeName, eval, spatial, expAvgFactor, imageLayoutKind), input, scale, bias, runMean, 
runInvStdDev); } diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.h b/Source/ComputationNetworkLib/ComputationNetworkBuilder.h index ce0dc84ccfde..6237cf7cef9e 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.h +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.h @@ -132,7 +132,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputationNodePtr TimeReverse(const ComputationNodePtr input, const std::wstring nodeName = L""); ComputationNodePtr LookupTable(const ComputationNodePtr dictionary, const ComputationNodePtr input, const std::wstring nodeName = L""); ComputationNodePtr BatchNormalization(const ComputationNodePtr input, const ComputationNodePtr scale, const ComputationNodePtr bias, - const ComputationNodePtr runMean, const ComputationNodePtr runInvStdDev, bool eval = false, bool spatial = false, double expAvgFactor = 1, const std::wstring nodeName = L""); + const ComputationNodePtr runMean, const ComputationNodePtr runInvStdDev, bool eval = false, bool spatial = false, double expAvgFactor = 1, ImageLayoutKind imageLayoutKind = ImageLayoutKind::CHW, const std::wstring nodeName = L""); }; // create a new from config diff --git a/Source/ComputationNetworkLib/ConvolutionalNodes.h b/Source/ComputationNetworkLib/ConvolutionalNodes.h index 92a3f073f9a3..d86deb2b3878 100644 --- a/Source/ComputationNetworkLib/ConvolutionalNodes.h +++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h @@ -591,15 +591,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { static const std::wstring TypeName() { return L"BatchNormalization"; } public: BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring & name) : - Base(deviceId, name), m_eval(false), m_spatial(false), m_expAvgFactor(0) + Base(deviceId, name), m_eval(false), m_spatial(false), m_expAvgFactor(0), m_sampleCount(0), m_imageLayoutKind(ImageLayoutKind::CHW) { } - BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring & name, bool eval, bool spatial, double expAvgFactor) : - Base(deviceId, name), m_eval(eval), m_spatial(spatial), m_expAvgFactor(expAvgFactor) + BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring & name, bool eval, bool spatial, double expAvgFactor, ImageLayoutKind imageLayoutKind) : + Base(deviceId, name), m_eval(eval), m_spatial(spatial), m_expAvgFactor(expAvgFactor), m_imageLayoutKind(imageLayoutKind), m_sampleCount(0) { } BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp) : - BatchNormalizationNode(configp->Get(L"deviceId"), L"", configp->Get(L"eval"), configp->Get(L"spatial"), configp->Get(L"expAvgFactor")) + BatchNormalizationNode(configp->Get(L"deviceId"), L"", configp->Get(L"eval"), configp->Get(L"spatial"), configp->Get(L"expAvgFactor"), + ImageLayoutKindFrom(configp->Get(L"imageLayout"))) { AttachInputs(configp, this->GetExpectedNumInputs()); } @@ -612,6 +613,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { fstream << m_eval; fstream << m_spatial; fstream << m_expAvgFactor; + fstream << (int32_t)m_imageLayoutKind; + fstream << m_sampleCount; } void Load(File& fstream, size_t modelVersion) override @@ -635,6 +638,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { fstream >> m_eval; fstream >> m_spatial; fstream >> m_expAvgFactor; + if (verWritten >= 0x00010002) + { + fstream >> m_imageLayoutKind; + fstream >> m_sampleCount; + } } void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override @@ -733,8 +741,6 @@ namespace Microsoft { namespace MSR { 
namespace CNTK { if (isFinalValidationPass) { - const auto m_imageLayoutKind = ImageLayoutKind::CHW; // BUGBUG: Finish this. Must be serialized. - auto shape = GetSampleLayout(); if (m_factory == nullptr) @@ -794,8 +800,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { private: struct VersionInfo { - int32_t VerWrittenCur() const { return 0x00010001; } // Initial - int32_t VerReadableCur() const { return 0x00010001; } + //int32_t VerWrittenCur() const { return 0x00010001; } // Initial + int32_t VerWrittenCur() const { return 0x00010002; } // Added m_imageLayoutKind and m_sampleCount + int32_t VerReadableCur() const { return 0x00010002; } int32_t VerWeCanReadBack() const { return 0x00010001; } }; VersionInfo m_version; @@ -808,6 +815,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool m_spatial; // Smoothing factor. double m_expAvgFactor; + // Layout (e.g. CHW). + ImageLayoutKind m_imageLayoutKind; + // Sample count, used to compute cumulative moving average. + size_t m_sampleCount; + // Stores pre-computed on forward pass mean values that are used in gradient computation. shared_ptr> m_saveMean; // Stores pre-computed on forward pass InvStdDev values that are used in gradient computation. From cc2a836c85e04525817529381ae43f1e1c2a2607 Mon Sep 17 00:00:00 2001 From: Alexey Kamenev Date: Fri, 8 Jan 2016 16:57:27 -0800 Subject: [PATCH 36/49] Updated samples, added ResNet-50. --- .../CIFAR-10/02_BatchNormConv.ndl | 12 +- .../Miscellaneous/CIFAR-10/03_ResNet.config | 2 +- .../Miscellaneous/CIFAR-10/03_ResNet.mel | 5 + .../Miscellaneous/CIFAR-10/03_ResNet.ndl | 24 ++-- .../Image/Miscellaneous/CIFAR-10/Macros.ndl | 52 ++++---- .../Miscellaneous/ImageNet/ResNet/Macros.ndl | 28 ++-- .../ImageNet/ResNet/ResNet_152.ndl | 30 +++-- .../ImageNet/ResNet/ResNet_34.ndl | 24 ++-- .../ImageNet/ResNet/ResNet_50.config | 123 ++++++++++++++++++ .../ImageNet/ResNet/ResNet_50.ndl | 80 ++++++++++++ .../ConvolutionalNodes.h | 19 +-- 11 files changed, 316 insertions(+), 83 deletions(-) create mode 100644 Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.config create mode 100644 Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.ndl diff --git a/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv.ndl b/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv.ndl index e67e96ea8f10..c446156a2bbb 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv.ndl +++ b/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv.ndl @@ -19,8 +19,10 @@ ndlMnistMacros = [ conv3WScale = 1.414 conv3BValue = 0 - scScale = 0.03 + scValue = 1 + expAvg = 1 + fc1WScale = 12 fc1BValue = 0 fc2WScale = 1.5 @@ -35,7 +37,7 @@ DNN=[ hStride1 = 1 vStride1 = 1 # weight[cMap1, kW1 * kH1 * ImageC] - conv1 = ConvBNReLULayer(featScaled, cMap1, 75, kW1, kH1, hStride1, vStride1, conv1WScale, conv1BValue, scScale) + conv1 = ConvBNReLULayer(featScaled, cMap1, 75, kW1, kH1, hStride1, vStride1, conv1WScale, conv1BValue, scValue, expAvg) # pool1 pool1W = 3 @@ -51,7 +53,7 @@ DNN=[ hStride2 = 1 vStride2 = 1 # weight[cMap2, kW2 * kH2 * cMap1] - conv2 = ConvBNReLULayer(pool1, cMap2, 800, kW2, kH2, hStride2, vStride2, conv2WScale, conv2BValue, scScale) + conv2 = ConvBNReLULayer(pool1, cMap2, 800, kW2, kH2, hStride2, vStride2, conv2WScale, conv2BValue, scValue, expAvg) # pool2 pool2W = 3 @@ -67,7 +69,7 @@ DNN=[ hStride3 = 1 vStride3 = 1 # weight[cMap3, kW3 * kH3 * cMap2] - conv3 = ConvBNReLULayer(pool2, cMap3, 800, kW3, kH3, hStride3, vStride3, conv3WScale, conv3BValue, scScale) + conv3 = ConvBNReLULayer(pool2, cMap3, 
800, kW3, kH3, hStride3, vStride3, conv3WScale, conv3BValue, scValue, expAvg) # pool3 pool3W = 3 @@ -77,7 +79,7 @@ DNN=[ pool3 = MaxPooling(conv3, pool3W, pool3H, pool3hStride, pool3vStride, imageLayout = "cudnn") hiddenDim = 64 - h1 = DnnBNReLULayer(576, hiddenDim, pool3, fc1WScale, fc1BValue) + h1 = DnnBNReLULayer(576, hiddenDim, pool3, fc1WScale, fc1BValue, scValue, expAvg) ol = DNNLastLayer(hiddenDim, labelDim, h1, fc2WScale, fc2BValue) CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria) diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config index 45eb04b156fc..c3fd40bfefff 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config @@ -34,7 +34,7 @@ Train=[ minibatchSize=128 learningRatesPerMB=0.1*80:0.01*40:0.001 momentumPerMB=0.9 - maxEpochs=120 + maxEpochs=160 L2RegWeight=0.0001 dropoutRate=0 diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.mel b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.mel index 115e1f43af88..3c1ef2e34716 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.mel +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.mel @@ -23,6 +23,9 @@ rn2_1.bn1_e = BatchNormalization(rn2_1.c1, rn2_1.sc1, rn2_1.b1, rn2_1.m1, rn2_1. SetNodeInput(rn2_1.y1, 0, rn2_1.bn1_e) rn2_1.bn2_e = BatchNormalization(rn2_1.c2, rn2_1.sc2, rn2_1.b2, rn2_1.m2, rn2_1.isd2, eval = true, spatial = true, imageLayout = "cudnn") SetNodeInput(rn2_1.p, 0, rn2_1.bn2_e) +#rn2_1.bn_proj_e = BatchNormalization(rn2_1.c_proj, rn2_1.sc_proj, rn2_1.b_proj, rn2_1.m_proj, rn2_1.isd_proj, eval = true, spatial = true, imageLayout = "cudnn") +SetNodeInput(rn2_1.p, 0, rn2_1.bn2_e) +#SetNodeInput(rn2_1.p, 1, rn2_1.bn_proj_e) rn2_2.bn1_e = BatchNormalization(rn2_2.c1, rn2_2.sc1, rn2_2.b1, rn2_2.m1, rn2_2.isd1, eval = true, spatial = true, imageLayout = "cudnn") SetNodeInput(rn2_2.y1, 0, rn2_2.bn1_e) @@ -37,7 +40,9 @@ SetNodeInput(rn2_3.p, 0, rn2_3.bn2_e) rn3_1.bn1_e = BatchNormalization(rn3_1.c1, rn3_1.sc1, rn3_1.b1, rn3_1.m1, rn3_1.isd1, eval = true, spatial = true, imageLayout = "cudnn") SetNodeInput(rn3_1.y1, 0, rn3_1.bn1_e) rn3_1.bn2_e = BatchNormalization(rn3_1.c2, rn3_1.sc2, rn3_1.b2, rn3_1.m2, rn3_1.isd2, eval = true, spatial = true, imageLayout = "cudnn") +#rn3_1.bn_proj_e = BatchNormalization(rn3_1.c_proj, rn3_1.sc_proj, rn3_1.b_proj, rn3_1.m_proj, rn3_1.isd_proj, eval = true, spatial = true, imageLayout = "cudnn") SetNodeInput(rn3_1.p, 0, rn3_1.bn2_e) +#SetNodeInput(rn3_1.p, 1, rn3_1.bn_proj_e) rn3_2.bn1_e = BatchNormalization(rn3_2.c1, rn3_2.sc1, rn3_2.b1, rn3_2.m1, rn3_2.isd1, eval = true, spatial = true, imageLayout = "cudnn") SetNodeInput(rn3_2.y1, 0, rn3_2.bn1_e) diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl index cdee45f2f4fe..d84a9de37dbe 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl @@ -17,7 +17,9 @@ LocalMacros = [ fc1WScale = 12 fc1BValue = 0 - scValue = 0.03 + scValue = 1 + + expAvg = 1 kW = 3 kH = 3 @@ -30,23 +32,23 @@ LocalMacros = [ DNN=[ cMap1 = 16 - conv1 = ConvBNReLULayer(featScaled, cMap1, 27, kW, kH, hStride1, vStride1, convWScale, convBValue, scValue) + conv1 = ConvBNReLULayer(featScaled, cMap1, 27, kW, kH, hStride1, vStride1, convWScale, convBValue, scValue, expAvg) - rn1_1 = ResNetNode2(conv1, cMap1, 144, kW, kH, convWScale, convBValue, scValue) - rn1_2 = ResNetNode2(rn1_1, 
cMap1, 144, kW, kH, convWScale, convBValue, scValue) - rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue) + rn1_1 = ResNetNode2(conv1, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_2 = ResNetNode2(rn1_1, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) cMap2 = 32 rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false) - rn2_1 = ResNetNode2Inc(rn1_3, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, rn2_1_Wproj) - rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue) - rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue) + rn2_1 = ResNetNode2Inc(rn1_3, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, expAvg, rn2_1_Wproj) + rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) cMap3 = 64 rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false) - rn3_1 = ResNetNode2Inc(rn2_3, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, rn3_1_Wproj) - rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue) - rn3_3 = ResNetNode2(rn3_2, cMap3, 576, kW, kH, convWScale, convBValue, scValue) + rn3_1 = ResNetNode2Inc(rn2_3, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, expAvg, rn3_1_Wproj) + rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_3 = ResNetNode2(rn3_2, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) # Global average pooling poolW = 8 diff --git a/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl b/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl index a5edb7ac0d11..6f2dcde046cf 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl +++ b/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl @@ -7,71 +7,77 @@ ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) y = RectifiedLinear(p); } -ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scScale) +ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, expAvg) { W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b = Parameter(outMap, 1, init = fixedValue, value = bValue) - #sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) - sc = Parameter(outMap, 1, init = fixedValue, value = 1) + sc = Parameter(outMap, 1, init = fixedValue, value = scValue) m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn") - bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn") + bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn") y = RectifiedLinear(bn); } -ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scScale) +ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, expAvg) { W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b1 = Parameter(outMap, 1, init = fixedValue, value = bValue) - #sc1 = 
Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) - sc1 = Parameter(outMap, 1, init = fixedValue, value = 1) + sc1 = Parameter(outMap, 1, init = fixedValue, value = scValue) m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) c1 = Convolution(W1, inp, kW, kH, outMap, 1, 1, zeroPadding = true, imageLayout = "cudnn") - bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn") + bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn") y1 = RectifiedLinear(bn1); W2 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b2 = Parameter(outMap, 1, init = fixedValue, value = bValue) - #sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) - sc2 = Parameter(outMap, 1, init = fixedValue, value = 1) + sc2 = Parameter(outMap, 1, init = fixedValue, value = scValue) m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true, imageLayout = "cudnn") - bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn") + bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn") p = Plus(bn2, inp) y2 = RectifiedLinear(p); } -ResNetNode2Inc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scScale, Wproj) +ResNetNode2Inc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, expAvg, Wproj) { + # First convolution layer. W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b1 = Parameter(outMap, 1, init = fixedValue, value = bValue) - #sc1 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) - sc1 = Parameter(outMap, 1, init = fixedValue, value = 1) + sc1 = Parameter(outMap, 1, init = fixedValue, value = scValue) m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) c1 = Convolution(W1, inp, kW, kH, outMap, 2, 2, zeroPadding = true, imageLayout = "cudnn") - bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn") + bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn") y1 = RectifiedLinear(bn1); + # Second convolution layer. 
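    # (The first 3x3 convolution above downsamples with stride 2, while this
    # second one keeps stride 1; the shortcut projection below also uses
    # stride 2 so that both inputs of the Plus have matching dimensions.)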
W2 = Parameter(outMap, wCount, init = Gaussian, initValueScale = wScale) b2 = Parameter(outMap, 1, init = fixedValue, value = bValue) - #sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) - sc2 = Parameter(outMap, 1, init = fixedValue, value = 1) + sc2 = Parameter(outMap, 1, init = fixedValue, value = scValue) m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true, imageLayout = "cudnn") - bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn") + bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn") - cproj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false, imageLayout = "cudnn") - p = Plus(bn2, cproj) + # Projection convolution layer. + #b_proj = Parameter(outMap, 1, init = fixedValue, value = bValue) + #sc_proj = Parameter(outMap, 1, init = fixedValue, value = scValue) + #m_proj = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) + #isd_proj = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) + + c_proj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false, imageLayout = "cudnn") + #bn_proj = BatchNormalization(c_proj, sc_proj, b_proj, m_proj, isd_proj, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn") + + #p = Plus(bn2, bn_proj) + p = Plus(bn2, c_proj) y2 = RectifiedLinear(p); } @@ -84,15 +90,15 @@ DnnReLULayer(inDim, outDim, x, wScale, bValue) y = RectifiedLinear(z) } -DnnBNReLULayer(inDim, outDim, x, wScale, bValue) +DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, expAvg) { W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale) b = Parameter(outDim, 1, init = fixedValue, value = bValue) - sc = Parameter(outDim, 1, init = Gaussian, initValueScale = 0.01) + sc = Parameter(outDim, 1, init = fixedValue, value = scValue) m = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false) isd = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false) t = Times(W, x) - bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, imageLayout = "cudnn") + bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, expAvgFactor = expAvg) y = RectifiedLinear(bn) } diff --git a/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl b/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl index f3bc70221b4b..47af2feb1936 100644 --- a/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl +++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl @@ -2,7 +2,7 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, { W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue) + sc = Parameter(outMap, 1, init = fixedValue, value = scValue) m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -16,7 +16,7 @@ ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue) { W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b1 = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc1 = 
Parameter(outMap, 1, init = Gaussian, initValueScale = scValue) + sc1 = Parameter(outMap, 1, init = fixedValue, value = scValue) m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -26,7 +26,7 @@ ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue) W2 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b2 = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue) + sc2 = Parameter(outMap, 1, init = fixedValue, value = scValue) m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -41,7 +41,7 @@ ResNetNode2Conv(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, { W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b1 = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc1 = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue) + sc1 = Parameter(outMap, 1, init = fixedValue, value = scValue) m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -51,7 +51,7 @@ ResNetNode2Conv(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, W2 = Parameter(outMap, wCount, init = Gaussian, initValueScale = wScale) b2 = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue) + sc2 = Parameter(outMap, 1, init = fixedValue, value = scValue) m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -69,7 +69,7 @@ ResNetNode3(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue) # 1x1 reducing convolution. W1 = Parameter(convMap, inMap, init = Gaussian, initValueScale = wScale) b1 = Parameter(convMap, 1, init = fixedValue, value = bValue) - sc1 = Parameter(convMap, 1, init = Gaussian, initValueScale = scValue) + sc1 = Parameter(convMap, 1, init = fixedValue, value = scValue) m1 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false) isd1 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -80,7 +80,7 @@ ResNetNode3(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue) # 3x3 convolution. W2 = Parameter(convMap, convWCount, init = Gaussian, initValueScale = wScale) b2 = Parameter(convMap, 1, init = fixedValue, value = bValue) - sc2 = Parameter(convMap, 1, init = Gaussian, initValueScale = scValue) + sc2 = Parameter(convMap, 1, init = fixedValue, value = scValue) m2 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false) isd2 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -91,7 +91,7 @@ ResNetNode3(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue) # 1x1 expanding convolution. 
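    # (ResNetNode3 is the bottleneck block: a 1x1 convolution reduces the
    # channel count to convMap, a 3x3 convolution operates on the reduced
    # maps, and this 1x1 convolution expands back to outMap before the
    # residual Plus.)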
W3 = Parameter(outMap, convMap, init = Gaussian, initValueScale = wScale) b3 = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc3 = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue) + sc3 = Parameter(outMap, 1, init = fixedValue, value = scValue) m3 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd3 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -102,12 +102,12 @@ ResNetNode3(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue) y3 = RectifiedLinear(p); } -ResNetNode3Inc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, wProj) +ResNetNode3Inc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, wProj, projStride) { # 1x1 reducing convolution. W1 = Parameter(convMap, inMap, init = Gaussian, initValueScale = wScale) b1 = Parameter(convMap, 1, init = fixedValue, value = bValue) - sc1 = Parameter(convMap, 1, init = Gaussian, initValueScale = scValue) + sc1 = Parameter(convMap, 1, init = fixedValue, value = scValue) m1 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false) isd1 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -118,18 +118,18 @@ ResNetNode3Inc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, # 3x3 convolution. W2 = Parameter(convMap, convWCount, init = Gaussian, initValueScale = wScale) b2 = Parameter(convMap, 1, init = fixedValue, value = bValue) - sc2 = Parameter(convMap, 1, init = Gaussian, initValueScale = scValue) + sc2 = Parameter(convMap, 1, init = fixedValue, value = scValue) m2 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false) isd2 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false) - c2 = Convolution(W2, y1, 3, 3, convMap, 2, 2, zeroPadding = true, imageLayout = "cudnn") + c2 = Convolution(W2, y1, 3, 3, convMap, projStride, projStride, zeroPadding = true, imageLayout = "cudnn") bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn") y2 = RectifiedLinear(bn2); # 1x1 expanding convolution. 
W3 = Parameter(outMap, convMap, init = Gaussian, initValueScale = wScale) b3 = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc3 = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue) + sc3 = Parameter(outMap, 1, init = fixedValue, value = scValue) m3 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd3 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -137,7 +137,7 @@ ResNetNode3Inc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, bn3 = BatchNormalization(c3, sc3, b3, m3, isd3, eval = false, spatial = true, imageLayout = "cudnn") # Increasing input dimension convolution - cProj = Convolution(wProj, inp, 1, 1, outMap, 2, 2, zeroPadding = false, imageLayout = "cudnn") + cProj = Convolution(wProj, inp, 1, 1, outMap, projStride, projStride, zeroPadding = false, imageLayout = "cudnn") p = Plus(bn3, cProj) y3 = RectifiedLinear(p); diff --git a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_152.ndl b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_152.ndl index 23b4fb86e038..71c54e3fff67 100644 --- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_152.ndl +++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_152.ndl @@ -8,8 +8,6 @@ ndlMacros = [ LabelDim = 1000 features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn") - featOffs = Const(0, rows = 150528) - featScaled = Plus(features, featOffs) labels = Input(LabelDim, tag = label) # Kernels width and height. @@ -28,7 +26,7 @@ ndlMacros = [ # Initial parameter values. convWScale = 7.07 convBValue = 0 - scValue = 0.03 + scValue = 1 fcWScale = 3.0 fcBValue = 1 ] @@ -41,16 +39,21 @@ DNN=[ cMap5 = 1024 cMap6 = 2048 - conv1 = ConvBNReLULayer(featScaled, cMap1, 147, 7, 7, 2, 2, convWScale, convBValue, scValue) - pool1 = MaxPooling(conv1, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn") + conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, convWScale, convBValue, scValue) + # Max pooling + pool1W = 2 + pool1H = 2 + pool1hs = 2 + pool1vs = 2 + pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hs, pool1vs, imageLayout = "cudnn") rn1_1_Wproj = Parameter(cMap3, cMap1, init = fromFile, initFromFilePath = "$Proj64to256Filename$", needGradient = false) - rn1_1 = ResNetNode3Inc(pool1, cMap1, cMap1, cMap3, 576, convWScale, convBValue, scValue, rn1_1_Wproj) + rn1_1 = ResNetNode3Inc(pool1, cMap1, cMap1, cMap3, 576, convWScale, convBValue, scValue, rn1_1_Wproj, 1) rn1_2 = ResNetNode3(rn1_1, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue) rn1_3 = ResNetNode3(rn1_2, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue) rn2_1_Wproj = Parameter(cMap4, cMap3, init = fromFile, initFromFilePath = "$Proj256to512Filename$", needGradient = false) - rn2_1 = ResNetNode3Inc(rn1_3, cMap3, cMap2, cMap4, 1152, convWScale, convBValue, scValue, rn2_1_Wproj) + rn2_1 = ResNetNode3Inc(rn1_3, cMap3, cMap2, cMap4, 1152, convWScale, convBValue, scValue, rn2_1_Wproj, 2) rn2_2 = ResNetNode3(rn2_1, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue) rn2_3 = ResNetNode3(rn2_2, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue) rn2_4 = ResNetNode3(rn2_3, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue) @@ -60,7 +63,7 @@ DNN=[ rn2_8 = ResNetNode3(rn2_7, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue) rn3_1_Wproj = Parameter(cMap5, cMap4, init = fromFile, initFromFilePath = "$Proj512to1024Filename$", needGradient = false) - rn3_1 = ResNetNode3Inc(rn2_8, cMap4, cMap3, cMap5, 2304, convWScale, 
convBValue, scValue, rn3_1_Wproj) + rn3_1 = ResNetNode3Inc(rn2_8, cMap4, cMap3, cMap5, 2304, convWScale, convBValue, scValue, rn3_1_Wproj, 2) rn3_2 = ResNetNode3(rn3_1, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue) rn3_3 = ResNetNode3(rn3_2, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue) rn3_4 = ResNetNode3(rn3_3, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue) @@ -98,13 +101,18 @@ DNN=[ rn3_36= ResNetNode3(rn3_35, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue) rn4_1_Wproj = Parameter(cMap6, cMap5, init = fromFile, initFromFilePath = "$Proj1024to2048Filename$", needGradient = false) - rn4_1 = ResNetNode3Inc(rn3_36, cMap5, cMap4, cMap6, 4608, convWScale, convBValue, scValue, rn4_1_Wproj) + rn4_1 = ResNetNode3Inc(rn3_36, cMap5, cMap4, cMap6, 4608, convWScale, convBValue, scValue, rn4_1_Wproj, 2) rn4_2 = ResNetNode3(rn4_1, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue) rn4_3 = ResNetNode3(rn4_2, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue) - pool5 = AveragePooling(rn4_3, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn") + # Global average pooling + pool2W = 7 + pool2H = 7 + pool2hs = 1 + pool2vs = 1 + pool2 = AveragePooling(rn4_3, pool2W, pool2H, pool2hs, pool2vs, imageLayout = "cudnn") - ol = DnnLayer(8192, labelDim, pool5, fcWScale, fcBValue) + ol = DnnLayer(cMap6, labelDim, pool2, fcWScale, fcBValue) CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria) Err = ErrorPrediction(labels, ol, tag = Eval) diff --git a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_34.ndl b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_34.ndl index c1297f32547e..73108ca6da1c 100644 --- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_34.ndl +++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_34.ndl @@ -17,16 +17,10 @@ ndlMacros = [ hs = 1 vs = 1 - # Pooling settings. - poolW = 2 - poolH = 2 - poolhs = 2 - poolvs = 2 - # Initial parameter values. 
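    # (scValue = 1 initializes the batch-norm scale parameters to the
    # identity transform; earlier revisions drew them from a Gaussian with
    # initValueScale 0.03.)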
convWScale = 7.07 convBValue = 0 - scValue = 0.03 + scValue = 1 fcWScale = 3.0 fcBValue = 1 ] @@ -34,7 +28,12 @@ ndlMacros = [ DNN=[ cMap1 = 64 conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, convWScale, convBValue, scValue) - pool1 = MaxPooling(conv1, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn") + # Max pooling + pool1W = 2 + pool1H = 2 + pool1hs = 2 + pool1vs = 2 + pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hs, pool1vs, imageLayout = "cudnn") rn1_1 = ResNetNode2(pool1, cMap1, 576, kW, kH, convWScale, convBValue, scValue) rn1_2 = ResNetNode2(rn1_1, cMap1, 576, kW, kH, convWScale, convBValue, scValue) @@ -62,9 +61,14 @@ DNN=[ rn4_2 = ResNetNode2(rn4_1, cMap4, 4608, kW, kH, convWScale, convBValue, scValue) rn4_3 = ResNetNode2(rn4_2, cMap4, 4608, kW, kH, convWScale, convBValue, scValue) - pool5 = AveragePooling(rn4_3, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn") + # Global average pooling + pool2W = 7 + pool2H = 7 + pool2hs = 1 + pool2vs = 1 + pool5 = AveragePooling(rn4_3, pool2W, pool2H, pool2hs, pool2vs, imageLayout = "cudnn") - ol = DnnLayer(4608, labelDim, pool5, fcWScale, fcBValue) + ol = DnnLayer(cMap4, labelDim, pool5, fcWScale, fcBValue) CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria) Err = ErrorPrediction(labels, ol, tag = Eval) diff --git a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.config b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.config new file mode 100644 index 000000000000..520e68bc7551 --- /dev/null +++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.config @@ -0,0 +1,123 @@ +RootDir = "." + +ConfigDir = "$RootDir$" +DataDir = "$RootDir$" +OutputDir = "$RootDir$/Output" +ModelDir = "$OutputDir$/Models" + +ndlMacros=$ConfigDir$/Macros.ndl + +precision=float +deviceId=Auto + +command=Train:AddTop5Eval:Test + +parallelTrain=false + +stderr=$OutputDir$/ResNet_50 +traceLevel=1 + +Proj64to256Filename = $ConfigDir$/64to256.txt +Proj256to512Filename = $ConfigDir$/256to512.txt +Proj512to1024Filename = $ConfigDir$/512to1024.txt +Proj1024to2048Filename = $ConfigDir$/1024to2048.txt + +Train=[ + action=train + modelPath=$ModelDir$/ResNet_50 + + NDLNetworkBuilder=[ + networkDescription=$ConfigDir$/ResNet_50.ndl + ] + + SGD=[ + epochSize=0 + minibatchSize=32 + learningRatesPerMB=0.1*30:0.03*30:0.01*25:0.003*25:0.001 + momentumPerMB=0.9 + maxEpochs=120 + gradUpdateType=None + L2RegWeight=0.0001 + dropoutRate=0 + + ParallelTrain=[ + parallelizationMethod=DataParallelSGD + distributedMBReading=true + parallelizationStartEpoch=1 + DataParallelSGD=[ + gradientBits=1 + ] + ] + + numMBsToShowResult=100 + ] + + reader=[ + readerType=ImageReader + # Map file which maps images to labels using the following format: + # + # Example: + # C:\Data\ImageNet\2012\train\n01440764\n01440764_10026.JPEG0 + file=$DataDir$/train_map.txt + # Randomize images before every epoch. Possible values: None, Auto. Default: Auto. + randomize=Auto + features=[ + # Below are the required parameters. + width=224 + height=224 + channels=3 + # Below are the optional parameters. + # Possible values: Center, Random. Default: Center + cropType=Random + # Horizontal random flip, will be enabled by default if cropType=Random + #hflip=0 + # Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1. + cropRatio=0.46666:0.875 + # Crop scale ratio jitter type. + # Possible values: None, UniRatio, UniLength, UniArea. Default: UniRatio + jitterType=UniRatio + # Interpolation to use when scaling image to width x height size. 
+ # Possible values: nearest, linear, cubic, lanczos. Default: linear. + interpolations=Linear + # Stores mean values for each pixel in OpenCV matrix XML format. + meanFile=$ConfigDir$/ImageNet1K_mean.xml + ] + labels=[ + labelDim=1000 + ] + ] +] + +AddTop5Eval=[ + action=edit + CurModel=$ModelDir$/ResNet_50 + NewModel=$ModelDir$/ResNet_50.Top5 + editPath=$ConfigDir$/add_top5_layer.mel +] + +Test=[ + action=test + modelPath=$ModelDir$/ResNet_50.Top5 + # Set minibatch size for testing. + minibatchSize=32 + + NDLNetworkBuilder=[ + networkDescription=$ConfigDir$/ResNet_50.ndl + ] + + reader=[ + readerType=ImageReader + file=$DataDir$/val_map.txt + randomize=None + features=[ + width=224 + height=224 + channels=3 + cropType=Center + meanFile=$ConfigDir$/ImageNet1K_mean.xml + ] + labels=[ + labelDim=1000 + ] + ] +] diff --git a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.ndl b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.ndl new file mode 100644 index 000000000000..4d38a2af9315 --- /dev/null +++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.ndl @@ -0,0 +1,80 @@ +load=ndlMacros +run=DNN + +ndlMacros = [ + ImageW = 224 + ImageH = 224 + ImageC = 3 + LabelDim = 1000 + + features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn") + labels = Input(LabelDim, tag = label) + + # Kernels width and height. + kW = 3 + kH = 3 + # Kernel stride. + hs = 1 + vs = 1 + + # Initial parameter values. + convWScale = 7.07 + convBValue = 0 + scValue = 1 + fcWScale = 3.0 + fcBValue = 1 +] + +DNN=[ + cMap1 = 64 + cMap2 = 128 + cMap3 = 256 + cMap4 = 512 + cMap5 = 1024 + cMap6 = 2048 + + conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, convWScale, convBValue, scValue) + # Max pooling + pool1W = 2 + pool1H = 2 + pool1hs = 2 + pool1vs = 2 + pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hs, pool1vs, imageLayout = "cudnn") + + rn1_1_Wproj = Parameter(cMap3, cMap1, init = fromFile, initFromFilePath = "$Proj64to256Filename$", needGradient = false) + rn1_1 = ResNetNode3Inc(pool1, cMap1, cMap1, cMap3, 576, convWScale, convBValue, scValue, rn1_1_Wproj, 1) + rn1_2 = ResNetNode3(rn1_1, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue) + rn1_3 = ResNetNode3(rn1_2, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue) + + rn2_1_Wproj = Parameter(cMap4, cMap3, init = fromFile, initFromFilePath = "$Proj256to512Filename$", needGradient = false) + rn2_1 = ResNetNode3Inc(rn1_3, cMap3, cMap2, cMap4, 1152, convWScale, convBValue, scValue, rn2_1_Wproj, 2) + rn2_2 = ResNetNode3(rn2_1, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue) + rn2_3 = ResNetNode3(rn2_2, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue) + rn2_4 = ResNetNode3(rn2_3, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue) + + rn3_1_Wproj = Parameter(cMap5, cMap4, init = fromFile, initFromFilePath = "$Proj512to1024Filename$", needGradient = false) + rn3_1 = ResNetNode3Inc(rn2_4, cMap4, cMap3, cMap5, 2304, convWScale, convBValue, scValue, rn3_1_Wproj, 2) + rn3_2 = ResNetNode3(rn3_1, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue) + rn3_3 = ResNetNode3(rn3_2, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue) + rn3_4 = ResNetNode3(rn3_3, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue) + rn3_5 = ResNetNode3(rn3_4, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue) + rn3_6 = ResNetNode3(rn3_5, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue) + + rn4_1_Wproj = Parameter(cMap6, cMap5, init = fromFile, initFromFilePath = 
"$Proj1024to2048Filename$", needGradient = false) + rn4_1 = ResNetNode3Inc(rn3_6, cMap5, cMap4, cMap6, 4608, convWScale, convBValue, scValue, rn4_1_Wproj, 2) + rn4_2 = ResNetNode3(rn4_1, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue) + rn4_3 = ResNetNode3(rn4_2, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue) + + # Global average pooling + pool2W = 7 + pool2H = 7 + pool2hs = 1 + pool2vs = 1 + pool2 = AveragePooling(rn4_3, pool2W, pool2H, pool2hs, pool2vs, imageLayout = "cudnn") + + ol = DnnLayer(cMap6, labelDim, pool2, fcWScale, fcBValue) + + CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria) + Err = ErrorPrediction(labels, ol, tag = Eval) + OutputNodes = ol +] diff --git a/Source/ComputationNetworkLib/ConvolutionalNodes.h b/Source/ComputationNetworkLib/ConvolutionalNodes.h index d86deb2b3878..dffa996af6cf 100644 --- a/Source/ComputationNetworkLib/ConvolutionalNodes.h +++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h @@ -591,11 +591,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { static const std::wstring TypeName() { return L"BatchNormalization"; } public: BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring & name) : - Base(deviceId, name), m_eval(false), m_spatial(false), m_expAvgFactor(0), m_sampleCount(0), m_imageLayoutKind(ImageLayoutKind::CHW) + Base(deviceId, name), m_eval(false), m_spatial(false), m_expAvgFactor(0), m_mbCount(0), m_imageLayoutKind(ImageLayoutKind::CHW) { } BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring & name, bool eval, bool spatial, double expAvgFactor, ImageLayoutKind imageLayoutKind) : - Base(deviceId, name), m_eval(eval), m_spatial(spatial), m_expAvgFactor(expAvgFactor), m_imageLayoutKind(imageLayoutKind), m_sampleCount(0) + Base(deviceId, name), m_eval(eval), m_spatial(spatial), m_expAvgFactor(expAvgFactor), m_imageLayoutKind(imageLayoutKind), m_mbCount(0) { } BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp) : @@ -614,7 +614,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { fstream << m_spatial; fstream << m_expAvgFactor; fstream << (int32_t)m_imageLayoutKind; - fstream << m_sampleCount; + fstream << m_mbCount; } void Load(File& fstream, size_t modelVersion) override @@ -641,7 +641,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (verWritten >= 0x00010002) { fstream >> m_imageLayoutKind; - fstream >> m_sampleCount; + fstream >> m_mbCount; } } @@ -724,8 +724,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_convEng->NormalizeBatchInference(*m_inT, sliceInputValue, *m_scaleBiasT, scale, bias, m_spatial, runMean, runInvStdDev, sliceOutputValue); else { - m_convEng->NormalizeBatch(*m_inT, sliceInputValue, *m_scaleBiasT, scale, bias, m_spatial, m_expAvgFactor, runMean, runInvStdDev, + // REVIEW alexeyk: hack, use m_expAvgFactor <= 0 to compute CMA. + double expAvgFactor = (m_expAvgFactor > 0) ? 
m_expAvgFactor : (1.0 / (1.0 + m_mbCount)); + m_convEng->NormalizeBatch(*m_inT, sliceInputValue, *m_scaleBiasT, scale, bias, m_spatial, expAvgFactor, runMean, runInvStdDev, sliceOutputValue, *m_saveMean, *m_saveInvStdDev); + m_mbCount++; } #if NANCHECK sliceOutputValue.HasNan("BatchNormalization"); @@ -801,7 +804,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { struct VersionInfo { //int32_t VerWrittenCur() const { return 0x00010001; } // Initial - int32_t VerWrittenCur() const { return 0x00010002; } // Added m_imageLayoutKind and m_sampleCount + int32_t VerWrittenCur() const { return 0x00010002; } // Added m_imageLayoutKind and m_mbCount int32_t VerReadableCur() const { return 0x00010002; } int32_t VerWeCanReadBack() const { return 0x00010001; } }; @@ -817,8 +820,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { double m_expAvgFactor; // Layout (e.g. CHW). ImageLayoutKind m_imageLayoutKind; - // Sample count, used to compute cumulative moving average. - size_t m_sampleCount; + // Minibatch count, used to compute cumulative moving average. + size_t m_mbCount; // Stores pre-computed on forward pass mean values that are used in gradient computation. shared_ptr> m_saveMean; From 9e25b7e61a9ffede4ba903d3cfd86cb2d843c953 Mon Sep 17 00:00:00 2001 From: Alexey Kamenev Date: Tue, 12 Jan 2016 13:37:38 -0800 Subject: [PATCH 37/49] Removed Resize from BN code. Updated samples. --- .../Miscellaneous/CIFAR-10/03_ResNet.config | 6 +++--- .../Image/Miscellaneous/CIFAR-10/03_ResNet.ndl | 7 ++++--- .../ComputationNetworkLib/ConvolutionalNodes.h | 13 ++++++++++++- Source/Math/CuDnnConvolutionEngine.cpp | 17 ++++++++++++----- 4 files changed, 31 insertions(+), 12 deletions(-) diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config index c3fd40bfefff..dd6c394a6471 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config @@ -34,7 +34,7 @@ Train=[ minibatchSize=128 learningRatesPerMB=0.1*80:0.01*40:0.001 momentumPerMB=0.9 - maxEpochs=160 + maxEpochs=80 L2RegWeight=0.0001 dropoutRate=0 @@ -60,7 +60,7 @@ Train=[ cropRatio=0.8 jitterType=UniRatio interpolations=Linear - #meanFile= + meanFile=$ConfigDir$/CIFAR-10_mean.xml ] labels=[ labelDim=10 @@ -97,7 +97,7 @@ Test=[ cropRatio=1 jitterType=UniRatio interpolations=Linear - #meanFile= + meanFile=$ConfigDir$/CIFAR-10_mean.xml ] labels=[ labelDim=10 diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl index d84a9de37dbe..f267db665f7a 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl @@ -8,8 +8,8 @@ LocalMacros = [ LabelDim = 10 features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn") - featOffs = Const(128) - featScaled = Minus(features, featOffs) + #featOffs = Const(128) + #featScaled = Minus(features, featOffs) labels = Input(LabelDim, tag = label) convWScale = 7.07 @@ -31,8 +31,9 @@ LocalMacros = [ ] DNN=[ + conv1WScale = 0.26 cMap1 = 16 - conv1 = ConvBNReLULayer(featScaled, cMap1, 27, kW, kH, hStride1, vStride1, convWScale, convBValue, scValue, expAvg) + conv1 = ConvBNReLULayer(features, cMap1, 27, kW, kH, hStride1, vStride1, conv1WScale, convBValue, scValue, expAvg) rn1_1 = ResNetNode2(conv1, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) rn1_2 = ResNetNode2(rn1_1, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) 
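A note on the running-statistics path above: when m_expAvgFactor <= 0, each minibatch blends its statistics into the running estimates with blend factor 1/(1 + m_mbCount), so after t minibatches runMean equals the plain average of the t per-minibatch means. A minimal sketch of that recurrence (toy Python with made-up data; variable names are ours, not CNTK's):

# Cumulative moving average via a shrinking blend factor:
#   runMean <- (1 - a) * runMean + a * mbMean,  with a = 1 / (1 + t)
means = [2.0, 4.0, 6.0, 8.0]            # toy per-minibatch means
run_mean = 0.0
for t, mb_mean in enumerate(means):     # t = minibatches seen so far (m_mbCount)
    a = 1.0 / (1.0 + t)                 # mirrors expAvgFactor = 1/(1 + m_mbCount)
    run_mean = (1.0 - a) * run_mean + a * mb_mean
assert abs(run_mean - sum(means) / len(means)) < 1e-12   # equals the plain mean

The same recurrence with a fixed blend factor gives the usual exponential moving average instead.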
diff --git a/Source/ComputationNetworkLib/ConvolutionalNodes.h b/Source/ComputationNetworkLib/ConvolutionalNodes.h index dffa996af6cf..9c06b0795f23 100644 --- a/Source/ComputationNetworkLib/ConvolutionalNodes.h +++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h @@ -726,12 +726,23 @@ namespace Microsoft { namespace MSR { namespace CNTK { { // REVIEW alexeyk: hack, use m_expAvgFactor <= 0 to compute CMA. double expAvgFactor = (m_expAvgFactor > 0) ? m_expAvgFactor : (1.0 / (1.0 + m_mbCount)); + + if (m_saveMean->GetNumElements() != runMean.GetNumElements()) + m_saveMean->Resize(runMean.GetNumRows(), runMean.GetNumCols()); + if (m_saveInvStdDev->GetNumElements() != runMean.GetNumElements()) + m_saveInvStdDev->Resize(runMean.GetNumRows(), runMean.GetNumCols()); + m_convEng->NormalizeBatch(*m_inT, sliceInputValue, *m_scaleBiasT, scale, bias, m_spatial, expAvgFactor, runMean, runInvStdDev, sliceOutputValue, *m_saveMean, *m_saveInvStdDev); + m_mbCount++; } #if NANCHECK - sliceOutputValue.HasNan("BatchNormalization"); + sliceOutputValue.HasNan("BatchNormalization-output"); + runMean.HasNan("BatchNormalization-runMean"); + runInvStdDev.HasNan("BatchNormalization-runInvStdDev"); + m_saveMean->HasNan("BatchNormalization-saveMean"); + m_saveInvStdDev->HasNan("BatchNormalization-saveInvStdDev"); #endif } diff --git a/Source/Math/CuDnnConvolutionEngine.cpp b/Source/Math/CuDnnConvolutionEngine.cpp index ae9336d56910..e5c7c871c064 100644 --- a/Source/Math/CuDnnConvolutionEngine.cpp +++ b/Source/Math/CuDnnConvolutionEngine.cpp @@ -312,28 +312,35 @@ namespace Microsoft { namespace MSR { namespace CNTK { void NormalizeBatch(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias, bool spatial, double expAvgFactor, Mat& runMean, Mat& runInvStdDev, Mat& out, Mat& saveMean, Mat& saveInvStdDev) override { + const size_t crowIn = inT.w() * inT.h() * inT.c(); + UNUSED(crowIn); // crowIn used only in asserts. if (spatial) { assert(scaleBiasT.c() == inT.c()); assert(scaleBiasT.w() == 1); assert(scaleBiasT.h() == 1); + assert(runMean.GetNumRows() == inT.c()); + assert(runMean.GetNumCols() == 1); + assert(runInvStdDev.GetNumRows() == inT.c()); + assert(runInvStdDev.GetNumCols() == 1); } else { assert(scaleBiasT.c() == inT.c()); assert(scaleBiasT.w() == inT.w()); assert(scaleBiasT.h() == inT.h()); + assert(runMean.GetNumRows() == crowIn); + assert(runMean.GetNumCols() == 1); + assert(runInvStdDev.GetNumRows() == crowIn); + assert(runInvStdDev.GetNumCols() == 1); } assert(scaleBiasT.n() == 1); - const size_t crowIn = inT.w() * inT.h() * inT.c(); assert(crowIn == in.GetNumRows()); assert(inT.n() == in.GetNumCols()); + assert(saveMean.GetNumElements() >= runMean.GetNumElements()); + assert(saveInvStdDev.GetNumElements() >= runInvStdDev.GetNumElements()); cudnnBatchNormMode_t mode = spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION; - runMean.Resize(spatial ? inT.c() : crowIn, 1); - runInvStdDev.Resize(runMean.GetNumRows(), 1); - saveMean.Resize(runMean.GetNumRows(), 1); - saveInvStdDev.Resize(runMean.GetNumRows(), 1); CUDNN_CALL(cudnnBatchNormalizationForwardTraining(m_cudnn, mode, &C::One, &C::Zero, t(inT), ptr(in), t(inT), ptr(out), t(scaleBiasT), ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runInvStdDev), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev))); } From 92e8a4d136492be3bd3f5b4cd9031f3a5b1e45fb Mon Sep 17 00:00:00 2001 From: Alexey Kamenev Date: Tue, 12 Jan 2016 14:25:13 -0800 Subject: [PATCH 38/49] Added BN eval mode to MEL. 
Updated samples. --- .../CIFAR-10/02_BatchNormConv.mel | 12 +- .../Miscellaneous/CIFAR-10/03_ResNet.config | 2 +- .../Miscellaneous/CIFAR-10/03_ResNet.mel | 53 +-------- .../Miscellaneous/CIFAR-10/03_ResNet.ndl | 2 - .../CIFAR-10/04_ResNet_56.config | 106 +++++++++++++++++ .../Miscellaneous/CIFAR-10/04_ResNet_56.ndl | 110 ++++++++++++++++++ Source/CNTK/ModelEditLanguage.cpp | 44 ++++++- .../ComputationNetwork.h | 1 + .../ComputationNetworkEditing.cpp | 38 ++++++ .../ConvolutionalNodes.h | 5 + 10 files changed, 306 insertions(+), 67 deletions(-) create mode 100644 Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.config create mode 100644 Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.ndl diff --git a/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv.mel b/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv.mel index 809ac67784e5..c36f29c41b9b 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv.mel +++ b/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv.mel @@ -1,16 +1,6 @@ m=LoadModel($CurModel$, format=cntk) SetDefaultModel(m) -conv1.bn_e = BatchNormalization(conv1.c, conv1.sc, conv1.b, conv1.m, conv1.isd, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(conv1.y, 0, conv1.bn_e) - -conv2.bn_e = BatchNormalization(conv2.c, conv2.sc, conv2.b, conv2.m, conv2.isd, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(conv2.y, 0, conv2.bn_e) - -conv3.bn_e = BatchNormalization(conv3.c, conv3.sc, conv3.b, conv3.m, conv3.isd, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(conv3.y, 0, conv3.bn_e) - -h1.bn_e = BatchNormalization(h1.t, h1.sc, h1.b, h1.m, h1.isd, eval = true, spatial = false) -SetNodeInput(h1.y, 0, h1.bn_e) +SetPropertyForSubTree(CE, batchNormEvalMode, true) SaveModel(m, $NewModel$, format=cntk) \ No newline at end of file diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config index dd6c394a6471..a32619c04a5b 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config @@ -34,7 +34,7 @@ Train=[ minibatchSize=128 learningRatesPerMB=0.1*80:0.01*40:0.001 momentumPerMB=0.9 - maxEpochs=80 + maxEpochs=160 L2RegWeight=0.0001 dropoutRate=0 diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.mel b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.mel index 3c1ef2e34716..c36f29c41b9b 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.mel +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.mel @@ -1,57 +1,6 @@ m=LoadModel($CurModel$, format=cntk) SetDefaultModel(m) -conv1.bn_e = BatchNormalization(conv1.c, conv1.sc, conv1.b, conv1.m, conv1.isd, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(conv1.y, 0, conv1.bn_e) - -rn1_1.bn1_e = BatchNormalization(rn1_1.c1, rn1_1.sc1, rn1_1.b1, rn1_1.m1, rn1_1.isd1, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn1_1.y1, 0, rn1_1.bn1_e) -rn1_1.bn2_e = BatchNormalization(rn1_1.c2, rn1_1.sc2, rn1_1.b2, rn1_1.m2, rn1_1.isd2, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn1_1.p, 0, rn1_1.bn2_e) - -rn1_2.bn1_e = BatchNormalization(rn1_2.c1, rn1_2.sc1, rn1_2.b1, rn1_2.m1, rn1_2.isd1, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn1_2.y1, 0, rn1_2.bn1_e) -rn1_2.bn2_e = BatchNormalization(rn1_2.c2, rn1_2.sc2, rn1_2.b2, rn1_2.m2, rn1_2.isd2, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn1_2.p, 0, rn1_2.bn2_e) - -rn1_3.bn1_e = 
BatchNormalization(rn1_3.c1, rn1_3.sc1, rn1_3.b1, rn1_3.m1, rn1_3.isd1, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn1_3.y1, 0, rn1_3.bn1_e) -rn1_3.bn2_e = BatchNormalization(rn1_3.c2, rn1_3.sc2, rn1_3.b2, rn1_3.m2, rn1_3.isd2, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn1_3.p, 0, rn1_3.bn2_e) - -rn2_1.bn1_e = BatchNormalization(rn2_1.c1, rn2_1.sc1, rn2_1.b1, rn2_1.m1, rn2_1.isd1, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn2_1.y1, 0, rn2_1.bn1_e) -rn2_1.bn2_e = BatchNormalization(rn2_1.c2, rn2_1.sc2, rn2_1.b2, rn2_1.m2, rn2_1.isd2, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn2_1.p, 0, rn2_1.bn2_e) -#rn2_1.bn_proj_e = BatchNormalization(rn2_1.c_proj, rn2_1.sc_proj, rn2_1.b_proj, rn2_1.m_proj, rn2_1.isd_proj, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn2_1.p, 0, rn2_1.bn2_e) -#SetNodeInput(rn2_1.p, 1, rn2_1.bn_proj_e) - -rn2_2.bn1_e = BatchNormalization(rn2_2.c1, rn2_2.sc1, rn2_2.b1, rn2_2.m1, rn2_2.isd1, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn2_2.y1, 0, rn2_2.bn1_e) -rn2_2.bn2_e = BatchNormalization(rn2_2.c2, rn2_2.sc2, rn2_2.b2, rn2_2.m2, rn2_2.isd2, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn2_2.p, 0, rn2_2.bn2_e) - -rn2_3.bn1_e = BatchNormalization(rn2_3.c1, rn2_3.sc1, rn2_3.b1, rn2_3.m1, rn2_3.isd1, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn2_3.y1, 0, rn2_3.bn1_e) -rn2_3.bn2_e = BatchNormalization(rn2_3.c2, rn2_3.sc2, rn2_3.b2, rn2_3.m2, rn2_3.isd2, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn2_3.p, 0, rn2_3.bn2_e) - -rn3_1.bn1_e = BatchNormalization(rn3_1.c1, rn3_1.sc1, rn3_1.b1, rn3_1.m1, rn3_1.isd1, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn3_1.y1, 0, rn3_1.bn1_e) -rn3_1.bn2_e = BatchNormalization(rn3_1.c2, rn3_1.sc2, rn3_1.b2, rn3_1.m2, rn3_1.isd2, eval = true, spatial = true, imageLayout = "cudnn") -#rn3_1.bn_proj_e = BatchNormalization(rn3_1.c_proj, rn3_1.sc_proj, rn3_1.b_proj, rn3_1.m_proj, rn3_1.isd_proj, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn3_1.p, 0, rn3_1.bn2_e) -#SetNodeInput(rn3_1.p, 1, rn3_1.bn_proj_e) - -rn3_2.bn1_e = BatchNormalization(rn3_2.c1, rn3_2.sc1, rn3_2.b1, rn3_2.m1, rn3_2.isd1, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn3_2.y1, 0, rn3_2.bn1_e) -rn3_2.bn2_e = BatchNormalization(rn3_2.c2, rn3_2.sc2, rn3_2.b2, rn3_2.m2, rn3_2.isd2, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn3_2.p, 0, rn3_2.bn2_e) - -rn3_3.bn1_e = BatchNormalization(rn3_3.c1, rn3_3.sc1, rn3_3.b1, rn3_3.m1, rn3_3.isd1, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn3_3.y1, 0, rn3_3.bn1_e) -rn3_3.bn2_e = BatchNormalization(rn3_3.c2, rn3_3.sc2, rn3_3.b2, rn3_3.m2, rn3_3.isd2, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn3_3.p, 0, rn3_3.bn2_e) +SetPropertyForSubTree(CE, batchNormEvalMode, true) SaveModel(m, $NewModel$, format=cntk) \ No newline at end of file diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl index f267db665f7a..3d3e69be6bb3 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl @@ -8,8 +8,6 @@ LocalMacros = [ LabelDim = 10 features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn") - #featOffs = Const(128) - #featScaled = Minus(features, featOffs) labels = 
Input(LabelDim, tag = label) convWScale = 7.07 diff --git a/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.config b/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.config new file mode 100644 index 000000000000..dc20fc41c14a --- /dev/null +++ b/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.config @@ -0,0 +1,106 @@ +RootDir = "." + +ConfigDir = "$RootDir$" +DataDir = "$RootDir$" +OutputDir = "$RootDir$/Output" +ModelDir = "$OutputDir$/Models" + +ndlMacros=$ConfigDir$/Macros.ndl + +precision=float +deviceId=Auto +prefetch=true +parallelTrain=false + +command=Train:AddBNEval:Test + +stderr=$OutputDir$/04_ResNet_56 +traceLevel=1 +numMBsToShowResult=200 + +Proj16to32Filename = $ConfigDir$/16to32.txt +Proj32to64Filename = $ConfigDir$/32to64.txt + +Train=[ + action=train + modelPath=$ModelDir$/04_ResNet_56 + + NDLNetworkBuilder=[ + networkDescription=$ConfigDir$/04_ResNet_56.ndl + ] + + SGD=[ + epochSize=0 + minibatchSize=128 + learningRatesPerMB=0.1*80:0.01*40:0.001 + momentumPerMB=0.9 + maxEpochs=1 + L2RegWeight=0.0001 + dropoutRate=0 + + ParallelTrain=[ + parallelizationMethod=DataParallelSGD + distributedMBReading=true + parallelizationStartEpoch=1 + DataParallelSGD=[ + gradientBits=1 + ] + ] + ] + + reader=[ + readerType=ImageReader + file=$DataDir$/train_map.txt + randomize=Auto + features=[ + width=32 + height=32 + channels=3 + cropType=Random + cropRatio=0.8 + jitterType=UniRatio + interpolations=Linear + meanFile=$ConfigDir$/CIFAR-10_mean.xml + ] + labels=[ + labelDim=10 + ] + ] +] + +AddBNEval=[ + action=edit + CurModel=$ModelDir$/04_ResNet_56 + NewModel=$ModelDir$/04_ResNet_56.Eval + editPath=$ConfigDir$/03_ResNet.mel +] + +Test=[ + action=test + modelPath=$ModelDir$/04_ResNet_56 + # Set minibatch size for testing. + minibatchSize=512 + + NDLNetworkBuilder=[ + networkDescription=$ConfigDir$/04_ResNet_56.ndl + ] + + reader=[ + readerType=ImageReader + file=$DataDir$/test_map.txt + randomize=Auto + features=[ + width=32 + height=32 + channels=3 + cropType=Center + cropRatio=1 + jitterType=UniRatio + interpolations=Linear + meanFile=$ConfigDir$/CIFAR-10_mean.xml + ] + labels=[ + labelDim=10 + ] + ] +] diff --git a/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.ndl b/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.ndl new file mode 100644 index 000000000000..98bceba2c9dd --- /dev/null +++ b/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.ndl @@ -0,0 +1,110 @@ +load=LocalMacros +run=DNN + +LocalMacros = [ + ImageW = 32 + ImageH = 32 + ImageC = 3 + LabelDim = 10 + + features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn") + labels = Input(LabelDim, tag = label) + + convWScale = 7.07 + convBValue = 0 + fc1WScale = 12 + fc1BValue = 0 + + scValue = 1 + + expAvg = 1 + + kW = 3 + kH = 3 + + hStride1 = 1 + vStride1 = 1 + hStride2 = 2 + vStride2 = 2 +] + +DNN=[ + conv1WScale = 0.26 + cMap1 = 16 + conv1 = ConvBNReLULayer(features, cMap1, 27, kW, kH, hStride1, vStride1, conv1WScale, convBValue, scValue, expAvg) + + rn1_1 = ResNetNode2(conv1, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_2 = ResNetNode2(rn1_1, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_4 = ResNetNode2(rn1_3, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_5 = ResNetNode2(rn1_4, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_6 = ResNetNode2(rn1_5, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_7 = 
ResNetNode2(rn1_6, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_8 = ResNetNode2(rn1_7, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_9 = ResNetNode2(rn1_8, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_10= ResNetNode2(rn1_9, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_11= ResNetNode2(rn1_10, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_12= ResNetNode2(rn1_11, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_13= ResNetNode2(rn1_12, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_14= ResNetNode2(rn1_13, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_15= ResNetNode2(rn1_14, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_16= ResNetNode2(rn1_15, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_17= ResNetNode2(rn1_16, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_18= ResNetNode2(rn1_17, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + + cMap2 = 32 + rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false) + rn2_1 = ResNetNode2Inc(rn1_18, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, expAvg, rn2_1_Wproj) + rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_4 = ResNetNode2(rn2_3, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_5 = ResNetNode2(rn2_4, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_6 = ResNetNode2(rn2_5, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_7 = ResNetNode2(rn2_6, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_8 = ResNetNode2(rn2_7, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_9 = ResNetNode2(rn2_8, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_10= ResNetNode2(rn2_9, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_11= ResNetNode2(rn2_10, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_12= ResNetNode2(rn2_11, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_13= ResNetNode2(rn2_12, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_14= ResNetNode2(rn2_13, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_15= ResNetNode2(rn2_14, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_16= ResNetNode2(rn2_15, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_17= ResNetNode2(rn2_16, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_18= ResNetNode2(rn2_17, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + + cMap3 = 64 + rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false) + rn3_1 = ResNetNode2Inc(rn2_18, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, expAvg, rn3_1_Wproj) + rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_3 = ResNetNode2(rn3_2, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_4 = ResNetNode2(rn3_3, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_5 = ResNetNode2(rn3_4, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_6 = ResNetNode2(rn3_5, cMap3, 576, kW, kH, convWScale, convBValue, 
scValue, expAvg) + rn3_7 = ResNetNode2(rn3_6, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_8 = ResNetNode2(rn3_7, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_9 = ResNetNode2(rn3_8, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_10= ResNetNode2(rn3_9, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_11= ResNetNode2(rn3_10, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_12= ResNetNode2(rn3_11, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_13= ResNetNode2(rn3_12, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_14= ResNetNode2(rn3_13, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_15= ResNetNode2(rn3_14, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_16= ResNetNode2(rn3_15, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_17= ResNetNode2(rn3_16, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_18= ResNetNode2(rn3_17, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + + # Global average pooling + poolW = 8 + poolH = 8 + poolhStride = 1 + poolvStride = 1 + pool = AveragePooling(rn3_18, poolW, poolH, poolhStride, poolvStride, imageLayout = "cudnn") + + ol = DnnLastLayer(cMap3, labelDim, pool, fc1WScale, fc1BValue) + + CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria) + Err = ErrorPrediction(labels, ol, tag = Eval) + OutputNodes = ol +] + diff --git a/Source/CNTK/ModelEditLanguage.cpp b/Source/CNTK/ModelEditLanguage.cpp index 981b63ffd53f..9b1ec6fa176d 100644 --- a/Source/CNTK/ModelEditLanguage.cpp +++ b/Source/CNTK/ModelEditLanguage.cpp @@ -9,6 +9,7 @@ #define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings #include "ModelEditLanguage.h" +#include "ConvolutionalNodes.h" #include namespace Microsoft { namespace MSR { namespace CNTK { @@ -56,7 +57,8 @@ enum MELProperty melPropFinalCriterion, melPropEvaluation, melPropOutput, - melPropRecurrent + melPropRecurrent, + melPropBatchNormMode }; // SetProperty - Set the Property on the passed node @@ -420,6 +422,10 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa { prop = melPropEvaluation; } + else if (EqualInsensitive(propName, "batchNormEvalMode")) + { + prop = melPropBatchNormMode; + } else if (EqualInsensitive(propName, "output")) { prop = melPropOutput; @@ -485,6 +491,32 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa // what to do here? break; } + case melPropBatchNormMode: + { + if (node->OperationName() != OperationNameOf(BatchNormalizationNode)) + { + RuntimeError("Invalid node type: node %ls (type:%ls) is not a %ls node; therefore cannot apply batchNormEvalMode on it.", + node->NodeName().c_str(), + node->OperationName().c_str(), + OperationNameOf(BatchNormalizationNode).c_str()); + } + bool property = params[2]; + auto pnode = dynamic_pointer_cast>(node); + if (pnode) + pnode->SetEvalMode(property); + else + { + auto pnode2 = dynamic_pointer_cast>(node); + if (pnode2) + pnode2->SetEvalMode(property); + else + { + RuntimeError("Invalid node type: node name=%ls. 
We assume either BatchNormalizationNode or BatchNormalizationNode\n", + node->NodeName().c_str()); + } + } + break; + } default: { RuntimeError("Invalid property, %s, is not supported", propName.c_str()); @@ -505,6 +537,10 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa { prop = melPropComputeGradient; } + else if (EqualInsensitive(propName, "batchNormEvalMode")) + { + prop = melPropBatchNormMode; + } else { RuntimeError("Invalid property, %s, is not supported", propName.c_str()); @@ -527,6 +563,12 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa netNdl->cn->SetLearnableNodesBelowNeedGradient(needGradient, node); break; } + case melPropBatchNormMode: + { + bool evalMode = params[2]; + netNdl->cn->SetBatchNormlizationNodesBelowEvalMode(evalMode, node); + break; + } default: { RuntimeError("Invalid property, %s, is not supported", propName.c_str()); diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 26b78d8be05f..0a9b3bf8ac29 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -344,6 +344,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb void AddFeatureNode(ComputationNodeBasePtr featureNode); void RemoveFeatureNode(ComputationNodeBasePtr featureNode); void SetLearnableNodesBelowNeedGradient(const bool needGradient, const ComputationNodeBasePtr& rootNode = nullptr); + void SetBatchNormlizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode = nullptr); // ----------------------------------------------------------------------- // node access diff --git a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp index bdd1a063114d..6dddc73a3942 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp @@ -10,6 +10,7 @@ #include "ComputationNode.h" #include "ComputationNetwork.h" #include "InputAndParamNodes.h" +#include "ConvolutionalNodes.h" #include #include #include @@ -314,4 +315,41 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } + void ComputationNetwork::SetBatchNormlizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode /* = nullptr */) + { + vector nodes; + if (rootNode == nullptr) + { + for (auto pair : m_nameToNodeMap) + { + nodes.push_back(pair.second); + } + } + else + { + auto allnodes = rootNode->EnumerateNodes(true); + for (auto node : allnodes) + nodes.push_back(node); + } + + for (auto& node : nodes) + { + if (node->OperationName() == OperationNameOf(BatchNormalizationNode)) + { + auto pNode = dynamic_pointer_cast>(node); + if (!pNode) + { + auto pNode2 = dynamic_pointer_cast>(node); + if (!pNode2) + { + RuntimeError("Invalid node type: node name=%ls. 
We assume either BatchNormalizationNode or BatchNormalizationNode\n", node->NodeName().c_str()); + } + } + else + { + pNode->SetEvalMode(evalMode); + } + } + } + } }}} diff --git a/Source/ComputationNetworkLib/ConvolutionalNodes.h b/Source/ComputationNetworkLib/ConvolutionalNodes.h index 9c06b0795f23..3d2a7a34383d 100644 --- a/Source/ComputationNetworkLib/ConvolutionalNodes.h +++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h @@ -811,6 +811,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } + void SetEvalMode(bool bnEvalMode) + { + m_eval = bnEvalMode; + } + private: struct VersionInfo { From 7b0159a41daa845f61a9314eebe01be585551d2c Mon Sep 17 00:00:00 2001 From: Alexey Kamenev Date: Tue, 12 Jan 2016 14:53:46 -0800 Subject: [PATCH 39/49] Added Python conversion script, updated readme.txt. --- .../Miscellaneous/CIFAR-10/CifarConverter.py | 64 +++++++++++++++++++ .../Image/Miscellaneous/CIFAR-10/readme.txt | 5 ++ 2 files changed, 69 insertions(+) create mode 100644 Examples/Image/Miscellaneous/CIFAR-10/CifarConverter.py diff --git a/Examples/Image/Miscellaneous/CIFAR-10/CifarConverter.py b/Examples/Image/Miscellaneous/CIFAR-10/CifarConverter.py new file mode 100644 index 000000000000..b1be6d15b6d4 --- /dev/null +++ b/Examples/Image/Miscellaneous/CIFAR-10/CifarConverter.py @@ -0,0 +1,64 @@ +import os +import sys +import struct +import cPickle as cp +from PIL import Image +import numpy as np +import xml.etree.cElementTree as et +import xml.dom.minidom + +imgSize = 32 + +def saveImage(fname, data, label, mapFile, pad, **key_parms): + # data in CIFAR-10 dataset is in CHW format. + pixData = data.reshape((3, imgSize, imgSize)) + if ('mean' in key_parms): + key_parms['mean'] += pixData + + if pad > 0: + pixData = np.pad(pixData, ((0, 0), (pad, pad), (pad, pad)), mode = 'edge') + + img = Image.new('RGB', (imgSize + 2 * pad, imgSize + 2 * pad)) + pixels = img.load() + for x in range(img.size[0]): + for y in range(img.size[1]): + pixels[x, y] = (pixData[0][y][x], pixData[1][y][x], pixData[2][y][x]) + img.save(fname) + mapFile.write("%s\t%d\n" % (fname, label)) + +def saveMean(fname, data): + root = et.Element('opencv_storage') + et.SubElement(root, 'Channel').text = '3' + et.SubElement(root, 'Row').text = str(imgSize) + et.SubElement(root, 'Col').text = str(imgSize) + meanImg = et.SubElement(root, 'MeanImg', type_id='opencv-matrix') + et.SubElement(meanImg, 'rows').text = '1' + et.SubElement(meanImg, 'cols').text = str(imgSize * imgSize * 3) + et.SubElement(meanImg, 'dt').text = 'f' + et.SubElement(meanImg, 'data').text = ' '.join(['%e' % n for n in np.reshape(data, (imgSize * imgSize * 3))]) + + tree = et.ElementTree(root) + tree.write(fname) + x = xml.dom.minidom.parse(fname) + with open(fname, 'w') as f: + f.write(x.toprettyxml(indent = ' ')) + +if __name__ == "__main__": + rootDir = r'C:\Data\CIFAR-10' + '\\' + data = {} + dataMean = np.zeros((3, imgSize, imgSize)) # mean is in CHW format. 
+ with open(rootDir + 'train_map.txt', 'w') as mapFile:
+ for ifile in range(1, 6):
+ with open(r'C:\Data\CIFAR-10\Python\data_batch_' + str(ifile), 'rb') as f:
+ data = cp.load(f)
+ for i in range(10000):
+ fname = '%sdata\\train\\%05d.png' % (rootDir, i + (ifile - 1) * 10000)
+ saveImage(fname, data['data'][i, :], data['labels'][i], mapFile, 4, mean=dataMean)
+ dataMean = dataMean / (50 * 1000)
+ saveMean('%sdata\\CIFAR-10_mean.xml' % rootDir, dataMean)
+ with open(rootDir + 'test_map.txt', 'w') as mapFile:
+ with open(r'C:\Data\CIFAR-10\Python\test_batch', 'rb') as f:
+ data = cp.load(f)
+ for i in range(10000):
+ fname = '%sdata\\test\\%05d.png' % (rootDir, i)
+ saveImage(fname, data['data'][i, :], data['labels'][i], mapFile, 0)
diff --git a/Examples/Image/Miscellaneous/CIFAR-10/readme.txt b/Examples/Image/Miscellaneous/CIFAR-10/readme.txt
index ea57413fcdfd..27bc2939e4c7 100644
--- a/Examples/Image/Miscellaneous/CIFAR-10/readme.txt
+++ b/Examples/Image/Miscellaneous/CIFAR-10/readme.txt
@@ -19,5 +19,10 @@ The network produces 21% of error after training for about 3 minutes on GPU.
To run the sample, navigate to this folder and run the following command:
configFile=01_Conv.config configName=01_Conv

+02_BatchNormConv.ndl is a convolutional network which uses the batch normalization technique (http://arxiv.org/abs/1502.03167).
+
+03_ResNet.ndl and 04_ResNet_56.ndl are very deep convolutional networks that use the ResNet architecture and have 20 and 56 layers respectively (http://arxiv.org/abs/1512.03385).
+With 03_ResNet.config you should get an error rate of around 10%.
+
For more details, refer to .ndl and corresponding .config files.

From 914ac61c96e6c1b5f6aea15a2b876b2310ab6597 Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Tue, 12 Jan 2016 15:27:32 -0800
Subject: [PATCH 40/49] Fix an inconsistency after merge with master.

---
Source/ComputationNetworkLib/CompositeComputationNodes.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Source/ComputationNetworkLib/CompositeComputationNodes.h b/Source/ComputationNetworkLib/CompositeComputationNodes.h
index 26223101317a..8400407b79d6 100644
--- a/Source/ComputationNetworkLib/CompositeComputationNodes.h
+++ b/Source/ComputationNetworkLib/CompositeComputationNodes.h
@@ -296,7 +296,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CreateMatrixIfNull(m_value);
m_value->SetValue(value);
m_hasComputed = true;
- SetDims(value.GetNumRows(), value.GetNumCols());
+ SetDims(TensorShape(value.GetNumRows()), value.GetNumCols());
}
public:
bool m_hasComputed;

From 5bb9fbf6e6b1a9b2115c76fd45ec614550c2a9bc Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Tue, 12 Jan 2016 15:50:02 -0800
Subject: [PATCH 41/49] Fix a bug pointed out by Alexey. Thanks!

---
Source/CNTK/ModelEditLanguage.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Source/CNTK/ModelEditLanguage.cpp b/Source/CNTK/ModelEditLanguage.cpp
index 94a1dc185fb7..844fd2c1bd5a 100644
--- a/Source/CNTK/ModelEditLanguage.cpp
+++ b/Source/CNTK/ModelEditLanguage.cpp
@@ -538,7 +538,7 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa
{
prop = melPropComputeGradient;
}
- if (EqualInsensitive(propName, "batchNormEvalMode"))
+ else if (EqualInsensitive(propName, "batchNormEvalMode"))
{
prop = melPropBatchNormMode;
}

From c45401fbd095f39e90165d4185ce812ff2d1f5ec Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Tue, 12 Jan 2016 17:08:46 -0800
Subject: [PATCH 42/49] Add Nesterov's momentum.
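The smoothed gradient v is updated as v_t = momentum * v_{t-1} + (1 - momentum) * learnRatePerSample * g_t, and the weights then take the Nesterov-style step w_t = w_{t-1} - momentum * v_t - (1 - momentum) * learnRatePerSample * g_t. A toy sketch of that update rule on a 1-D quadratic (illustration only; the function and names are ours, not CNTK code):

# Nesterov-style step as in Matrix::NormalGrad below:
#   v <- m*v + (1-m)*lr*g;  w <- w - m*v - (1-m)*lr*g
def nag_step(w, v, g, lr=0.1, m=0.9):
    v = m * v + (1.0 - m) * lr * g
    w = w - m * v - (1.0 - m) * lr * g
    return w, v

w, v = 5.0, 0.0
for _ in range(100):
    g = 2.0 * w                  # gradient of f(w) = w^2
    w, v = nag_step(w, v, g)
print(w)                         # approaches the minimum at 0

The loop converges to the minimum, matching the behavior of the dense branches in the diff below.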
To use NAG, simply add useNAG=true.
---
Source/Math/Matrix.cpp | 51 ++++++++++++++++++++++++++++++--
Source/Math/Matrix.h | 2 +-
Source/SGDLib/MultiNetworksSGD.h | 2 +-
Source/SGDLib/SGD.cpp | 18 +++++++----
Source/SGDLib/SGD.h | 8 +++--
5 files changed, 69 insertions(+), 12 deletions(-)

diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp
index 3650db859f33..52a9375796eb 100644
--- a/Source/Math/Matrix.cpp
+++ b/Source/Math/Matrix.cpp
@@ -1383,17 +1383,62 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}

template
- void Matrix::NormalGrad(Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum)
+ void Matrix::NormalGrad(Matrix& gradients,
+ Matrix& functionValues,
+ const ElemType learnRatePerSample,
+ const ElemType momentum,
+ const bool useNesterovMomentum
+ )
{
DecideAndMoveToRightDevice(*this, gradients, functionValues);
-
- DISPATCH_MATRIX_ON_FLAG(&gradients,
+
+ if (!useNesterovMomentum)
+ {
+ DISPATCH_MATRIX_ON_FLAG(&gradients,
nullptr,
ScaleAndAdd((1-momentum) * learnRatePerSample, gradients, momentum, *this); functionValues -= *this,
ScaleAndAdd((1-momentum) * learnRatePerSample, gradients, momentum, *this); functionValues -= *this,
if (momentum != 0) gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues),
if (momentum != 0) gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues)
);
+ }
+ else
+ {
+ DISPATCH_MATRIX_ON_FLAG(&gradients,
+ nullptr,
+ {/* CPU dense */
+ ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
+ ScaleAndAdd(-momentum, *this, functionValues);
+ ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradients, functionValues);
+ // w_t = w_{t-1} - momentum * v_t - (1 - momentum) * learnRatePerSample * gradient
+ },
+ {/* GPU dense */
+ ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
+ ScaleAndAdd(-momentum, *this, functionValues);
+ ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradients, functionValues);
+ },
+ { /* CPU sparse */
+ if (momentum != 0)
+ {
+ Matrix gradientCache(gradients.GetDeviceId());
+ gradientCache.SetValue(gradients);
+ gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum);
+ ScaleAndAdd(-momentum, *this, functionValues);
+ ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradientCache, functionValues);
+ }
+ },
+ { /* GPU sparse */
+ if (momentum != 0)
+ {
+ Matrix gradientCache(gradients.GetDeviceId());
+ gradientCache.SetValue(gradients);
+ gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum);
+ ScaleAndAdd(-momentum, *this, functionValues);
+ ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradientCache, functionValues);
+ }
+ }
+ );
+ }
}

//both this and gradients will be changed
diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h
index 3791695299ff..94eb0dd53642 100644
--- a/Source/Math/Matrix.h
+++ b/Source/Math/Matrix.h
@@ -164,7 +164,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void ShiftBy(int numShift);

// TODO: all these scalars should be passed as doubles and cast down inside
- void NormalGrad(Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum);
+ void NormalGrad(Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum, const bool useNAG);
ElemType Adagrad(Matrix& gradients, const bool needAveMultiplier);
void FSAdagrad(size_t mbSize,
Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum); ElemType RmsProp(Matrix& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier); diff --git a/Source/SGDLib/MultiNetworksSGD.h b/Source/SGDLib/MultiNetworksSGD.h index 19f3f202526b..48d54f9af507 100644 --- a/Source/SGDLib/MultiNetworksSGD.h +++ b/Source/SGDLib/MultiNetworksSGD.h @@ -930,7 +930,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { Matrix& smoothedGradient = (*smoothedGradientIter); - UpdateWeights(node, smoothedGradient, learnRatePerSample, GetMomentumPerSample(epochNumber/*BUGBUG workaround:*/, dataReader[0]->GetNumParallelSequences()), actualMBSize, m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier); + UpdateWeights(node, smoothedGradient, learnRatePerSample, GetMomentumPerSample(epochNumber/*BUGBUG workaround:*/, dataReader[0]->GetNumParallelSequences()), actualMBSize, m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier, m_useNesterovMomentum); } } } diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index bab3d7a896d1..854f3d8551a3 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -1001,7 +1001,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { UpdateWeights(node, smoothedGradient, learnRatePerSample, GetMomentumPerSample(epochNumber/*BUGBUG workaround:*/, net->GetMBLayoutPtr()->GetNumParallelSequences()), aggregateNumSamples, m_L2RegWeight, m_L1RegWeight, - m_needAveMultiplier); + m_needAveMultiplier, m_useNesterovMomentum); #ifdef _DEBUG if (dynamic_pointer_cast>(node)->Value().HasNan("TrainOneEpoch/UpdateWeights(): ")) LogicError("%ls %ls operation has NaNs in functionValues after parameter update.", node->NodeName().c_str(), node->OperationName().c_str()); @@ -2022,7 +2022,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t actualMBSize, const double L2RegWeight, const double L1RegWeight, - const bool needAveMultiplier) + const bool needAveMultiplier, + const bool useNesterovMomentum + ) { // we use simple linear (instead of log linear) scaling here const double momentum = MomentumPerMB(momentumPerSample, actualMBSize); @@ -2063,7 +2065,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (adpType == GradientsUpdateType::None) { smoothedGradient.NormalGrad(gradientValues, functionValues, - (ElemType)learnRatePerSample, (ElemType)momentum); + (ElemType)learnRatePerSample, (ElemType)momentum, useNesterovMomentum); } else if (adpType == GradientsUpdateType::AdaGrad || (adpType == GradientsUpdateType::RmsProp && gradientValues.GetMatrixType() == MatrixType::SPARSE) || @@ -2113,7 +2115,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { const double momentumPerSample, const size_t actualMBSize, const double L2RegWeight, const double L1RegWeight, - const bool needAveMultiplier) const + const bool needAveMultiplier, + const bool useNesterovMomentum + ) const { #if DUMPOUTPUT fprintf(stderr, "Update_%ls\n", node->NodeName().c_str()); @@ -2124,7 +2128,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { UpdateWeightsS(this, dynamic_pointer_cast>(node)->Value(), dynamic_pointer_cast>(node)->Gradient(), smoothedGradient, learnRatePerSample, momentumPerSample, actualMBSize, L2RegWeight, L1RegWeight, - needAveMultiplier); + needAveMultiplier, m_useNesterovMomentum); node->BumpEvalTimeStamp(); } @@ -2514,6 +2518,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { floatargvector momentumPerMB = 
configSGD(L"momentumPerMB", ConfigRecordType::Array(floatargvector())); floatargvector momentumPerSample = configSGD(L"momentumPerSample", ConfigRecordType::Array(floatargvector())); floatargvector momentumAsTimeConstant = configSGD(L"momentumAsTimeConstant", ConfigRecordType::Array(floatargvector())); + bool useNesterovMomentum = configSGD(L"useNAG", false); + m_maxTempMemSizeInSamplesForCNN = configSGD(L"maxTempMemSizeInSamplesForCNN", (size_t)0); @@ -2633,6 +2639,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_momentumParam = floatargvector(L"0.9"); m_momentumSpecifiedForMBSize = m_mbSize; } + m_useNesterovMomentum = useNesterovMomentum; + for (int i = 0; i < m_momentumParam.size(); i++) { if (m_momentumParam[i] >= 1.0 || m_momentumParam[i] < 0.0) diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h index b99608b500f8..453f941c5bda 100644 --- a/Source/SGDLib/SGD.h +++ b/Source/SGDLib/SGD.h @@ -111,6 +111,7 @@ struct SGDParams : public ScriptableObjects::Object intargvector m_learningRatesSpecifiedForMBSize; // 1 for per sample, m_mbSize[] for per MB floatargvector m_momentumParam; intargvector m_momentumSpecifiedForMBSize; + bool m_useNesterovMomentum; // Determine the MB size used for mapping a given learning-rate or momentum parameter to a per-sample value. // MB size is the number of samples across all time steps and parallel sequences. @@ -440,7 +441,9 @@ class SGD : public SGDParams size_t actualMBSize, const double L2RegWeight, const double L1RegWeight, - const bool needAveMultiplier); + const bool needAveMultiplier, + const bool useNesterovMomentum + ); protected: // UpdateWeights - update the weights in @@ -450,7 +453,8 @@ class SGD : public SGDParams const double momentumPerSample, const size_t actualMBSize, const double L2RegWeight, const double L1RegWeight, - const bool needAveMultiplier) const; + const bool needAveMultiplier, + const bool useNesterovMomentum) const; void ClipGradient(Matrix& gradient, const size_t actualMBSize) const; From f0655f04d9ab0f6f828724b25bcac307a16c3098 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Tue, 12 Jan 2016 18:12:11 -0800 Subject: [PATCH 43/49] (make gcc happy) --- Source/SGDLib/MultiNetworksSGD.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Source/SGDLib/MultiNetworksSGD.h b/Source/SGDLib/MultiNetworksSGD.h index 48d54f9af507..01ba28c098fe 100644 --- a/Source/SGDLib/MultiNetworksSGD.h +++ b/Source/SGDLib/MultiNetworksSGD.h @@ -63,6 +63,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { using SGDBase::m_L2RegWeight; using SGDBase::m_L1RegWeight; using SGDBase::m_needAveMultiplier; + using SGDBase::m_useNesterovMomentum; using SGDBase::m_traceLevel; using SGDBase::m_numMBsToShowResult; using SGDBase::m_gradientCheckSigDigit; From 270726e324f774ce2339474a6cbe6726050f999d Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Tue, 12 Jan 2016 23:58:28 -0800 Subject: [PATCH 44/49] Make SequenceGammar calculation parameters configurable. 
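These knobs follow the usual lattice-rescoring conventions: amf is the acoustic model scale divisor, lmf the language model scale, wp a per-word penalty, bMMIfactor the boosting factor for boosted MMI, and sMBRmode switches the criterion to state-level MBR. As a rough sketch of how such scales typically combine on a lattice arc (the formula and names below are our assumption for illustration, not the actual lattice code):

# Hypothetical per-arc score combination under (amf, lmf, wp); illustrative only.
def arc_score(am_loglik, lm_logprob, amf=14.0, lmf=14.0, wp=0.0):
    # acoustic score scaled down by amf, LM score scaled up by lmf, plus word penalty
    return am_loglik / amf + lmf * lm_logprob + wp

print(arc_score(am_loglik=-120.0, lm_logprob=-3.2))   # one toy lattice arc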
--- .../ComputationNetwork.cpp | 19 ++++++++-- .../ComputationNetwork.h | 14 +++++++- .../TrainingCriterionNodes.h | 16 +++++++++ Source/SGDLib/SGD.cpp | 8 ++++- Source/SGDLib/SGD.h | 5 +++ Source/SequenceTrainingLib/gammacalculation.h | 36 ++++++++++++++++++- 6 files changed, 92 insertions(+), 6 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNetwork.cpp b/Source/ComputationNetworkLib/ComputationNetwork.cpp index 8da7ba6c71fa..8e1019c25722 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.cpp +++ b/Source/ComputationNetworkLib/ComputationNetwork.cpp @@ -622,7 +622,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { //set sequence training parameters, e.g. smoothing weight, frame drop threshhold template - void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign) + void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, + const ComputationNodeBasePtr criterionNode, + const double& hsmoothingWeight, + const double& frameDropThresh, + const bool& doreferencealign, + const double& amf /*= 14.0f*/, + const double& lmf /*= 14.0f*/, + const double& wp /*= 0.0f*/, + const double& bMMIfactor /*= 0.0f*/, + const bool& sMBR /*= false*/ + ) { fprintf(stderr, "Setting Hsmoothing weight to %.8g and frame-dropping threshhold to %.8g\n", hsmoothingWeight, frameDropThresh); list seqNodes = net->GetNodesWithType(OperationNameOf(SequenceWithSoftmaxNode), criterionNode); @@ -638,6 +648,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { node->SetSmoothWeight(hsmoothingWeight); node->SetFrameDropThresh(frameDropThresh); node->SetReferenceAlign(doreferencealign); + node->SetGammarCalculationParam(amf, lmf, wp, bMMIfactor, sMBR); } } } @@ -1118,14 +1129,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { template void ComputationNetwork::LoadPersistableParameters(File & fstream, bool create); template void ComputationNetwork::PerformSVDecomposition(const map& SVDConfig, size_t alignedsize); template /*static*/void ComputationNetwork::SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed); - template void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign); + template void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign, + const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR); template void ComputationNetwork::InitLearnableParameters(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const double initValueScale, bool initOnCPUOnly); template void ComputationNetwork::Load(const wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork); template void ComputationNetwork::LoadPersistableParameters(File & fstream, bool create); template void ComputationNetwork::PerformSVDecomposition(const map& SVDConfig, size_t alignedsize); template /*static*/void ComputationNetwork::SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double & 
prevDropoutRate, unsigned long & dropOutSeed); - template void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign); + template void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign, + const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR); // register ComputationNetwork with the ScriptableObject system ScriptableObjects::ConfigurableRuntimeTypeRegister::Add registerComputationNetwork(L"ComputationNetwork"); diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 6d41a0b89569..3921d7434094 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -412,8 +412,20 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb template static void SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed); + + + template - static void SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign); + static void SetSeqParam(ComputationNetworkPtr net, + const ComputationNodeBasePtr criterionNode, + const double& hsmoothingWeight, + const double& frameDropThresh, + const bool& doreferencealign, + const double& amf=14.0f, + const double& lmf=14.0f, + const double& wp=0.0f, + const double& bMMIfactor=0.0f, + const bool& sMBR=false); static void SetMaxTempMemSizeForCNN(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const size_t maxTempMemSizeInSamples); // ----------------------------------------------------------------------- diff --git a/Source/ComputationNetworkLib/TrainingCriterionNodes.h b/Source/ComputationNetworkLib/TrainingCriterionNodes.h index 1722f60aa938..a0f00586c3cf 100644 --- a/Source/ComputationNetworkLib/TrainingCriterionNodes.h +++ b/Source/ComputationNetworkLib/TrainingCriterionNodes.h @@ -1418,6 +1418,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_doReferenceAlignment = doreferencealign; } + void SetGammarCalculationParam(const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR) + { + msra::lattices::SeqGammarCalParam param; + param.amf = amf; + param.lmf = lmf; + param.wp = wp; + param.bMMIfactor = bMMIfactor; + param.sMBRmode = sMBR; + m_gammaCalculator.SetGammarCalculationParams(param); + } + void gettime(unsigned long long &gammatime, unsigned long long &partialtime) { gammatime = m_gammatime; @@ -1430,6 +1441,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { shared_ptr> m_gammaFromLattice; double m_frameDropThreshold; double m_fsSmoothingWeight; // frame-sequence criterion interpolation weight --TODO: can this be done outside? 
+ double m_seqGammarAMF; + double m_seqGammarLMF; + double m_seqGammarWP; + double m_seqGammarbMMIFactor; + double m_seqGammarUsesMBR; bool m_doReferenceAlignment; std::vector> m_lattices; msra::asr::simplesenonehmm m_hmm; diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index 854f3d8551a3..11290caa33cb 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -303,7 +303,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // likewise for sequence training parameters if (isSequenceTrainingCriterion) { - ComputationNetwork::SetSeqParam(net, criterionNodes[0], m_hSmoothingWeight, m_frameDropThresh, m_doReferenceAlign); + ComputationNetwork::SetSeqParam(net, criterionNodes[0], m_hSmoothingWeight, m_frameDropThresh, m_doReferenceAlign, + m_seqGammarCalcAMF, m_seqGammarCalcLMF, m_seqGammarCalcWP, m_seqGammarCalcbMMIFactor, m_seqGammarCalcUsesMBR ); } // --- MAIN EPOCH LOOP @@ -2534,6 +2535,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_hSmoothingWeight = configSGD(L"hSmoothingWeight", 0.95); m_frameDropThresh = configSGD(L"frameDropThresh", 1e-10); m_doReferenceAlign = configSGD(L"doReferenceAlign", false); + m_seqGammarCalcUsesMBR = configSGD(L"seqGammarUsesMBR", false); + m_seqGammarCalcAMF = configSGD(L"seqGammarAMF", 14.0); + m_seqGammarCalcLMF = configSGD(L"seqGammarLMF", 14.0); + m_seqGammarCalcbMMIFactor = configSGD(L"seqGammarBMMIFactor", 0.0); + m_seqGammarCalcWP = configSGD(L"seqGammarWordPen", 0.0); m_dropoutRates = configSGD(L"dropoutRate", ConfigRecordType::Array(floatargvector(vector{ 0.0f }))); diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h index 453f941c5bda..a014ec1d2b13 100644 --- a/Source/SGDLib/SGD.h +++ b/Source/SGDLib/SGD.h @@ -250,6 +250,11 @@ struct SGDParams : public ScriptableObjects::Object double m_hSmoothingWeight; double m_frameDropThresh; bool m_doReferenceAlign; + double m_seqGammarCalcAMF; + double m_seqGammarCalcLMF; + double m_seqGammarCalcWP; + double m_seqGammarCalcbMMIFactor; + bool m_seqGammarCalcUsesMBR; }; template class IDistGradAggregator; diff --git a/Source/SequenceTrainingLib/gammacalculation.h b/Source/SequenceTrainingLib/gammacalculation.h index f8a60e5bd478..f63c5048793c 100644 --- a/Source/SequenceTrainingLib/gammacalculation.h +++ b/Source/SequenceTrainingLib/gammacalculation.h @@ -11,6 +11,23 @@ #pragma warning (disable: 4127) // conditional expression is constant namespace msra { namespace lattices { + + struct SeqGammarCalParam{ + double amf; + double lmf; + double wp; + double bMMIfactor; + bool sMBRmode; + SeqGammarCalParam() + { + amf = 14.0; + lmf = 14.0; + wp = 0.0; + bMMIfactor = 0.0; + sMBRmode = false; + } + }; + template class GammaCalculation { @@ -30,6 +47,9 @@ namespace msra { namespace lattices { } + //======================================== + // Sec. 1 init functions + //======================================== void init(msra::asr::simplesenonehmm hset, int DeviceId) { m_deviceid = DeviceId; @@ -47,7 +67,21 @@ namespace msra { namespace lattices { } } - + //======================================== + // Sec. 2 set functions + //======================================== + void SetGammarCalculationParams(const SeqGammarCalParam& gammarParam) + { + lmf = (float)gammarParam.lmf; + amf = (float)gammarParam.amf; + wp = (float)gammarParam.wp; + seqsMBRmode = gammarParam.sMBRmode; + boostmmifactor = (float)gammarParam.bMMIfactor; + } + + //======================================== + // Sec. 
3 calculation functions + //======================================== void calgammaformb( Microsoft::MSR::CNTK::Matrix& functionValues, std::vector> &lattices, const Microsoft::MSR::CNTK::Matrix& loglikelihood, From ba61abd79ec85b0177a2c665a2e814adb059d939 Mon Sep 17 00:00:00 2001 From: Marko Radmilac Date: Fri, 8 Jan 2016 16:41:45 -0800 Subject: [PATCH 45/49] Disable popups on Windows --- Makefile | 2 +- Source/CNTK/CNTK.cpp | 2 ++ Source/EvalDll/EvalDll.vcxproj | 4 ++-- Source/Math/Math.vcxproj | 4 ++-- Source/Math/MathCUDA.vcxproj | 2 +- Source/Readers/BinaryReader/BinaryReader.vcxproj | 4 ++-- Source/Readers/DSSMReader/DSSMReader.vcxproj | 4 ++-- Source/Readers/DataReaderTest/DataReaderTest.vcxproj | 8 ++++---- Source/Readers/HTKMLFReader/HTKMLFReader.vcxproj | 4 ++-- Source/Readers/ImageReader/ImageReader.vcxproj | 2 +- Source/Readers/LMSequenceReader/LMSequenceReader.vcxproj | 4 ++-- Source/Readers/LUSequenceReader/LUSequenceReader.vcxproj | 4 ++-- .../Readers/LibSVMBinaryReader/LibSVMBinaryReader.vcxproj | 4 ++-- Source/Readers/SparsePCReader/SparsePCReader.vcxproj | 4 ++-- Source/Readers/UCIFastReader/UCIFastReader.vcxproj | 4 ++-- Source/Readers/UCIReader/UCIReader.vcxproj | 8 ++++---- Source/SequenceTrainingLib/SequenceTrainingLib.vcxproj | 2 +- 17 files changed, 34 insertions(+), 32 deletions(-) diff --git a/Makefile b/Makefile index 6c95042491d6..c2c474bffada 100644 --- a/Makefile +++ b/Makefile @@ -162,7 +162,7 @@ ifeq ("$(BUILDTYPE)","debug") CXXFLAGS += -g LDFLAGS += -rdynamic CPPFLAGS += -D_DEBUG - CUFLAGS += -O0 -use_fast_math -lineinfo $(GENCODE_FLAGS) + CUFLAGS += -O0 -g -use_fast_math -lineinfo $(GENCODE_FLAGS) endif ifeq ("$(BUILDTYPE)","release") diff --git a/Source/CNTK/CNTK.cpp b/Source/CNTK/CNTK.cpp index e7753e2c70b1..94c91db877fe 100644 --- a/Source/CNTK/CNTK.cpp +++ b/Source/CNTK/CNTK.cpp @@ -684,6 +684,8 @@ void terminate_this() { fprintf(stderr, "terminate_this: aborting\n"), fflush(st int wmain(int argc, wchar_t* argv[]) // wmain wrapper that reports Win32 exceptions { set_terminate (terminate_this); // insert a termination handler to ensure stderr gets flushed before actually terminating + _set_error_mode(_OUT_TO_STDERR); // make sure there are no CRT prompts when CNTK is executing + // Note: this does not seem to work--processes with this seem to just hang instead of terminating __try { diff --git a/Source/EvalDll/EvalDll.vcxproj b/Source/EvalDll/EvalDll.vcxproj index 71e515bc8520..a535ca3ff1c2 100644 --- a/Source/EvalDll/EvalDll.vcxproj +++ b/Source/EvalDll/EvalDll.vcxproj @@ -74,7 +74,7 @@ /bigobj %(AdditionalOptions) - Windows + Console true ComputationNetworkLib.lib; Math.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\; "c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib" @@ -102,7 +102,7 @@ Speed - Windows + Console true true true diff --git a/Source/Math/Math.vcxproj b/Source/Math/Math.vcxproj index 950fab3417f8..5a101393d128 100644 --- a/Source/Math/Math.vcxproj +++ b/Source/Math/Math.vcxproj @@ -79,7 +79,7 @@ true - Windows + Console true libacml_mp_dll.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\ @@ -127,7 +127,7 @@ MultiThreadedDLL - Windows + Console true true true diff --git a/Source/Math/MathCUDA.vcxproj b/Source/Math/MathCUDA.vcxproj index 9a1a5a6bec08..201b770a5687 100644 --- a/Source/Math/MathCUDA.vcxproj +++ b/Source/Math/MathCUDA.vcxproj @@ -91,7 +91,7 @@ true - Windows + Console true 
cudart.lib;cublas.lib;cusparse.lib;curand.lib;libacml_mp_dll.lib;%(AdditionalDependencies) true diff --git a/Source/Readers/BinaryReader/BinaryReader.vcxproj b/Source/Readers/BinaryReader/BinaryReader.vcxproj index 208fab6bc435..ac0f40baceec 100644 --- a/Source/Readers/BinaryReader/BinaryReader.vcxproj +++ b/Source/Readers/BinaryReader/BinaryReader.vcxproj @@ -70,7 +70,7 @@ true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -91,7 +91,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/DSSMReader/DSSMReader.vcxproj b/Source/Readers/DSSMReader/DSSMReader.vcxproj index 1412fac38f20..d607a7c9fc38 100644 --- a/Source/Readers/DSSMReader/DSSMReader.vcxproj +++ b/Source/Readers/DSSMReader/DSSMReader.vcxproj @@ -72,7 +72,7 @@ /bigobj %(AdditionalOptions) - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -93,7 +93,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/DataReaderTest/DataReaderTest.vcxproj b/Source/Readers/DataReaderTest/DataReaderTest.vcxproj index 8a422f187006..438c7daede3a 100644 --- a/Source/Readers/DataReaderTest/DataReaderTest.vcxproj +++ b/Source/Readers/DataReaderTest/DataReaderTest.vcxproj @@ -100,7 +100,7 @@ true - Windows + Console true $(VCInstallDir)UnitTest\lib;%(AdditionalLibraryDirectories) @@ -115,7 +115,7 @@ true - Windows + Console true $(VCInstallDir)UnitTest\lib;%(AdditionalLibraryDirectories) ucireader.lib;Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) @@ -133,7 +133,7 @@ true - Windows + Console true true true @@ -152,7 +152,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/HTKMLFReader/HTKMLFReader.vcxproj b/Source/Readers/HTKMLFReader/HTKMLFReader.vcxproj index de7772889858..fd8f9c343f67 100644 --- a/Source/Readers/HTKMLFReader/HTKMLFReader.vcxproj +++ b/Source/Readers/HTKMLFReader/HTKMLFReader.vcxproj @@ -69,7 +69,7 @@ true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) @@ -87,7 +87,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/ImageReader/ImageReader.vcxproj b/Source/Readers/ImageReader/ImageReader.vcxproj index b5061adafd76..7d3a3b01c636 100644 --- a/Source/Readers/ImageReader/ImageReader.vcxproj +++ b/Source/Readers/ImageReader/ImageReader.vcxproj @@ -75,7 +75,7 @@ true - Windows + Console true Math.lib;$(OpenCVLib);%(AdditionalDependencies) diff --git a/Source/Readers/LMSequenceReader/LMSequenceReader.vcxproj b/Source/Readers/LMSequenceReader/LMSequenceReader.vcxproj index 24a8a11122be..93b527173fed 100644 --- a/Source/Readers/LMSequenceReader/LMSequenceReader.vcxproj +++ b/Source/Readers/LMSequenceReader/LMSequenceReader.vcxproj @@ -71,7 +71,7 @@ true - Windows + Console true 
Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -92,7 +92,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/LUSequenceReader/LUSequenceReader.vcxproj b/Source/Readers/LUSequenceReader/LUSequenceReader.vcxproj index bb68dd89df85..a73d0af74088 100644 --- a/Source/Readers/LUSequenceReader/LUSequenceReader.vcxproj +++ b/Source/Readers/LUSequenceReader/LUSequenceReader.vcxproj @@ -71,7 +71,7 @@ true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -92,7 +92,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/LibSVMBinaryReader/LibSVMBinaryReader.vcxproj b/Source/Readers/LibSVMBinaryReader/LibSVMBinaryReader.vcxproj index e3a10c534203..e5d8ac1fb2b9 100644 --- a/Source/Readers/LibSVMBinaryReader/LibSVMBinaryReader.vcxproj +++ b/Source/Readers/LibSVMBinaryReader/LibSVMBinaryReader.vcxproj @@ -72,7 +72,7 @@ /bigobj %(AdditionalOptions) - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -93,7 +93,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/SparsePCReader/SparsePCReader.vcxproj b/Source/Readers/SparsePCReader/SparsePCReader.vcxproj index 72d18defe990..db66c6d311d5 100644 --- a/Source/Readers/SparsePCReader/SparsePCReader.vcxproj +++ b/Source/Readers/SparsePCReader/SparsePCReader.vcxproj @@ -72,7 +72,7 @@ /bigobj %(AdditionalOptions) - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -93,7 +93,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/UCIFastReader/UCIFastReader.vcxproj b/Source/Readers/UCIFastReader/UCIFastReader.vcxproj index fc0e03ffa999..e30dc6b90299 100644 --- a/Source/Readers/UCIFastReader/UCIFastReader.vcxproj +++ b/Source/Readers/UCIFastReader/UCIFastReader.vcxproj @@ -70,7 +70,7 @@ true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -91,7 +91,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/UCIReader/UCIReader.vcxproj b/Source/Readers/UCIReader/UCIReader.vcxproj index 2e25c2b5779f..08cce8205b9b 100644 --- a/Source/Readers/UCIReader/UCIReader.vcxproj +++ b/Source/Readers/UCIReader/UCIReader.vcxproj @@ -91,7 +91,7 @@ true - Windows + Console true 
Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) ..\..\Source\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -107,7 +107,7 @@ ..\..\common\include;..\..\Source\Math - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Source\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -124,7 +124,7 @@ true - Windows + Console true true true @@ -144,7 +144,7 @@ ..\..\common\include;..\..\Source\Math - Windows + Console true true true diff --git a/Source/SequenceTrainingLib/SequenceTrainingLib.vcxproj b/Source/SequenceTrainingLib/SequenceTrainingLib.vcxproj index c7c9d407325b..b379735411ff 100644 --- a/Source/SequenceTrainingLib/SequenceTrainingLib.vcxproj +++ b/Source/SequenceTrainingLib/SequenceTrainingLib.vcxproj @@ -44,7 +44,7 @@ true - Windows + Console true From 19a9895d1a1c49378feb3986a4ee1fcf7b98b9e5 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Wed, 13 Jan 2016 12:25:20 -0800 Subject: [PATCH 46/49] Print SeqGammar-related parameters for better logging. --- Source/ComputationNetworkLib/ComputationNetwork.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Source/ComputationNetworkLib/ComputationNetwork.cpp b/Source/ComputationNetworkLib/ComputationNetwork.cpp index 8e1019c25722..8f1bf6fae73e 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.cpp +++ b/Source/ComputationNetworkLib/ComputationNetwork.cpp @@ -635,6 +635,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { ) { fprintf(stderr, "Setting Hsmoothing weight to %.8g and frame-dropping threshold to %.8g\n", hsmoothingWeight, frameDropThresh); + fprintf(stderr, "Setting SeqGammar-related parameters: amf=%.2f, lmf=%.2f, wp=%.2f, bMMIFactor=%.2f, usesMBR=%s\n", + amf, lmf, wp, bMMIfactor, sMBR ? "true" : "false"); list seqNodes = net->GetNodesWithType(OperationNameOf(SequenceWithSoftmaxNode), criterionNode); if (seqNodes.size() == 0) { From d39d87f03bd1a6d44c7ec6aae863bc09643eb9d7 Mon Sep 17 00:00:00 2001 From: thhoens Date: Sat, 9 Jan 2016 04:24:23 -0800 Subject: [PATCH 47/49] Fixed a bug where m_elemSizeAllocated was used instead of m_nz --- Source/Math/GPUSparseMatrix.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Source/Math/GPUSparseMatrix.h b/Source/Math/GPUSparseMatrix.h index 63234dabe9b3..3f125330a8d8 100644 --- a/Source/Math/GPUSparseMatrix.h +++ b/Source/Math/GPUSparseMatrix.h @@ -87,9 +87,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { return (MajorIndexLocation() + (m_format == matrixFormatSparseCSC ? SecondaryIndexValueAt(0) : 0)); } + // TODO: Comment these methods more thoroughly, e.g., why it uses numNZ instead of m_elemSizeAllocated.
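+ // Rationale for the fix below: m_nz is the number of non-zero elements actually stored,
+ // whereas m_elemSizeAllocated is the allocated buffer capacity, which can exceed m_nz after reallocation;
+ // sizing the index count from the capacity overstates the range of valid index data, hence the switch to m_nz.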
size_t MajorIndexCount() const { - return MajorIndexCount(m_numRows, m_numCols, m_elemSizeAllocated, m_format); + return MajorIndexCount(m_numRows, m_numCols, m_nz, m_format); } size_t MajorIndexCount(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat format) const { From f835efd05234fa5ed088dec1ab1744e22a9402cf Mon Sep 17 00:00:00 2001 From: thhoens Date: Tue, 12 Jan 2016 16:01:29 -0800 Subject: [PATCH 48/49] Fix for multi-GPU training: share all parameters required to adjust the learning rate. --- Source/SGDLib/SGD.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index 11290caa33cb..039721a108c5 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -513,6 +513,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if ((m_parallelizationMethod == ParallelizationMethod::ModelAveragingSGD) && (g_mpi->NumNodesInUse() > 1)) { g_mpi->Bcast(&epochCriterion, 1, g_mpi->MainNodeRank()); + g_mpi->Bcast(&lrControlCriterion, 1, g_mpi->MainNodeRank()); } bool loadedPrevModel = false; From de5be29239fc7553fe050cd02413e8045b0dbad7 Mon Sep 17 00:00:00 2001 From: thhoens Date: Tue, 12 Jan 2016 16:23:04 -0800 Subject: [PATCH 49/49] Minor performance upgrade on column slicing to avoid a GPU memory copy. --- Source/Math/GPUSparseMatrix.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Source/Math/GPUSparseMatrix.cu b/Source/Math/GPUSparseMatrix.cu index 7e4f7a1c6fb0..3d4635020a88 100644 --- a/Source/Math/GPUSparseMatrix.cu +++ b/Source/Math/GPUSparseMatrix.cu @@ -2246,7 +2246,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { slice.m_computeDevice = m_computeDevice; slice.m_numRows = m_numRows; slice.m_numCols = numCols; - slice.m_nz = SecondaryIndexValueAt(startColumn + numCols) - SecondaryIndexValueAt(startColumn); + slice.m_nz = (numCols == m_numCols) ? m_nz : SecondaryIndexValueAt(startColumn + numCols) - SecondaryIndexValueAt(startColumn); slice.m_elemSizeAllocated = m_elemSizeAllocated; slice.m_totalBufferSizeAllocated = m_totalBufferSizeAllocated; slice.m_pArray = m_pArray;
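+ // Note on the m_nz assignment above: when the slice spans all columns, m_nz can be reused as-is;
+ // deriving it via SecondaryIndexValueAt() reads the secondary index back from GPU memory, which is the copy this change avoids.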