From 6f59eb1d2262468ef98b8d129943defea2231c16 Mon Sep 17 00:00:00 2001
From: Mark Hillebrand <Mark.Hillebrand@microsoft.com>
Date: Fri, 22 Jan 2016 09:23:03 +0100
Subject: [PATCH] Formatting

---
 .../CNTK/BrainScript/BrainScriptEvaluator.cpp |    2 +-
 .../CNTK/BrainScript/BrainScriptEvaluator.h   |    2 +-
 Source/CNTK/BrainScript/BrainScriptParser.cpp |    2 +-
 Source/CNTK/BrainScript/BrainScriptParser.h   |    2 +-
 Source/CNTK/BrainScript/BrainScriptTest.cpp   |    2 +-
 Source/CNTK/ModelEditLanguage.cpp             |    2 +-
 Source/CNTK/NDLUtil.h                         |    4 +-
 Source/Common/BestGpu.cpp                     |    4 +-
 Source/Common/Config.cpp                      |    4 +-
 Source/Common/DebugUtil.cpp                   |    4 +-
 Source/Common/Include/ProgressTracing.h       |    1 -
 Source/Common/Include/ScriptableObjects.h     |    2 +-
 Source/Common/Include/TensorShape.h           |    9 +-
 Source/Common/Include/latticesource.h         |    3 +-
 .../CompositeComputationNodes.h               | 1050 ++---
 .../ComputationNetwork.cpp                    |    1 -
 .../ComputationNetwork.h                      |   70 +-
 .../ComputationNetworkEvaluation.cpp          |    4 +-
 .../ComputationNetworkLib/ComputationNode.cpp |   14 +-
 .../ComputationNetworkLib/ComputationNode.h   | 2110 ++++-----
 .../ConvolutionalNodes.h                      |    8 +-
 Source/ComputationNetworkLib/EsotericNodes.h  | 3789 ++++++++--------
 .../EvaluationCriterionNodes.h                |    1 -
 .../InputAndParamNodes.h                      |   22 +-
 .../LinearAlgebraNodes.h                      |   39 +-
 .../ComputationNetworkLib/NonlinearityNodes.h |    4 +-
 Source/ComputationNetworkLib/RecurrentNodes.h |   52 +-
 Source/ComputationNetworkLib/ReshapingNodes.h |    5 +-
 .../TrainingCriterionNodes.h                  |   10 +-
 Source/EvalDll/CNTKEval.h                     |    4 +-
 Source/Math/CPUMatrix.cpp                     |    4 +-
 Source/Math/CommonMatrix.h                    |   11 +-
 Source/Math/GPUMatrix.cu                      |   29 +-
 Source/Math/GPUMatrix.h                       |    8 +-
 Source/Math/MatrixQuantizerCPU.h              |    4 +-
 Source/Math/NoGPU.cpp                         |    1 -
 Source/Math/QuantizedMatrix.h                 |    4 +-
 Source/Readers/BinaryReader/BinaryFile.cpp    |    4 +-
 Source/Readers/BinaryReader/BinaryReader.h    |    4 +-
 Source/Readers/BinaryReader/BinaryWriter.cpp  |    4 +-
 Source/Readers/BinaryReader/Exports.cpp       |    4 +-
 Source/Readers/HTKMLFReader/DataReader.cpp    |    4 +-
 Source/Readers/ImageReader/Exports.cpp        |    4 +-
 Source/Readers/Kaldi2Reader/htkfeatio_utils.h |    4 +-
 Source/Readers/LMSequenceReader/Exports.cpp   |    4 +-
 Source/Readers/LUSequenceReader/Exports.cpp   |    4 +-
 .../LUSequenceReader/LUSequenceWriter.cpp     |    4 +-
 .../LUSequenceReader/LUSequenceWriter.h       |    4 +-
 Source/Readers/UCIFastReader/Exports.cpp      |    4 +-
 Source/SGDLib/DataReaderHelpers.h             |  828 ++--
 Source/SGDLib/IDistGradAggregator.h           |    1 +
 Source/SGDLib/MultiNetworksEvaluator.h        |   18 +-
 Source/SGDLib/MultiNetworksSGD.h              |    4 +-
 Source/SGDLib/SGD.cpp                         | 3918 ++++++++---------
 Tests/UnitTests/FileTest/FileTest.cpp         |    4 +-
 Tests/UnitTests/FileTest/FileTest.h           |    4 +-
 Tests/UnitTests/MathTests/DebugUtil.cpp       |    4 +-
 .../ReaderTests/Common/ReaderTestHelper.h     |    4 +-
 58 files changed, 6060 insertions(+), 6060 deletions(-)
diff --git a/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp b/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp
index c7c012540215..4f8efaeff055 100644
--- a/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp
+++ b/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp
@@ -1022,4 +1022,4 @@ static ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<Debug> registerDe
 //     - macro arg expressions get their path assigned when their thunk is created, the thunk remembers it
 //     - however, really, the thunk should get the expression path from the context it is executed in, not the context it was created in
 //     - maybe there is some clever scheme of overwriting when a result comes back? E.g. we retrieve a value but its name is not right, can we patch it up? Very tricky to find the right rules/conditions
-} } } // namespaces
+} } } // namespaces
diff --git a/Source/CNTK/BrainScript/BrainScriptEvaluator.h b/Source/CNTK/BrainScript/BrainScriptEvaluator.h
index df754e5421ab..8e166296bee7 100644
--- a/Source/CNTK/BrainScript/BrainScriptEvaluator.h
+++ b/Source/CNTK/BrainScript/BrainScriptEvaluator.h
@@ -24,4 +24,4 @@ shared_ptr<Object> EvaluateField(ExpressionPtr e, const wstring& id); // for exp
 
 // some simple tests
 void SomeTests();
-} } } // end namespaces
+} } } // end namespaces
diff --git a/Source/CNTK/BrainScript/BrainScriptParser.cpp b/Source/CNTK/BrainScript/BrainScriptParser.cpp
index 7ec391d33310..63959aa0e3ee 100644
--- a/Source/CNTK/BrainScript/BrainScriptParser.cpp
+++ b/Source/CNTK/BrainScript/BrainScriptParser.cpp
@@ -952,4 +952,4 @@ ExpressionPtr ParseConfigExpression(const wstring& sourceText, vector<wstring>&&
     parser.VerifyAtEnd();
     return expr;
 }
-} } } // namespaces
+} } } // namespaces
diff --git a/Source/CNTK/BrainScript/BrainScriptParser.h b/Source/CNTK/BrainScript/BrainScriptParser.h
index 6475d350c0ce..974be5546f37 100644
--- a/Source/CNTK/BrainScript/BrainScriptParser.h
+++ b/Source/CNTK/BrainScript/BrainScriptParser.h
@@ -137,4 +137,4 @@ typedef Expression::ExpressionPtr ExpressionPtr; // circumvent some circular def
 ExpressionPtr ParseConfigDictFromString(wstring text, vector<wstring>&& includePaths);          // parses a list of dictionary members, returns a dictionary expression
 ExpressionPtr ParseConfigDictFromFile(wstring path, vector<wstring>&& includePaths);            // likewise, but from a file path
 ExpressionPtr ParseConfigExpression(const wstring& sourceText, vector<wstring>&& includePaths); // parses a single expression from sourceText, which is meant to contain an include statement, hence includePaths
-} } } // namespaces
+} } } // namespaces
diff --git a/Source/CNTK/BrainScript/BrainScriptTest.cpp b/Source/CNTK/BrainScript/BrainScriptTest.cpp
index c2b0626faa51..dcc7979617a3 100644
--- a/Source/CNTK/BrainScript/BrainScriptTest.cpp
+++ b/Source/CNTK/BrainScript/BrainScriptTest.cpp
@@ -190,4 +190,4 @@ void SomeTests()
         err.PrintError();
     }
 }
-} } } // namespaces
+} } } // namespaces
diff --git a/Source/CNTK/ModelEditLanguage.cpp b/Source/CNTK/ModelEditLanguage.cpp
index 35b075102638..f17ee0ab9bde 100644
--- a/Source/CNTK/ModelEditLanguage.cpp
+++ b/Source/CNTK/ModelEditLanguage.cpp
@@ -138,7 +138,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
         std::wstring modelFormat = GetOptionalModelFormat(params, numFixedParams);
 
         auto cn = make_shared<ComputationNetwork>(CPUDEVICE);
-#if 1   // support for a specific kind of legacy format, for the sole purpose of allowing users to convert (=load & save) them
+#if 1 // support for a specific kind of legacy format, for the sole purpose of allowing users to convert (=load & save) them
         if (modelFormat == L"cntk_legacy_no_tensorlib")
         {
             cn->Read<ElemType>(params[1]);
diff --git a/Source/CNTK/NDLUtil.h b/Source/CNTK/NDLUtil.h
index 1a9060362995..d3da7fcdcaee 100644
--- a/Source/CNTK/NDLUtil.h
+++ b/Source/CNTK/NDLUtil.h
@@ -173,6 +173,4 @@ class NDLUtil
 
 template class NDLUtil<float>;
 template class NDLUtil<double>;
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Source/Common/BestGpu.cpp b/Source/Common/BestGpu.cpp
index f7a3506fa57e..8208b64b9378 100644
--- a/Source/Common/BestGpu.cpp
+++ b/Source/Common/BestGpu.cpp
@@ -265,9 +265,9 @@ void BestGpu::Init()
     // get the count of objects
     cudaError_t err = cudaGetDeviceCount(&m_deviceCount);
     if (err != cudaSuccess)
-        m_deviceCount = 0;       // if this fails, we have no GPUs
+        m_deviceCount = 0; // if this fails, we have no GPUs
 
-    ProcessorData pdEmpty = { 0 };
+    ProcessorData pdEmpty = {0};
     for (int i = 0; i < m_deviceCount; i++)
     {
         ProcessorData* data = new ProcessorData();
diff --git a/Source/Common/Config.cpp b/Source/Common/Config.cpp
index 52cf00dfbefc..da713c83588f 100644
--- a/Source/Common/Config.cpp
+++ b/Source/Common/Config.cpp
@@ -304,6 +304,4 @@ void TrimQuotes(std::string& str)
     if (str.front() == '"' && str.back() == '"')
         str = str.substr(1, str.size() - 2);
 }
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Source/Common/DebugUtil.cpp b/Source/Common/DebugUtil.cpp
index 65c7b397fe00..cc769930ad8b 100644
--- a/Source/Common/DebugUtil.cpp
+++ b/Source/Common/DebugUtil.cpp
@@ -136,6 +136,4 @@ void DebugUtil::PrintCallStack()
     free(symbolList);
 #endif
 }
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Source/Common/Include/ProgressTracing.h b/Source/Common/Include/ProgressTracing.h
index e5bd15f98be6..b7723f2c6f5f 100644
--- a/Source/Common/Include/ProgressTracing.h
+++ b/Source/Common/Include/ProgressTracing.h
@@ -44,7 +44,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     } // wrap static state in an accessor, so we won't need a CPP file
 
 public:
-
     static bool IsEnabled()
     {
         return GetStaticInstance().m_enabled;
diff --git a/Source/Common/Include/ScriptableObjects.h b/Source/Common/Include/ScriptableObjects.h
index 9f1755b4b7bc..6cb065999418 100644
--- a/Source/Common/Include/ScriptableObjects.h
+++ b/Source/Common/Include/ScriptableObjects.h
@@ -896,4 +896,4 @@ template <class V>
 {
     return static_cast<const std::vector<typename V::value_type> &>(vec);
 } // use this specifically for XXXargvector
-} } } // end namespaces
+} } } // end namespaces
diff --git a/Source/Common/Include/TensorShape.h b/Source/Common/Include/TensorShape.h
index 3bd3119fce71..ebbb47059fc8 100644
--- a/Source/Common/Include/TensorShape.h
+++ b/Source/Common/Include/TensorShape.h
@@ -383,7 +383,10 @@ struct TensorShape
     {
         return m_dims == other.m_dims;
     }
-    bool operator!=(const TensorShape& other) const { return !operator==(other); }  // duh!
+    bool operator!=(const TensorShape& other) const
+    {
+        return !operator==(other);
+    } // duh!
 
     // verify that this refers to a dense matrix (no strides)
     void VerifyIsDense() const
@@ -622,13 +625,13 @@ struct TensorShape
     }
 
     // compare two TensorShapes, whether they are compatible, considering padding and broadcasting
-    bool IsElementwiseCompatibleWith(const TensorShape & other) const
+    bool IsElementwiseCompatibleWith(const TensorShape& other) const
     {
         for (size_t i = 0; i < m_dims.size(); i++)
         {
             size_t dim = m_dims[i];
             size_t otherDim = i < other.size() ? other[i] : 1;
-            if (dim != otherDim && dim != 1 && otherDim != 1)   // dims mismatch, and neither is broadcasting
+            if (dim != otherDim && dim != 1 && otherDim != 1) // dims mismatch, and neither is broadcasting
                 return false;
         }
         return true;
diff --git a/Source/Common/Include/latticesource.h b/Source/Common/Include/latticesource.h
index c21832da549a..7b72d0b813d4 100644
--- a/Source/Common/Include/latticesource.h
+++ b/Source/Common/Include/latticesource.h
@@ -76,5 +76,4 @@ class latticesource
         denlattices.setverbosity(veb);
     }
 };
-}
-}
\ No newline at end of file
+} }
\ No newline at end of file
diff --git a/Source/ComputationNetworkLib/CompositeComputationNodes.h b/Source/ComputationNetworkLib/CompositeComputationNodes.h
index 46d074832ab2..59431685d7ce 100644
--- a/Source/ComputationNetworkLib/CompositeComputationNodes.h
+++ b/Source/ComputationNetworkLib/CompositeComputationNodes.h
@@ -13,7 +13,7 @@
 #include <string>
 #include <stdexcept>
 #include <list>
-#include <iostream> 
+#include <iostream>
 
 //this file will contain computation nodes that require several atomic computation.
 //composite nodes can save memory, computation, or both
@@ -31,172 +31,178 @@ class PreComputedNodeBase : public ComputationNodeNonLooping /*ComputationNode*/
     UsingComputationNodeMembers;
     using Base::OperationName;
 
-    public:
-        PreComputedNodeBase(DEVICEID_TYPE deviceId, const wstring& name)
-            : Base(deviceId, name), m_hasComputed(false)
-        {
-        }
+public:
+    PreComputedNodeBase(DEVICEID_TYPE deviceId, const wstring& name)
+        : Base(deviceId, name), m_hasComputed(false)
+    {
+    }
 
-        // interface through which this node is operated on are these two functions
+    // interface through which this node is operated on are these two functions
 
-        // check whether node has already undergone precomputation
-        virtual bool HasComputed() const { return m_hasComputed; }
+    // check whether node has already undergone precomputation
+    virtual bool HasComputed() const
+    {
+        return m_hasComputed;
+    }
 
-        // call this with 'false' at start and with 'true' at end
-        // This is used for resetting and updating from accumulators.
-        virtual void MarkComputed(const bool hasComputed)
-        {
-            m_hasComputed = hasComputed;
-            CreateMatrixIfNull(m_value);
-        }
+    // call this with 'false' at start and with 'true' at end
+    // This is used for resetting and updating from accumulators.
+    virtual void MarkComputed(const bool hasComputed)
+    {
+        m_hasComputed = hasComputed;
+        CreateMatrixIfNull(m_value);
+    }
 
-        virtual bool RequiresPreCompute() const override { return true; }
+    virtual bool RequiresPreCompute() const override
+    {
+        return true;
+    }
 
-        virtual void Save(File& fstream) const override
-        {
-            Base::Save(fstream);
-            fstream << m_hasComputed;
-            fstream << Value();
-        }
+    virtual void Save(File& fstream) const override
+    {
+        Base::Save(fstream);
+        fstream << m_hasComputed;
+        fstream << Value();
+    }
 
-        virtual void Load(File& fstream, size_t modelVersion) override
-        {
-            Base::Load(fstream, modelVersion);
-            fstream >> m_hasComputed;
-            LoadValue(fstream);
-            // Note: This loses the sample layout, but that is recovered by Validate().
-        }
+    virtual void Load(File& fstream, size_t modelVersion) override
+    {
+        Base::Load(fstream, modelVersion);
+        fstream >> m_hasComputed;
+        LoadValue(fstream);
+        // Note: This loses the sample layout, but that is recovered by Validate().
+    }
 
-        virtual void DumpNodeInfo(const bool printValues, File& fstream) const override
-        {
-            Base::DumpNodeInfo(printValues, fstream);
+    virtual void DumpNodeInfo(const bool printValues, File& fstream) const override
+    {
+        Base::DumpNodeInfo(printValues, fstream);
 
-            char str[4096];
-            sprintf(str, "[%s]  ", string(GetSampleLayout()).c_str());
-            fstream << string(str);
-            sprintf(str, "HasComputed=%ls", HasComputed() ? L"true" : L"false");
-            fstream << string(str);
+        char str[4096];
+        sprintf(str, "[%s]  ", string(GetSampleLayout()).c_str());
+        fstream << string(str);
+        sprintf(str, "HasComputed=%ls", HasComputed() ? L"true" : L"false");
+        fstream << string(str);
 
-            PrintNodeValuesToFile(printValues, fstream);
-        }
+        PrintNodeValuesToFile(printValues, fstream);
+    }
 
-        virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
-        {
-            Base::Validate(isFinalValidationPass);
-            if (!Input(0)->HasMBLayout())
-                InvalidArgument("%ls %ls operation requires its input to come in minibatches of samples.", NodeName().c_str(), OperationName().c_str());
-            m_pMBLayout = nullptr; // this node does not hold mini-batch data
-
-            if (!m_hasComputed) // this node retains state, and state gets destroyed by Resize(), so we must be careful
-                SetDims(Input(0)->GetSampleLayout(), false);
-            else if (!GetSampleLayout().IsElementwiseCompatibleWith(Input(0)->GetSampleLayout()))
-                InvalidArgument("%ls %ls operation: Precomputed parameter does not match input dimensions.", NodeName().c_str(), OperationName().c_str());
-        }
+    virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
+    {
+        Base::Validate(isFinalValidationPass);
+        if (!Input(0)->HasMBLayout())
+            InvalidArgument("%ls %ls operation requires its input to come in minibatches of samples.", NodeName().c_str(), OperationName().c_str());
+        m_pMBLayout = nullptr; // this node does not hold mini-batch data
+
+        if (!m_hasComputed) // this node retains state, and state gets destroyed by Resize(), so we must be careful
+            SetDims(Input(0)->GetSampleLayout(), false);
+        else if (!GetSampleLayout().IsElementwiseCompatibleWith(Input(0)->GetSampleLayout()))
+            InvalidArgument("%ls %ls operation: Precomputed parameter does not match input dimensions.", NodeName().c_str(), OperationName().c_str());
+    }
 
-        virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
+    virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
+    {
+        Base::CopyTo(nodeP, newName, flags);
+        if (flags & CopyNodeFlags::copyNodeValue)
         {
-            Base::CopyTo(nodeP, newName, flags);
-            if (flags & CopyNodeFlags::copyNodeValue)
-            {
-                auto node = dynamic_pointer_cast<PreComputedNodeBase<ElemType>>(nodeP);
-                node->m_hasComputed = m_hasComputed;
-            }
+            auto node = dynamic_pointer_cast<PreComputedNodeBase<ElemType>>(nodeP);
+            node->m_hasComputed = m_hasComputed;
         }
+    }
 
-        // this is for the special case: convertDBN needs this; because we initialize values directly from another well-trained model
-        virtual void SideLoadFromMatrix(const Matrix<ElemType>& value)
-        {
-            if (value.GetNumCols() != 1)
-                InvalidArgument("SideLoadFromMatrix: Side-loading is only supported for column vectors.");
-            CreateMatrixIfNull(m_value);
-            m_value->SetValue(value);
-            m_hasComputed = true; 
-            SetDims(TensorShape(value.GetNumRows()), false);
-        }
+    // this is for the special case: convertDBN needs this; because we initialize values directly from another well-trained model
+    virtual void SideLoadFromMatrix(const Matrix<ElemType>& value)
+    {
+        if (value.GetNumCols() != 1)
+            InvalidArgument("SideLoadFromMatrix: Side-loading is only supported for column vectors.");
+        CreateMatrixIfNull(m_value);
+        m_value->SetValue(value);
+        m_hasComputed = true;
+        SetDims(TensorShape(value.GetNumRows()), false);
+    }
 
-    public:
-        bool m_hasComputed;
-    };
+public:
+    bool m_hasComputed;
+};
 
 #define UsingPreComputedNodeMembers \
     UsingComputationNodeMembers;    \
     using Base::m_hasComputed;      \
     using Base::OperationName
 
-    // -----------------------------------------------------------------------
-    // MeanInvStdDevNodeBase (features)  -- common base class for Mean and InvStdDev
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// MeanInvStdDevNodeBase (features)  -- common base class for Mean and InvStdDev
+// -----------------------------------------------------------------------
 
 template <class ElemType>
 class MeanInvStdDevNodeBase : public PreComputedNodeBase<ElemType>, public NumInputs<1>
 {
     typedef PreComputedNodeBase<ElemType> Base;
     UsingPreComputedNodeMembers;
-        //static const std::wstring TypeName() { return L"MeanInvStdDev (base)"; }
-    public:
-        //DeclareConstructorFromConfigWithNumInputs(MeanInvStdDevNodeBase);
+    //static const std::wstring TypeName() { return L"MeanInvStdDev (base)"; }
+public:
+    //DeclareConstructorFromConfigWithNumInputs(MeanInvStdDevNodeBase);
     MeanInvStdDevNodeBase(DEVICEID_TYPE deviceId, const wstring& name)
         : PreComputedNodeBase<ElemType>(deviceId, name),
-            m_numSamples(SIZE_MAX)
+          m_numSamples(SIZE_MAX)
     {
     }
 
-        virtual void Load(File& fstream, size_t modelVersion) override
-        {
-            Base::Load(fstream, modelVersion);
-            m_numSamples = SIZE_MAX;
-        }
-    
-        // this is used by convertDBN
-        virtual void SideLoadFromMatrix(const Matrix<ElemType>& m)
-        {
-            Base::SideLoadFromMatrix(m);
-            m_numSamples = SIZE_MAX;
-        }
+    virtual void Load(File& fstream, size_t modelVersion) override
+    {
+        Base::Load(fstream, modelVersion);
+        m_numSamples = SIZE_MAX;
+    }
+
+    // this is used by convertDBN
+    virtual void SideLoadFromMatrix(const Matrix<ElemType>& m)
+    {
+        Base::SideLoadFromMatrix(m);
+        m_numSamples = SIZE_MAX;
+    }
 
     virtual void /*PreComputedNodeBase::*/ MarkComputed(const bool hasComputed, size_t numSamples = 0)
-        {
-            Base::MarkComputed(hasComputed);
+    {
+        Base::MarkComputed(hasComputed);
         if (!m_hasComputed) // initialize
-            {
-                if (IsAccumulating())
-                    LogicError("%ls %ls operation: MarkComputed(false) has been called while accumulating.", NodeName().c_str(), OperationName().c_str());
-                m_numSamples = 0;
-            }
-        else // finalize
-            {
-                if (!IsAccumulating())
-                    LogicError("%ls %ls operation: MarkComputed(true) has been called without MarkComputed(false) first.", NodeName().c_str(), OperationName().c_str());
-                if (m_numSamples == 0)
-                    LogicError("%ls %ls operation: No data accumulated during precomputation.", NodeName().c_str(), OperationName().c_str());
-                m_numSamples = SIZE_MAX;
-            }
+        {
+            if (IsAccumulating())
+                LogicError("%ls %ls operation: MarkComputed(false) has been called while accumulating.", NodeName().c_str(), OperationName().c_str());
+            m_numSamples = 0;
         }
-
-        virtual void BackpropToNonLooping(size_t /*inputIndex*/) override
+        else // finalize
         {
-            //LogicError("Mean operation should not be involved in the gradient calculation.");
+            if (!IsAccumulating())
+                LogicError("%ls %ls operation: MarkComputed(true) has been called without MarkComputed(false) first.", NodeName().c_str(), OperationName().c_str());
+            if (m_numSamples == 0)
+                LogicError("%ls %ls operation: No data accumulated during precomputation.", NodeName().c_str(), OperationName().c_str());
+            m_numSamples = SIZE_MAX;
         }
+    }
 
-        virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
+    virtual void BackpropToNonLooping(size_t /*inputIndex*/) override
+    {
+        //LogicError("Mean operation should not be involved in the gradient calculation.");
+    }
+
+    virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
+    {
+        Base::CopyTo(nodeP, newName, flags);
+        if (flags & CopyNodeFlags::copyNodeValue)
         {
-            Base::CopyTo(nodeP, newName, flags);
-            if (flags & CopyNodeFlags::copyNodeValue)
-            {
-                if (m_numSamples != SIZE_MAX)
-                    LogicError("%ls %ls operation: CopyTo() called while accumulating.", NodeName().c_str(), OperationName().c_str());
-                auto node = dynamic_pointer_cast<MeanInvStdDevNodeBase<ElemType>>(nodeP);
-                node->m_numSamples = SIZE_MAX;
-            }
+            if (m_numSamples != SIZE_MAX)
+                LogicError("%ls %ls operation: CopyTo() called while accumulating.", NodeName().c_str(), OperationName().c_str());
+            auto node = dynamic_pointer_cast<MeanInvStdDevNodeBase<ElemType>>(nodeP);
+            node->m_numSamples = SIZE_MAX;
         }
+    }
 
-    protected:
+protected:
     size_t m_numSamples; // (SIZE_MAX while outside accumulation state)
     bool IsAccumulating() const
     {
         return m_numSamples != SIZE_MAX;
     }
-    };
+};
 
 #define UsingMeanInvStdDevNodeBaseNodeMembers \
     ComputationNodeBoilerplate;               \
@@ -204,13 +210,13 @@ class MeanInvStdDevNodeBase : public PreComputedNodeBase<ElemType>, public NumIn
     using Base::m_numSamples;                 \
     using Base::IsAccumulating
 
-    // -----------------------------------------------------------------------
-    // MeanNode (features)
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// MeanNode (features)
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    class MeanNode : public MeanInvStdDevNodeBase<ElemType>
-    {
+class MeanNode : public MeanInvStdDevNodeBase<ElemType>
+{
     typedef MeanInvStdDevNodeBase<ElemType> Base;
     UsingMeanInvStdDevNodeBaseNodeMembers;
     static const std::wstring TypeName()
@@ -218,72 +224,72 @@ template <class ElemType>
         return L"Mean";
     }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(MeanNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(MeanNode);
     MeanNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name)
-        {
+    {
     }
 
     MeanNode(DEVICEID_TYPE deviceId, const wstring& name, size_t)
         : Base(deviceId, name)
     {
-        }
+    }
     virtual void /*PreComputedNodeBase::*/ MarkComputed(const bool hasComputed)
-        {
-            Base::MarkComputed(hasComputed);
+    {
+        Base::MarkComputed(hasComputed);
         if (!m_hasComputed) // initialize accumulation
-            {
-                UpdateFunctionValuesSize();
-                Value().SetValue(0);
-            }
-            // no else branch because ForwardPropNonLooping() already leaves a valid mean in m_value
+        {
+            UpdateFunctionValuesSize();
+            Value().SetValue(0);
         }
+        // no else branch because ForwardPropNonLooping() already leaves a valid mean in m_value
+    }
 
     virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
-        {
-            FrameRange fr(Input(0)->GetMBLayout());
-            if (m_hasComputed)
+    {
+        FrameRange fr(Input(0)->GetMBLayout());
+        if (m_hasComputed)
             return; // not accumulating
 
-            if (!IsAccumulating())
-                LogicError("%ls %ls operation: MarkComputed(false) has not been called.", NodeName().c_str(), OperationName().c_str());
+        if (!IsAccumulating())
+            LogicError("%ls %ls operation: MarkComputed(false) has not been called.", NodeName().c_str(), OperationName().c_str());
 
-            // set gaps to zero, since we are reducing in time
-            Input(0)->MaskMissingValueColumnsToZero(fr);
+        // set gaps to zero, since we are reducing in time
+        Input(0)->MaskMissingValueColumnsToZero(fr);
 
         auto& samples = Input(0)->Value();
         auto& avg = Value();
 
 #if NANCHECK
-            samples.HasNan("Mean-Samples");
+        samples.HasNan("Mean-Samples");
 #endif
-            size_t numNewSamples = Input(0)->GetMBLayout()->GetActualNumSamples();
-            size_t totalNumSamples = m_numSamples + numNewSamples;
+        size_t numNewSamples = Input(0)->GetMBLayout()->GetActualNumSamples();
+        size_t totalNumSamples = m_numSamples + numNewSamples;
         if (totalNumSamples == 0)
             totalNumSamples = 1; // 0/0=1 in this context
-            Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / totalNumSamples, samples, false,
-                                                     ConstOnes(Input(0)->Value().GetNumCols(), 1, samples.GetDeviceId()),
+        Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / totalNumSamples, samples, false,
+                                                 ConstOnes(Input(0)->Value().GetNumCols(), 1, samples.GetDeviceId()),
                                                  false, (ElemType) m_numSamples / totalNumSamples, avg);
 #if NANCHECK
-            avg.HasNan("Mean-avg");
+        avg.HasNan("Mean-avg");
 #endif
 
-            m_numSamples += numNewSamples;
-        }
-    };
+        m_numSamples += numNewSamples;
+    }
+};
 
-    template class MeanNode<float>;
-    template class MeanNode<double>;
+template class MeanNode<float>;
+template class MeanNode<double>;
 
-    // -----------------------------------------------------------------------
-    // InvStdDevNode (features)
-    // TODO: share stuff with MeanNode
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// InvStdDevNode (features)
+// TODO: share stuff with MeanNode
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    class InvStdDevNode : public MeanInvStdDevNodeBase<ElemType>
-    {
+class InvStdDevNode : public MeanInvStdDevNodeBase<ElemType>
+{
     typedef MeanInvStdDevNodeBase<ElemType> Base;
     UsingMeanInvStdDevNodeBaseNodeMembers;
     static const std::wstring TypeName()
@@ -291,8 +297,8 @@ template <class ElemType>
         return L"InvStdDev";
     }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(InvStdDevNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(InvStdDevNode);
     InvStdDevNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name),
           m_mean(deviceId),
@@ -302,116 +308,116 @@ template <class ElemType>
     }
 
     virtual void /*PreComputedNodeBase::*/ MarkComputed(const bool hasComputed) override
-        {
-            Base::MarkComputed(hasComputed);
+    {
+        Base::MarkComputed(hasComputed);
 
-            if (!m_hasComputed) // initialize
-            {
-                // reset accumulators
-                size_t inputDim = Input(0)->GetSampleMatrixNumRows();
-                m_mean.Resize(inputDim, 1);
-                m_var.Resize(inputDim, 1);
-                m_mean.SetValue(0);
-                m_var.SetValue(0);
-                UpdateFunctionValuesSize();
+        if (!m_hasComputed) // initialize
+        {
+            // reset accumulators
+            size_t inputDim = Input(0)->GetSampleMatrixNumRows();
+            m_mean.Resize(inputDim, 1);
+            m_var.Resize(inputDim, 1);
+            m_mean.SetValue(0);
+            m_var.SetValue(0);
+            UpdateFunctionValuesSize();
             Value().SetValue(0); // also set this because not doing it may flag during debugging; avoids special-casing this
-            }
+        }
         else // finalize
-            {
-                ElemType sqrtFloor = 1e-10f;
+        {
+            ElemType sqrtFloor = 1e-10f;
             m_var.InplaceTruncateBottom(sqrtFloor); // prevent too small variance (and negative square roots due to numeric inaccuracy)
 #if NANCHECK
-                m_var.HasNan("MarkComputed-InplaceTruncateBottom");
+            m_var.HasNan("MarkComputed-InplaceTruncateBottom");
 #endif
-                m_var.InplaceSqrt();
+            m_var.InplaceSqrt();
 
 #if NANCHECK
-                m_var.HasNan("MarkComputed-InplaceSqrt");
+            m_var.HasNan("MarkComputed-InplaceSqrt");
 #endif
-                m_var.ElementInverse();
+            m_var.ElementInverse();
 
 #if NANCHECK
-                m_var.HasNan("MarkComputed-ElementInverse()");
+            m_var.HasNan("MarkComputed-ElementInverse()");
 #endif
-                Value().SetValue(m_var);
-            }
+            Value().SetValue(m_var);
         }
+    }
 
     virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
-        {
-            FrameRange fr(Input(0)->GetMBLayout());
-            if (m_hasComputed)
+    {
+        FrameRange fr(Input(0)->GetMBLayout());
+        if (m_hasComputed)
             return; // not accumulating
 
-            if (!IsAccumulating())
-                LogicError("%ls %ls operation: MarkComputed(false) has not been called.", NodeName().c_str(), OperationName().c_str());
+        if (!IsAccumulating())
+            LogicError("%ls %ls operation: MarkComputed(false) has not been called.", NodeName().c_str(), OperationName().c_str());
 
-            // set gaps to zero, since we are reducing in time
-            Input(0)->MaskMissingValueColumnsToZero(fr);
+        // set gaps to zero, since we are reducing in time
+        Input(0)->MaskMissingValueColumnsToZero(fr);
 
         auto& samples = Input(0)->Value();
 #if NANCHECK
-            samples.HasNan("InvStdDev-Samples");
+        samples.HasNan("InvStdDev-Samples");
 #endif
-            m_temp.SetValue(m_mean);
-            size_t numNewSamples = Input(0)->GetMBLayout()->GetActualNumSamples();
-            size_t totalNumSamples = m_numSamples + numNewSamples;
+        m_temp.SetValue(m_mean);
+        size_t numNewSamples = Input(0)->GetMBLayout()->GetActualNumSamples();
+        size_t totalNumSamples = m_numSamples + numNewSamples;
         if (totalNumSamples == 0)
             totalNumSamples = 1; // 0/0=1 in this context
-            Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / totalNumSamples, samples, false,
-                                                     ConstOnes(Input(0)->Value().GetNumCols(), 1, samples.GetDeviceId()),
+        Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / totalNumSamples, samples, false,
+                                                 ConstOnes(Input(0)->Value().GetNumCols(), 1, samples.GetDeviceId()),
                                                  false, (ElemType) m_numSamples / totalNumSamples, m_mean);
 
-            m_temp -= m_mean;
-            m_temp.AssignElementPowerOf(m_temp, 2);
-            m_var += m_temp;
+        m_temp -= m_mean;
+        m_temp.AssignElementPowerOf(m_temp, 2);
+        m_var += m_temp;
 
-            m_temp.AssignDifferenceOf(samples, m_mean);
-            m_temp.AssignElementPowerOf(m_temp, 2);
+        m_temp.AssignDifferenceOf(samples, m_mean);
+        m_temp.AssignElementPowerOf(m_temp, 2);
 
-            Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / totalNumSamples, m_temp, false,
-                                                     ConstOnes(Input(0)->Value().GetNumCols(), 1, samples.GetDeviceId()),
+        Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / totalNumSamples, m_temp, false,
+                                                 ConstOnes(Input(0)->Value().GetNumCols(), 1, samples.GetDeviceId()),
                                                  false, (ElemType) m_numSamples / totalNumSamples, m_var);
 
 #if NANCHECK
-            m_var.HasNan("InvStdDev-m_var");
+        m_var.HasNan("InvStdDev-m_var");
 #endif
 
-#if 0       // BUGBUG: This is the correct version, but it will break test cases, so do this later. MeanNode does it right already.
+#if 0 // BUGBUG: This is the correct version, but it will break test cases, so do this later. MeanNode does it right already.
             m_numSamples += Input(0)->GetMBLayout()->GetActualNumSamples();
 #else
-            m_numSamples += Input(0)->Value().GetNumCols();  // BUGBUG: Should be -> GetActualNumSamples().
+        m_numSamples += Input(0)->Value().GetNumCols(); // BUGBUG: Should be -> GetActualNumSamples().
 #endif
-        }
+    }
 
-        virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
+    virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
+    {
+        Base::CopyTo(nodeP, newName, flags);
+        if (flags & CopyNodeFlags::copyNodeValue)
         {
-            Base::CopyTo(nodeP, newName, flags);
-            if (flags & CopyNodeFlags::copyNodeValue)
-            {
-                auto node = dynamic_pointer_cast<InvStdDevNode<ElemType>>(nodeP);
-                node->m_mean = m_mean;
-                node->m_var = m_var;
+            auto node = dynamic_pointer_cast<InvStdDevNode<ElemType>>(nodeP);
+            node->m_mean = m_mean;
+            node->m_var = m_var;
             node->m_temp = m_temp;
-            }
         }
+    }
 
-    private:
-        Matrix<ElemType> m_mean;
-        Matrix<ElemType> m_var;
-        Matrix<ElemType> m_temp;
-    };
+private:
+    Matrix<ElemType> m_mean;
+    Matrix<ElemType> m_var;
+    Matrix<ElemType> m_temp;
+};
 
-    template class InvStdDevNode<float>;
-    template class InvStdDevNode<double>;
+template class InvStdDevNode<float>;
+template class InvStdDevNode<double>;
 
-    // -----------------------------------------------------------------------
-    // PerDimMeanVarNormalizationNode (feature, mean, invStdDev)
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// PerDimMeanVarNormalizationNode (feature, mean, invStdDev)
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    class PerDimMeanVarNormalizationNode : public ComputationNode<ElemType>, public NumInputs<3>
-    {
+class PerDimMeanVarNormalizationNode : public ComputationNode<ElemType>, public NumInputs<3>
+{
     typedef ComputationNode<ElemType> Base;
     UsingComputationNodeMembersBoilerplate;
     static const std::wstring TypeName()
@@ -419,103 +425,103 @@ template <class ElemType>
         return L"PerDimMeanVarNormalization";
     }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(PerDimMeanVarNormalizationNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(PerDimMeanVarNormalizationNode);
     PerDimMeanVarNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name)
     {
     }
 
     virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) override
-        {
-            InvalidArgument("PerDimMeanVarNormalizationNode should only be called in the evaluation stage.");
-        }
+    {
+        InvalidArgument("PerDimMeanVarNormalizationNode should only be called in the evaluation stage.");
+    }
 
     virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
-        {
-            // only feature (input0) and output needs to be sliced
-            auto sliceInput0Value = Input(0)->ValueFor(fr);
-            auto sliceOutputValue = ValueFor(fr);
+    {
+        // only feature (input0) and output needs to be sliced
+        auto sliceInput0Value = Input(0)->ValueFor(fr);
+        auto sliceOutputValue = ValueFor(fr);
 
-            ForwardPropS(sliceOutputValue, sliceInput0Value, Input(1)->Value(), Input(2)->Value());
-        }
+        ForwardPropS(sliceOutputValue, sliceInput0Value, Input(1)->Value(), Input(2)->Value());
+    }
 
     /*TODO: merge with call site*/ void ForwardPropS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0,
-                                             const Matrix<ElemType>& input1, const Matrix<ElemType>& input2)
-        {
+                                                     const Matrix<ElemType>& input1, const Matrix<ElemType>& input2)
+    {
 #if DUMPOUTPUT
-            //input0.Print("PerDimMeanVarNormalization-input0");
-            //input1.Print("PerDimMeanVarNormalization-input1");
-            //input2.Print("PerDimMeanVarNormalization-input2");
+//input0.Print("PerDimMeanVarNormalization-input0");
+//input1.Print("PerDimMeanVarNormalization-input1");
+//input2.Print("PerDimMeanVarNormalization-input2");
 #endif
 
 #if NANCHECK
-            input0.HasNan("PerDimMeanVarNormalization-input0");
-            input1.HasNan("PerDimMeanVarNormalization-input1");
-            input2.HasNan("PerDimMeanVarNormalization-input2");
+        input0.HasNan("PerDimMeanVarNormalization-input0");
+        input1.HasNan("PerDimMeanVarNormalization-input1");
+        input2.HasNan("PerDimMeanVarNormalization-input2");
 #endif
-            functionValues.AssignDifferenceOf(input0, input1);
-            functionValues.ColumnElementMultiplyWith(input2);
+        functionValues.AssignDifferenceOf(input0, input1);
+        functionValues.ColumnElementMultiplyWith(input2);
 #if NANCHECK
-            functionValues.HasNan("PerDimMeanVarNormalization");
+        functionValues.HasNan("PerDimMeanVarNormalization");
 #endif
 #if DUMPOUTPUT
-            functionValues.Print("PerDimMeanVarNormalizationNode");
+        functionValues.Print("PerDimMeanVarNormalizationNode");
 #endif
-        }
+    }
 
     virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
-        {
-            Base::Validate(isFinalValidationPass);
-            InferMBLayoutFromInputsForStandardCase();
+    {
+        Base::Validate(isFinalValidationPass);
+        InferMBLayoutFromInputsForStandardCase();
 
-            if (Input(0)->RequiresPreCompute())
-            {
-                LogicError(
-                    "PerDimMeanVarNormalizationNode criterion forbids first input from being a pre-compute node. "
-                    "The first input should be the node whose output should be normalized, and the second and third inputs "
-                    "should be LearnableParameter type or (Mean, InvStdDev) so that the values will be saved.");
-            }
+        if (Input(0)->RequiresPreCompute())
+        {
+            LogicError(
+                "PerDimMeanVarNormalizationNode criterion forbids first input from being a pre-compute node. "
+                "The first input should be the node whose output should be normalized, and the second and third inputs "
+                "should be LearnableParameter type or (Mean, InvStdDev) so that the values will be saved.");
+        }
 
-            if (!(Input(1)->OperationName() == OperationNameOf(LearnableParameter) &&
-                  Input(2)->OperationName() == OperationNameOf(LearnableParameter)) &&
-                !(Input(1)->OperationName() == OperationNameOf(MeanNode) &&
-                  Input(2)->OperationName() == OperationNameOf(InvStdDevNode)))
-            {
-                LogicError(
-                    "PerDimMeanVarNormalizationNode criterion requires the last two inputs to be LearnableParameter "
-                    "type or (Mean, InvStdDev) so that the values will be saved.");
-            }
+        if (!(Input(1)->OperationName() == OperationNameOf(LearnableParameter) &&
+              Input(2)->OperationName() == OperationNameOf(LearnableParameter)) &&
+            !(Input(1)->OperationName() == OperationNameOf(MeanNode) &&
+              Input(2)->OperationName() == OperationNameOf(InvStdDevNode)))
+        {
+            LogicError(
+                "PerDimMeanVarNormalizationNode criterion requires the last two inputs to be LearnableParameter "
+                "type or (Mean, InvStdDev) so that the values will be saved.");
+        }
 
-            Input(1)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
-            Input(2)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
+        Input(1)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
+        Input(2)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
 
-            if (isFinalValidationPass)
-            {
-                if (!Input(0)->HasMBLayout() || Input(1)->HasMBLayout() || Input(2)->HasMBLayout())
-                    InvalidArgument("PerDimMeanVarNormalizationNode: Inputs must be data, while mean and InvStdDev must be column vectors.");
-                if (!Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout()) || !Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(2)->GetSampleLayout()))
-                    InvalidArgument("PerDimMeanVarNormalizationNode: All inputs should have same sample layout.");
-            }
+        if (isFinalValidationPass)
+        {
+            if (!Input(0)->HasMBLayout() || Input(1)->HasMBLayout() || Input(2)->HasMBLayout())
+                InvalidArgument("PerDimMeanVarNormalizationNode: Inputs must be data, while mean and InvStdDev must be column vectors.");
+            if (!Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout()) || !Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(2)->GetSampleLayout()))
+                InvalidArgument("PerDimMeanVarNormalizationNode: All inputs should have same sample layout.");
+        }
 
-            // TODO: Is this correct? Why not just skip propagating a gradient into these? We should not poke around in our children.
-            Input(1)->SetParameterUpdateRequired(false); // prevent learning
-            Input(2)->SetParameterUpdateRequired(false);
+        // TODO: Is this correct? Why not just skip propagating a gradient into these? We should not poke around in our children.
+        Input(1)->SetParameterUpdateRequired(false); // prevent learning
+        Input(2)->SetParameterUpdateRequired(false);
 
-            SetDims(Input(0));
-        }
-    };
+        SetDims(Input(0));
+    }
+};
 
-    template class PerDimMeanVarNormalizationNode<float>;
-    template class PerDimMeanVarNormalizationNode<double>;
+template class PerDimMeanVarNormalizationNode<float>;
+template class PerDimMeanVarNormalizationNode<double>;
 
-    // -----------------------------------------------------------------------
-    // PerDimMeanVarDeNormalizationNode (feature, mean, invStdDev)
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// PerDimMeanVarDeNormalizationNode (feature, mean, invStdDev)
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    class PerDimMeanVarDeNormalizationNode : public ComputationNode<ElemType>, public NumInputs<3>
-    {
+class PerDimMeanVarDeNormalizationNode : public ComputationNode<ElemType>, public NumInputs<3>
+{
     typedef ComputationNode<ElemType> Base;
     UsingComputationNodeMembersBoilerplate;
     static const std::wstring TypeName()
@@ -523,109 +529,109 @@ template <class ElemType>
         return L"PerDimMeanVarDeNormalization";
     }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(PerDimMeanVarDeNormalizationNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(PerDimMeanVarDeNormalizationNode);
     PerDimMeanVarDeNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name)
     {
     }
 
     virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) override
-        {
-            InvalidArgument("PerDimMeanVarDeNormalizationNode should only be called in the evaluation stage.");
-        }
+    {
+        InvalidArgument("PerDimMeanVarDeNormalizationNode should only be called in the evaluation stage.");
+    }
 
-        //(feature-mean).*InvStdDev
+    //(feature-mean).*InvStdDev
     virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
-        {
-            // only feature (input0) and output needs to be sliced
-            auto sliceInput0Value = Input(0)->ValueFor(fr);
-            auto sliceOutputValue = ValueFor(fr);
+    {
+        // only feature (input0) and output needs to be sliced
+        auto sliceInput0Value = Input(0)->ValueFor(fr);
+        auto sliceOutputValue = ValueFor(fr);
 
-            ForwardPropS(sliceOutputValue, sliceInput0Value, Input(1)->Value(), Input(2)->Value());
-        }
+        ForwardPropS(sliceOutputValue, sliceInput0Value, Input(1)->Value(), Input(2)->Value());
+    }
 
     /*TODO: merge with call site*/ void ForwardPropS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0,
-                                             const Matrix<ElemType>& input1, const Matrix<ElemType>& input2)
-        {
-    #if DUMPOUTPUT
-            //input0.Print("PerDimMeanVarDeNormalization-input0");
-            //input1.Print("PerDimMeanVarDeNormalization-input1");
-            //input2.Print("PerDimMeanVarDeNormalization-input2");
-    #endif
-
-    #if NANCHECK
-            input0.HasNan("PerDimMeanVarDeNormalization-input0");
-            input1.HasNan("PerDimMeanVarDeNormalization-input1");
-            input2.HasNan("PerDimMeanVarDeNormalization-input2");
-    #endif
-            //functionValues.AssignDifferenceOf(input0, input1);
-            //functionValues.ColumnElementMultiplyWith(input2);
-            //functionValues.AssignDifferenceOf(input0, input0);
-            //functionValues += input2;
-            //functionValues.ElementInverse();
-            //functionValues.ElementMultiplyWith(input0);
-            functionValues.SetValue(input0);
-            functionValues.ColumnElementDivideBy(input2);
-            functionValues += input1;
-    #if NANCHECK
-            functionValues.HasNan("PerDimMeanVarDeNormalization");
-    #endif
-    #if DUMPOUTPUT
-            functionValues.Print("PerDimMeanVarDeNormalizationNode");
-    #endif
-        }
+                                                     const Matrix<ElemType>& input1, const Matrix<ElemType>& input2)
+    {
+#if DUMPOUTPUT
+//input0.Print("PerDimMeanVarDeNormalization-input0");
+//input1.Print("PerDimMeanVarDeNormalization-input1");
+//input2.Print("PerDimMeanVarDeNormalization-input2");
+#endif
+
+#if NANCHECK
+        input0.HasNan("PerDimMeanVarDeNormalization-input0");
+        input1.HasNan("PerDimMeanVarDeNormalization-input1");
+        input2.HasNan("PerDimMeanVarDeNormalization-input2");
+#endif
+        //functionValues.AssignDifferenceOf(input0, input1);
+        //functionValues.ColumnElementMultiplyWith(input2);
+        //functionValues.AssignDifferenceOf(input0, input0);
+        //functionValues += input2;
+        //functionValues.ElementInverse();
+        //functionValues.ElementMultiplyWith(input0);
+        functionValues.SetValue(input0);
+        functionValues.ColumnElementDivideBy(input2);
+        functionValues += input1;
+#if NANCHECK
+        functionValues.HasNan("PerDimMeanVarDeNormalization");
+#endif
+#if DUMPOUTPUT
+        functionValues.Print("PerDimMeanVarDeNormalizationNode");
+#endif
+    }
 
     virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
-        {
-            Base::Validate(isFinalValidationPass);
-            InferMBLayoutFromInputsForStandardCase();
+    {
+        Base::Validate(isFinalValidationPass);
+        InferMBLayoutFromInputsForStandardCase();
 
-            if (Input(0)->RequiresPreCompute())
-            {
-                LogicError(
-                    "PerDimMeanVarDeNormalizationNode criterion forbids first input from being a pre-compute node. "
-                    "The first input should be the node whose output should be de-normalized, and the second and third inputs "
-                    "should be LearnableParameter type or (Mean, InvStdDev) so that the values will be saved.");
-            }
+        if (Input(0)->RequiresPreCompute())
+        {
+            LogicError(
+                "PerDimMeanVarDeNormalizationNode criterion forbids first input from being a pre-compute node. "
+                "The first input should be the node whose output should be de-normalized, and the second and third inputs "
+                "should be LearnableParameter type or (Mean, InvStdDev) so that the values will be saved.");
+        }
 
-            if (!(Input(1)->OperationName() == OperationNameOf(LearnableParameter) &&
-                  Input(2)->OperationName() == OperationNameOf(LearnableParameter)) &&
-                !(Input(1)->OperationName() == OperationNameOf(MeanNode) &&
-                  Input(2)->OperationName() == OperationNameOf(InvStdDevNode)))
-            {
-                LogicError(
-                    "PerDimMeanVarDeNormalizationNode criterion requires the last two inputs to be "
-                    "LearnableParameter type or (Mean, InvStdDev) so that the values will be saved.");
-            }
+        if (!(Input(1)->OperationName() == OperationNameOf(LearnableParameter) &&
+              Input(2)->OperationName() == OperationNameOf(LearnableParameter)) &&
+            !(Input(1)->OperationName() == OperationNameOf(MeanNode) &&
+              Input(2)->OperationName() == OperationNameOf(InvStdDevNode)))
+        {
+            LogicError(
+                "PerDimMeanVarDeNormalizationNode criterion requires the last two inputs to be "
+                "LearnableParameter type or (Mean, InvStdDev) so that the values will be saved.");
+        }
 
-            Input(1)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
-            Input(2)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
+        Input(1)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
+        Input(2)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
 
-            if (isFinalValidationPass)
-            {
-                if (!Input(0)->HasMBLayout() || Input(1)->HasMBLayout() || Input(2)->HasMBLayout())
-                    InvalidArgument("PerDimMeanVarDeNormalizationNode: Inputs must be data, while mean and InvStdDev must be column vectors.");
-                if (!Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout()) || !Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(2)->GetSampleLayout()))
-                    InvalidArgument("PerDimMeanVarDeNormalizationNode: All inputs should have same sample layout.");
-            }
+        if (isFinalValidationPass)
+        {
+            if (!Input(0)->HasMBLayout() || Input(1)->HasMBLayout() || Input(2)->HasMBLayout())
+                InvalidArgument("PerDimMeanVarDeNormalizationNode: Inputs must be data, while mean and InvStdDev must be column vectors.");
+            if (!Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout()) || !Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(2)->GetSampleLayout()))
+                InvalidArgument("PerDimMeanVarDeNormalizationNode: All inputs should have same sample layout.");
+        }
 
-            // TODO: Is this correct? Why not just skip propagating a gradient into these? We should not poke around in our children.
-            Input(1)->SetParameterUpdateRequired(false); // prevent learning
-            Input(2)->SetParameterUpdateRequired(false);
+        // TODO: Is this correct? Why not just skip propagating a gradient into these? We should not poke around in our children.
+        Input(1)->SetParameterUpdateRequired(false); // prevent learning
+        Input(2)->SetParameterUpdateRequired(false);
 
-            SetDims(Input(0));
-        }
-    };
+        SetDims(Input(0));
+    }
+};
 
-    template class PerDimMeanVarDeNormalizationNode<float>;
-    template class PerDimMeanVarDeNormalizationNode<double>;
+template class PerDimMeanVarDeNormalizationNode<float>;
+template class PerDimMeanVarDeNormalizationNode<double>;
 
-    // -----------------------------------------------------------------------
-    // BatchModeNode
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// BatchModeNode
+// -----------------------------------------------------------------------
 
-    /**
+/**
     BatchModeNode is a derivative of ComputationNode.
     It additionally check if needs to process data in batch before processing its parent
     This is used in case of beam search decoding. Batchmode node must be processed before other nodes.
@@ -634,85 +640,85 @@ template <class ElemType>
     */
 template <class ElemType>
 class BatchModeNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>
-    {
-        // all nodes require precomputation should derive from this class
+{
+    // all nodes require precomputation should derive from this class
     typedef ComputationNodeNonLooping<ElemType> Base;
     UsingComputationNodeMembers;
 
-    public:
-        //virtual ComputationNodeBase * NewThis(DEVICEID_TYPE deviceId, const wstring & name) = 0;
-        //DeclareConstructorFromConfigWithNumInputs(BatchModeNode);
+public:
+    //virtual ComputationNodeBase * NewThis(DEVICEID_TYPE deviceId, const wstring & name) = 0;
+    //DeclareConstructorFromConfigWithNumInputs(BatchModeNode);
     BatchModeNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name),
-            m_memory(deviceId)
+          m_memory(deviceId)
     {
     }
 
-        virtual bool HasComputed() const = 0;
-        virtual void MarkComputed(const bool hasComputed) = 0;
+    virtual bool HasComputed() const = 0;
+    virtual void MarkComputed(const bool hasComputed) = 0;
 
-        virtual void Save(File& fstream) const override
-        {
-            Base::Save(fstream);
-            fstream << m_hasComputed;
-            fstream << Value();
-        }
+    virtual void Save(File& fstream) const override
+    {
+        Base::Save(fstream);
+        fstream << m_hasComputed;
+        fstream << Value();
+    }
 
-        virtual void Load(File& fstream, size_t modelVersion) override
-        {
-            Base::Load(fstream, modelVersion);
-            fstream >> m_hasComputed;
-            LoadValue(fstream);
-        }
+    virtual void Load(File& fstream, size_t modelVersion) override
+    {
+        Base::Load(fstream, modelVersion);
+        fstream >> m_hasComputed;
+        LoadValue(fstream);
+    }
 
-        virtual void DumpNodeInfo(const bool printValues, File& fstream) const override
-        {
-            Base::DumpNodeInfo(printValues, fstream);
+    virtual void DumpNodeInfo(const bool printValues, File& fstream) const override
+    {
+        Base::DumpNodeInfo(printValues, fstream);
 
-            const size_t BUFLEN = 4096;
-            WCHAR str[BUFLEN];
-            swprintf(str, BUFLEN, L"[%s%s]  ", string(GetSampleLayout()).c_str(), HasMBLayout() ? " x *" : "");
-            fstream << wstring(str);
-            swprintf(str, BUFLEN, L"HasComputed=%ls", HasComputed() ? L"true" : L"false");
-            fstream << wstring(str);
+        const size_t BUFLEN = 4096;
+        WCHAR str[BUFLEN];
+        swprintf(str, BUFLEN, L"[%s%s]  ", string(GetSampleLayout()).c_str(), HasMBLayout() ? " x *" : "");
+        fstream << wstring(str);
+        swprintf(str, BUFLEN, L"HasComputed=%ls", HasComputed() ? L"true" : L"false");
+        fstream << wstring(str);
 
-            PrintNodeValuesToFile(printValues, fstream);
-        }
+        PrintNodeValuesToFile(printValues, fstream);
+    }
 
-    protected:
+protected:
     Matrix<ElemType> m_memory; // the memory of input or output
-        bool m_hasComputed;
-    };
+    bool m_hasComputed;
+};
 
-    // add this at the start of each derived class, to get access to the members of ComputationNode
-    // See #define of 'UsingComputationNodeMembersBoilerplate' for more explanation.
+// add this at the start of each derived class, to get access to the members of ComputationNode
+// See #define of 'UsingComputationNodeMembersBoilerplate' for more explanation.
 #define UsingBatchModeNodeMembers           \
     UsingComputationNodeMembersBoilerplate; \
-                                            \
+    \
 protected:                                  \
     using Base::m_memory;                   \
     using Base::m_hasComputed;              \
-                                            \
+    \
 public:                                     \
     using Base::HasComputed;                \
     using Base::MarkComputed
 
-    // -----------------------------------------------------------------------
-    // TimeReverseNode (input)
-    // BUGBUG: This must actually implement reversing the layout.
-    // Challenge: This reverses the layout. If we time-reverse back, we'd reverse the layout again.
-    // We will get the original layout. Unfortunately, it is not the same layout pointer.
-    // To turn it back to the same layout pointer, insert a ReconcileMBLayout node.
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// TimeReverseNode (input)
+// BUGBUG: This must actually implement reversing the layout.
+// Challenge: This reverses the layout. If we time-reverse back, we'd reverse the layout again.
+// We will get the original layout. Unfortunately, it is not the same layout pointer.
+// To turn it back to the same layout pointer, insert a ReconcileMBLayout node.
+// -----------------------------------------------------------------------
 
-    /**
+/**
     Developed by Kaisheng Yao.
     This node is used in the following work
     K. Yao and G. Zweig, "Sequence-to-Sequence Neural Net Models for Grapheme-to-Phoneme Conversion", submitted to INTERSPEECH 2015
     */
 template <class ElemType>
-    class TimeReverseNode : public BatchModeNode<ElemType>, public NumInputs<1>
-    {
+class TimeReverseNode : public BatchModeNode<ElemType>, public NumInputs<1>
+{
     typedef BatchModeNode<ElemType> Base;
     UsingBatchModeNodeMembers;
     static const std::wstring TypeName()
@@ -720,23 +726,23 @@ template <class ElemType>
         return L"TimeReverse";
     }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(TimeReverseNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(TimeReverseNode);
     TimeReverseNode(DEVICEID_TYPE deviceId, const wstring& name)
         : BatchModeNode<ElemType>(deviceId, name)
     {
     }
 
-        virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
+    virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
+    {
+        Base::CopyTo(nodeP, newName, flags);
+        if (flags & CopyNodeFlags::copyNodeValue)
         {
-            Base::CopyTo(nodeP, newName, flags);
-            if (flags & CopyNodeFlags::copyNodeValue)
-            {
-                auto node = dynamic_pointer_cast<TimeReverseNode<ElemType>>(nodeP);
-                // TODO: m_memory is never used inside this class, just assigned. Can it not be assigned?
-                node->m_memory = m_memory;
-            }
+            auto node = dynamic_pointer_cast<TimeReverseNode<ElemType>>(nodeP);
+            // TODO: m_memory is never used inside this class, just assigned. Can it not be assigned?
+            node->m_memory = m_memory;
         }
+    }
 
     virtual bool HasComputed() const
     {
@@ -747,130 +753,130 @@ template <class ElemType>
         m_hasComputed = hasComputed;
     }
 
-        virtual void BackpropToNonLooping(size_t inputIndex) override
-        {
+    virtual void BackpropToNonLooping(size_t inputIndex) override
+    {
         assert(inputIndex == 0);
         inputIndex;
-            VerifyDims(Input(0));
+        VerifyDims(Input(0));
 
-            size_t nT = GetNumTimeSteps();
-            for (size_t t = 0; t < nT; t++)
-            {
+        size_t nT = GetNumTimeSteps();
+        for (size_t t = 0; t < nT; t++)
+        {
             Matrix<ElemType> g = GradientFor(FrameRange(GetMBLayout(), t));
-                Matrix<ElemType> ig = Input(0)->GradientFor(FrameRange(Input(0)->GetMBLayout(), nT - 1 - t));
-                ig += g;
-            }
+            Matrix<ElemType> ig = Input(0)->GradientFor(FrameRange(Input(0)->GetMBLayout(), nT - 1 - t));
+            ig += g;
         }
+    }
 
-        virtual bool OutputUsedInComputingInputNodesGradients() const override
-        {
-            // The TimeReverseNode does not require its output value for computing
-            // the gradients of its input nodes
-            return false;
-        }
+    virtual bool OutputUsedInComputingInputNodesGradients() const override
+    {
+        // The TimeReverseNode does not require its output value for computing
+        // the gradients of its input nodes
+        return false;
+    }
 
-        virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
-        {
-            // The TimeReverseNode does not require any of it's input's values for computing
-            // the gradients of its input nodes
-            UNREFERENCED_PARAMETER(childIndex);
-            return false;
-        }
+    virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
+    {
+        // The TimeReverseNode does not require any of its inputs' values for computing
+        // the gradients of its input nodes
+        UNREFERENCED_PARAMETER(childIndex);
+        return false;
+    }
 
     virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
+    {
+        // BUGBUG: We must flip the layout, too.
+        if (GetNumParallelSequences() != 1)
+            LogicError("%ls %ls operation not implemented for multiple parallel sequences. It does not flip the layout either. I.e. only works for a single utterance.", NodeName().c_str(), OperationName().c_str());
+        if (!m_hasComputed)
         {
-            // BUGBUG: We must flip the layout, too.
-            if (GetNumParallelSequences() != 1)
-                LogicError("%ls %ls operation not implemented for multiple parallel sequences. It does not flip the layout either. I.e. only works for a single utterance.", NodeName().c_str(), OperationName().c_str());
-            if (!m_hasComputed)
-            {
-                // this assumes this reverse node is called once, so it can set, instead add to, the function values
-                SetDims(Input(0));
-                UpdateFunctionValuesSize();
+            // this assumes this reverse node is called once, so it can set, instead of add to, the function values
+            SetDims(Input(0));
+            UpdateFunctionValuesSize();
 
-                size_t nT = GetNumTimeSteps();
-                for (size_t t = 0; t < nT; t++)
-                {
-                    Matrix<ElemType> v = Input(0)->ValueFor(FrameRange(Input(0)->GetMBLayout(), t));
-                    ValueFor(FrameRange(GetMBLayout(), nT - 1 - t)).SetValue(v);
-                }
+            size_t nT = GetNumTimeSteps();
+            for (size_t t = 0; t < nT; t++)
+            {
+                Matrix<ElemType> v = Input(0)->ValueFor(FrameRange(Input(0)->GetMBLayout(), t));
+                ValueFor(FrameRange(GetMBLayout(), nT - 1 - t)).SetValue(v);
+            }
 
 #if NANCHECK
-                Value().HasNan("TimeReverse");
+            Value().HasNan("TimeReverse");
 #endif
 #if DUMPOUTPUT
-                Value().Print("TimeReverseNode");
+            Value().Print("TimeReverseNode");
 #endif
 
-                m_memory.SetValue(Value());
-            }
-            // TODO: don't need to set m_hasCompute? Or what is it for?
+            m_memory.SetValue(Value());
         }
+        // TODO: don't need to set m_hasComputed? Or what is it for?
+    }
 
     virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
-        {
-            Base::Validate(isFinalValidationPass);
-            InferMBLayoutFromInputsForStandardCase();
-            if (isFinalValidationPass && !m_pMBLayout)
-                RuntimeError("%ls %ls operation makes no sense without a MB layout.", NodeName().c_str(), OperationName().c_str());
+    {
+        Base::Validate(isFinalValidationPass);
+        InferMBLayoutFromInputsForStandardCase();
+        if (isFinalValidationPass && !m_pMBLayout)
+            RuntimeError("%ls %ls operation makes no sense without a MB layout.", NodeName().c_str(), OperationName().c_str());
 
-            SetDims(Input(0));
-        }
+        SetDims(Input(0));
+    }
 
-    public:
+public:
     bool UnitTest()
     {
-            size_t nT = 3;
-            size_t nInput = 3;
-            size_t nOutput = nInput;
-
-            Input(0)->SetDims1(nInput, nT);
-            Input(0)->UpdateFunctionValuesSize();
-            Input(0)->Value().SetValue(0);
-            Input(0)->Value()(0, 0) = 1;
-            Input(0)->Value()(0, 1) = 2;
-            Input(0)->Value()(0, 2) = 3;
-            SetDims1(nOutput, nT);
-            UpdateFunctionValuesSize();
+        size_t nT = 3;
+        size_t nInput = 3;
+        size_t nOutput = nInput;
+
+        Input(0)->SetDims1(nInput, nT);
+        Input(0)->UpdateFunctionValuesSize();
+        Input(0)->Value().SetValue(0);
+        Input(0)->Value()(0, 0) = 1;
+        Input(0)->Value()(0, 1) = 2;
+        Input(0)->Value()(0, 2) = 3;
+        SetDims1(nOutput, nT);
+        UpdateFunctionValuesSize();
         Input(0)->Value().TransferToDeviceIfNotThere(m_deviceId, true);
-            ForwardProp(FrameRange(m_pMBLayout));
+        ForwardProp(FrameRange(m_pMBLayout));
 
-            /// check with expected values
-            if (!ISCLOSE(Value()(0, 0), 3, EPSILON) ||
-                !ISCLOSE(Value()(0, 1), 2, EPSILON) ||
-                !ISCLOSE(Value()(0, 2), 1, EPSILON))
-            {
-                return false;
-            }
+        /// check with expected values
+        if (!ISCLOSE(Value()(0, 0), 3, EPSILON) ||
+            !ISCLOSE(Value()(0, 1), 2, EPSILON) ||
+            !ISCLOSE(Value()(0, 2), 1, EPSILON))
+        {
+            return false;
+        }
 
         Value().TransferToDeviceIfNotThere(m_deviceId, true);
 
-            Input(0)->Gradient().Resize(nOutput, nT);
-            Input(0)->Gradient().SetValue(1.0);
-            Gradient().Resize(nOutput, nT);
-            Gradient().SetValue(0);
-            Gradient()(0, 0) = 1;
-            Gradient()(0, 1) = 2;
-            Gradient()(0, 2) = 3;
+        Input(0)->Gradient().Resize(nOutput, nT);
+        Input(0)->Gradient().SetValue(1.0);
+        Gradient().Resize(nOutput, nT);
+        Gradient().SetValue(0);
+        Gradient()(0, 0) = 1;
+        Gradient()(0, 1) = 2;
+        Gradient()(0, 2) = 3;
         Gradient().TransferToDeviceIfNotThere(m_deviceId, true);
 
-            BackpropTo(0, FrameRange(m_pMBLayout));
+        BackpropTo(0, FrameRange(m_pMBLayout));
 
-            /// check with expected values
-            if (!ISCLOSE(Input(0)->Gradient()(0, 0), 4, EPSILON) ||
-                !ISCLOSE(Input(0)->Gradient()(0, 1), 3, EPSILON) ||
-                !ISCLOSE(Input(0)->Gradient()(0, 2), 2, EPSILON))
-            {
-                return false;
-            }
+        /// check with expected values
+        if (!ISCLOSE(Input(0)->Gradient()(0, 0), 4, EPSILON) ||
+            !ISCLOSE(Input(0)->Gradient()(0, 1), 3, EPSILON) ||
+            !ISCLOSE(Input(0)->Gradient()(0, 2), 2, EPSILON))
+        {
+            return false;
+        }
 
-            Input(0)->Gradient().TransferToDeviceIfNotThere(m_deviceId, true);
-            Gradient().TransferToDeviceIfNotThere(m_deviceId, true);
+        Input(0)->Gradient().TransferToDeviceIfNotThere(m_deviceId, true);
+        Gradient().TransferToDeviceIfNotThere(m_deviceId, true);
 
-            return true;
-        }
-    };
+        return true;
+    }
+};
 
-    template class TimeReverseNode<float>;
-    template class TimeReverseNode<double>;
+template class TimeReverseNode<float>;
+template class TimeReverseNode<double>;
 } } }
diff --git a/Source/ComputationNetworkLib/ComputationNetwork.cpp b/Source/ComputationNetworkLib/ComputationNetwork.cpp
index 7682c1f1a07f..9f1190671259 100644
--- a/Source/ComputationNetworkLib/ComputationNetwork.cpp
+++ b/Source/ComputationNetworkLib/ComputationNetwork.cpp
@@ -1012,7 +1012,6 @@ void ComputationNetwork::PerformSVDecomposition(const map<wstring, float>& SVDCo
             redU.RowElementMultiplyWith(redS.Transpose());
             redVT.ColumnElementMultiplyWith(redS);
 
-
             // Step 2. create two new Parameter nodes and one Times node
             wstring leftChildName = name + L"-U";
             wstring rightChildName = name + L"-V";
diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h
index 34d82c412d43..78f254e58ab5 100644
--- a/Source/ComputationNetworkLib/ComputationNetwork.h
+++ b/Source/ComputationNetworkLib/ComputationNetwork.h
@@ -44,8 +44,8 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
 
     ComputationNetwork()
         : m_randomSeedOffset(0),
-        m_isCompiled(false),
-        m_pMBLayout(make_shared<MBLayout>())
+          m_isCompiled(false),
+          m_pMBLayout(make_shared<MBLayout>())
     {
     }
     ComputationNetwork(DEVICEID_TYPE deviceId)
@@ -402,7 +402,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
         {
             if (NodeNameExists(name))
                 nodes.push_back(GetNodeFromName(name));
-            }
+        }
         else
         {
             std::wstring head = name.substr(0, found);
@@ -416,8 +416,8 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
                 bool tailMatch = tail.empty() || nodeName.rfind(tail) == nodeName.size() - tail.size();
                 if (headMatch && tailMatch)
                     nodes.push_back(nodeIter->second);
-                }
             }
+        }
         return nodes;
     }
 
@@ -429,10 +429,10 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
     static void SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, unsigned long& dropOutSeed);
 
     template <class ElemType>
-    static void SetSeqParam(ComputationNetworkPtr net, 
-                            const ComputationNodeBasePtr criterionNode, 
+    static void SetSeqParam(ComputationNetworkPtr net,
+                            const ComputationNodeBasePtr criterionNode,
                             const double& hsmoothingWeight,
-                            const double& frameDropThresh, 
+                            const double& frameDropThresh,
                             const bool& doreferencealign,
                             const double& amf = 14.0f,
                             const double& lmf = 14.0f,
@@ -464,10 +464,22 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
     }
 
     // these are specified as such by the user
-    inline       std::vector<ComputationNodeBasePtr>& FeatureNodes()       { return m_features; }
-    inline const std::vector<ComputationNodeBasePtr>& FeatureNodes() const { return m_features; }
-    inline std::vector<ComputationNodeBasePtr>& LabelNodes()               { return m_labels; }
-    inline std::vector<ComputationNodeBasePtr>& FinalCriterionNodes()      { return m_finalCriteria; }
+    inline std::vector<ComputationNodeBasePtr>& FeatureNodes()
+    {
+        return m_features;
+    }
+    inline const std::vector<ComputationNodeBasePtr>& FeatureNodes() const
+    {
+        return m_features;
+    }
+    inline std::vector<ComputationNodeBasePtr>& LabelNodes()
+    {
+        return m_labels;
+    }
+    inline std::vector<ComputationNodeBasePtr>& FinalCriterionNodes()
+    {
+        return m_finalCriteria;
+    }
 
     inline std::vector<ComputationNodeBasePtr> CriterionNodesFrom(const wstring& criterionNodeName)
     {
@@ -475,12 +487,21 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
         ValidateSubNetwork(node);
         if (node->HasMBLayout() || node->GetSampleLayout().GetNumElements() != 1)
             InvalidArgument("%ls %ls operation is not a valid training or eval criterion node.", node->NodeName().c_str(), node->OperationName().c_str());
-        return std::vector<ComputationNodeBasePtr>{ node };
+        return std::vector<ComputationNodeBasePtr>{node};
     }
 
-    inline std::vector<ComputationNodeBasePtr>& EvaluationNodes() { return m_evalNodes; }
-    inline std::vector<ComputationNodeBasePtr>& OutputNodes()     { return m_outputNodes; }
-    inline std::vector<ComputationNodeBasePtr>& PairNodes()       { return m_pairNodes; }
+    inline std::vector<ComputationNodeBasePtr>& EvaluationNodes()
+    {
+        return m_evalNodes;
+    }
+    inline std::vector<ComputationNodeBasePtr>& OutputNodes()
+    {
+        return m_outputNodes;
+    }
+    inline std::vector<ComputationNodeBasePtr>& PairNodes()
+    {
+        return m_pairNodes;
+    }
 
     // -----------------------------------------------------------------------
     // node access
@@ -595,11 +616,11 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
     };
 
 protected:
-    // -----------------------------------------------------------------------
-    // construction
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// construction
+// -----------------------------------------------------------------------
 
-    // Copy constructor, should never be called.
+// Copy constructor, should never be called.
 #pragma warning(push)
 #pragma warning(disable : 4702) // this function is flagged but unclear why
     ComputationNetwork(const ComputationNetwork& /*deepCopyFrom*/)
@@ -691,7 +712,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
             else //node name is not found, dump all nodes
             {
                 fprintf(stderr, "Warning: node name %ls does not exist in the network. dumping all nodes.\n",
-                    nodeName.c_str());
+                        nodeName.c_str());
                 DumpAllNodesToFile(printValues, outputFile);
             }
         }
@@ -809,7 +830,10 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
         virtual void ForwardProp(const FrameRange&) override;
         virtual void EndForwardProp() override;
         virtual void BeginBackprop() override;
-        virtual void BackpropTo(const size_t inputIndex, const FrameRange&) override { NOT_IMPLEMENTED; }
+        virtual void BackpropTo(const size_t inputIndex, const FrameRange&) override
+        {
+            NOT_IMPLEMENTED;
+        }
         virtual void EndBackprop() override;
         virtual void Backprop(const FrameRange& fr, bool childrenInThisLoop, bool childrenInOuterLoop) override;
         virtual void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool);
@@ -827,7 +851,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
 
         SEQTraversalFlowControlNode(int loopId, ComputationNodeBasePtr cur)
             : m_loopId(loopId),
-            m_sourceNode(cur)
+              m_sourceNode(cur)
         {
             SetNodeName(L"Loop_" + m_sourceNode->NodeName());
         }
@@ -919,7 +943,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
         return vector<std::vector<ComputationNodeBasePtr>*>{&m_features, &m_labels, &m_finalCriteria, &m_evalNodes, &m_outputNodes, &m_pairNodes};
     }
 
-    // used for sentence boundary information passed from reader to reset RNN state 
+    // used for sentence boundary information passed from reader to reset RNN state
     // specify how the minibatch is packed for each sample
     // TODO: This will change once we allow for multiple inconsistent layouts.
     MBLayoutPtr m_pMBLayout; // note that this must be installed before doing anything that needs it (default leaves a nullptr)
diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp
index 1720aa1bd317..cf8464505662 100644
--- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp
+++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp
@@ -432,7 +432,7 @@ void ComputationNetwork::CompileNetwork()
     // :)
 
     // STEP: Some final details.
-    ResetEvalTimeStamps();     // invalidate all m_value fields. Really belongs into StartEvaluateMinibatchLoop()
+    ResetEvalTimeStamps(); // invalidate all m_value fields. Really belongs into StartEvaluateMinibatchLoop()
 
     fprintf(stderr, "\nPost-processing network complete.\n");
     m_isCompiled = true;
@@ -633,7 +633,7 @@ void ComputationNetwork::ValidateSubNetwork(const ComputationNodeBasePtr& rootNo
 }
 
 // helper to discover dimension changes
-static pair<TensorShape, bool> GetDims(const ComputationNodeBasePtr & node)
+static pair<TensorShape, bool> GetDims(const ComputationNodeBasePtr& node)
 {
     return make_pair(node->GetSampleLayout(), node->HasMBLayout());
 }
diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp
index 3444ca412696..5642121f450d 100644
--- a/Source/ComputationNetworkLib/ComputationNode.cpp
+++ b/Source/ComputationNetworkLib/ComputationNode.cpp
@@ -115,7 +115,7 @@ void ComputationNodeBase::ValidateBinaryReduce(bool isFinalValidationPass)
     m_pMBLayout = nullptr; // this node does not hold mini-batch data
     ValidateInferBinaryInputDims();
     if (isFinalValidationPass &&
-        !(Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout()) &&   // TODO: Do we need broadcasting for these cases?
+        !(Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout()) && // TODO: Do we need broadcasting for these cases?
           (Input(0)->GetMBLayout() == Input(1)->GetMBLayout() || !Input(0)->HasMBLayout() || !Input(1)->HasMBLayout())))
         LogicError("The Matrix dimensions or MB layout in the %ls %ls operation do not match.", NodeName().c_str(), OperationName().c_str());
     SetDims(TensorShape(1), false);
@@ -143,19 +143,19 @@ void ComputationNodeBase::ValidateInferBinaryInputDims()
 
 // in case of an error, we just back out, and leave it to outside code to detect errors
 template <class ElemType>
-void ComputationNode<ElemType>::ValidateInferInputDimsFrom(const TensorShape & otherShape)
+void ComputationNode<ElemType>::ValidateInferInputDimsFrom(const TensorShape& otherShape)
 {
-    if (OperationName() != OperationNameOf(LearnableParameter))   // only infer LearnableParameters (we can't propagate further)
+    if (OperationName() != OperationNameOf(LearnableParameter)) // only infer LearnableParameters (we can't propagate further)
         return;
 
     // see where we stand with our shape
     bool hasMissingDims = m_sampleLayout.GetRank() == 0 || m_sampleLayout.GetNumElements() == 0;
-    if (!hasMissingDims)        // all there--nothing to infer
+    if (!hasMissingDims) // all there--nothing to infer
         return;
 
     // infer at least one dimension
     if (otherShape.GetRank() == 0 || otherShape.GetNumElements() == 0)
-        return;// LogicError("ValidateInferInputDimsFrom: Inferred dimensions must not be empty.");
+        return; // LogicError("ValidateInferInputDimsFrom: Inferred dimensions must not be empty.");
 
     // if no dimensions have been set at all, copy otherShape
     // Don't verify dimensions in this case, because the node may have explicitly been defined as a vector of 0 elements.
@@ -164,10 +164,10 @@ void ComputationNode<ElemType>::ValidateInferInputDimsFrom(const TensorShape & o
         hasAnyDim |= dim != 0;
     if (!hasAnyDim)
         m_sampleLayout = otherShape;
-    else if (hasMissingDims)    // we got a pre-existing shape: If it has zeroes, we fill them in from otherShape
+    else if (hasMissingDims) // we got a pre-existing shape: If it has zeroes, we fill them in from otherShape
     {
         if (m_sampleLayout.GetRank() != 0 && m_sampleLayout.GetRank() != otherShape.GetRank())
-            return;// LogicError("ValidateInferInputDimsFrom: Inferred dimensions must match in rank.");
+            return; // LogicError("ValidateInferInputDimsFrom: Inferred dimensions must match in rank.");
         SmallVector<size_t> newDims = m_sampleLayout.GetDims();
         for (size_t i = 0; i < m_sampleLayout.GetRank(); i++)
             if (newDims[i] == 0)
diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h
index 87453a6c617b..26dca071ad9c 100644
--- a/Source/ComputationNetworkLib/ComputationNode.h
+++ b/Source/ComputationNetworkLib/ComputationNode.h
@@ -33,7 +33,7 @@
 
 #pragma warning(disable : 4267) // conversion from size_t to int or other types
 
-// version number to control how to read and write 
+// version number to control how to read and write
 #define CNTK_MODEL_VERSION_1 1
 #define CNTK_MODEL_VERSION_2 2
 #define CURRENT_CNTK_MODEL_VERSION 2
@@ -51,30 +51,30 @@ extern bool g_shareNodeValueMatrices;
 namespace Microsoft { namespace MSR { namespace CNTK {
 
 enum CopyNodeFlags // flags to be passed to the CopyTo() function
-    {
+{
     copyNodeNull = 0,                 // invalid value
     copyNodeValue = 1,                // copy everything but the children links
     copyNodeChildren = 2,             // only copy over children links
     copyNodeAll = 3,                  // copy everything
     copyNodeChildrenCrossNetwork = 4, // allow a cross network child copy
-    };
+};
 
 #pragma region base computation class
 
-    // =======================================================================
-    // IComputationNode -- set of methods that are to be implemented (or optionally overridable) by node implementations.
-    // =======================================================================
+// =======================================================================
+// IComputationNode -- set of methods that are to be implemented (or optionally overridable) by node implementations.
+// =======================================================================
 
-    class ComputationNodeBase;
+class ComputationNodeBase;
 struct /*interface*/ IComputationNode
-    {
-        typedef shared_ptr<ComputationNodeBase> ComputationNodeBasePtr;
+{
+    typedef shared_ptr<ComputationNodeBase> ComputationNodeBasePtr;
 
-        // --- these must be implemented by each node
+    // --- these must be implemented by each node
 
     virtual ComputationNodeBase* NewThis(DEVICEID_TYPE deviceId, const wstring& name) = 0;
-        // TODO: OperationName calls static TypeName which does not match the actual type names in that the 'Node' is missing.
-        virtual const std::wstring OperationName() const = 0;
+    // TODO: OperationName calls static TypeName which does not match the actual type names in that the 'Node' is missing.
+    virtual const std::wstring OperationName() const = 0;
 #define OperationNameOf(T) (T<float>::TypeName()) // convenience macro
 
     virtual void UpdateFunctionMBSize() = 0; // recalculate our column dimensions from MBLayout. Override to update temps.
@@ -87,86 +87,86 @@ struct /*interface*/ IComputationNode
     virtual void BackpropTo(const size_t inputIndex, const FrameRange&) = 0; // backprop gradient into one of the inputs
     virtual void EndBackprop() = 0;                                          // called after last iteration step of ComputeGradient()
 
-        // --- these are meant to be overridden by ControlFlowNodes
+    // --- these are meant to be overridden by ControlFlowNodes
 
     virtual void Backprop(const FrameRange& fr, bool childrenInThisLoop, bool childrenInOuterLoop) = 0;
 
-        // --- optional overrides that add functionality
+    // --- optional overrides that add functionality
 
-        // Any override must call Base version as well.
-        // Default implementations are in ComputationNodeBase or ComputationNode<ElemType>.
+    // Any override must call Base version as well.
+    // Default implementations are in ComputationNodeBase or ComputationNode<ElemType>.
 
     virtual void Validate(bool isFinalValidationPass) = 0; // main base validation function
-        virtual void Save(File& fstream) const = 0;
-        virtual void Load(File& /*fstream*/, size_t /*modelVersion*/) = 0;
-        virtual void CopyTo(ComputationNodeBasePtr node, const std::wstring& newName, const CopyNodeFlags flags) const = 0;
+    virtual void Save(File& fstream) const = 0;
+    virtual void Load(File& /*fstream*/, size_t /*modelVersion*/) = 0;
+    virtual void CopyTo(ComputationNodeBasePtr node, const std::wstring& newName, const CopyNodeFlags flags) const = 0;
 
     virtual void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) = 0; // request matrices needed to do node function value evaluation
     virtual void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool) = 0;  // release temp matrices that are only used by forward computation. Don't release matrices that need to be used in the gradient computation
-        virtual void AllocateGradientMatricesForInputs(MatrixPool& matrixPool) = 0;
+    virtual void AllocateGradientMatricesForInputs(MatrixPool& matrixPool) = 0;
     virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) = 0; // request matrices that are needed for gradient computation
     virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) = 0;  // release gradient and temp matrices that no longer needed after all the children's gradients are computed.
 
-        // --- optional overrides that describe a feature or property of the node
+    // --- optional overrides that describe a feature or property of the node
 
     virtual bool RequiresPreCompute() const = 0; // return true if the node's value should be computed before the normal training. e.g., mean and invStd of input features.
 
-        // --- optional overrides for more informative logging
+    // --- optional overrides for more informative logging
 
     virtual void PrintSelfBeforeValidation() const = 0; // called in validation loop right before Validate()
-        virtual void DumpNodeInfo(const bool /*printValues*/, File& fstream) const = 0;
+    virtual void DumpNodeInfo(const bool /*printValues*/, File& fstream) const = 0;
 
-    protected:
+protected:
     virtual ~IComputationNode()
     {
     }
-    };
+};
 
-    // =======================================================================
-    //  This provide a interface for stateful node (e.g., DelayNodeBase) and definition of state
-    //  This interface allows to Export and Import state from elsewhere 
-    //  It is needed when doing sub-minibatch implementation 
-    // =======================================================================
+// =======================================================================
+//  This provides an interface for stateful nodes (e.g., DelayNodeBase) and a definition of state.
+//  This interface allows state to be exported to and imported from elsewhere.
+//  It is needed for the sub-minibatch implementation.
+// =======================================================================
 
 class INodeState : public std::enable_shared_from_this<INodeState>
-    {
-    public:
+{
+public:
     virtual ~INodeState()
     {
     }
-    };
+};
 
-    struct /*interface*/ IStatefulNode
-    {
-        typedef std::shared_ptr<INodeState> NodeStatePtr;
-        virtual NodeStatePtr ExportState() = 0;
+struct /*interface*/ IStatefulNode
+{
+    typedef std::shared_ptr<INodeState> NodeStatePtr;
+    virtual NodeStatePtr ExportState() = 0;
     virtual void ImportState(const NodeStatePtr& state) = 0;
-    };
-    typedef IStatefulNode::NodeStatePtr NodeStatePtr;
+};
+typedef IStatefulNode::NodeStatePtr NodeStatePtr;
 
-    // =======================================================================
-    // ComputationNetworkOwnedNodeState -- class to collect ComputationNode members that are really owned by ComputationNetwork
-    // These members are only to be set, changed, and read by ComputationNetwork code.
-    // =======================================================================
+// =======================================================================
+// ComputationNetworkOwnedNodeState -- class to collect ComputationNode members that are really owned by ComputationNetwork
+// These members are only to be set, changed, and read by ComputationNetwork code.
+// =======================================================================
 
-    class ComputationNetwork;
-    struct ComputationNetworkOwnedNodeState
-    {
-        friend class ComputationNetwork;
+class ComputationNetwork;
+struct ComputationNetworkOwnedNodeState
+{
+    friend class ComputationNetwork;
 
     ComputationNetworkOwnedNodeState()
         : m_needsGradient(false), m_valueSharable(true)
-        {
-            PurgeStateForFormingRecurrentLoops();
-            m_isPartOfLoop = false;
-        }
+    {
+        PurgeStateForFormingRecurrentLoops();
+        m_isPartOfLoop = false;
+    }
 
     void CopyTo(ComputationNetworkOwnedNodeState& other) const
-        {
-            // TODO: is that really all we copy? (this is a result of refactoring, so it seems yes indeed). Should we at least ClearCache()?
-            other.m_isPartOfLoop = m_isPartOfLoop;
-            other.m_needsGradient = m_needsGradient;
-        }
+    {
+        // TODO: is that really all we copy? (this is a result of refactoring, so it seems yes indeed). Should we at least ClearCache()?
+        other.m_isPartOfLoop = m_isPartOfLoop;
+        other.m_needsGradient = m_needsGradient;
+    }
 
     bool IsPartOfLoop() const
     {
@@ -185,46 +185,46 @@ class INodeState : public std::enable_shared_from_this<INodeState>
     {
         return m_valueSharable;
     }
-        
+
 protected:                // TODO: should be fully encapsulated here
     bool m_needsGradient; // true if this node or any children need a gradient to be computed (for own consumption or propagation to somewhere in the child tree)
 
     bool m_valueSharable; // a flag is needed for memory share.
-                                // If it is false (e.g., learnableParameters/InputValue and those nodes are solely induced by learnableParameters), 
-                                // it will never be released to memory pool 
-    private:
+                          // If it is false (e.g., learnableParameters/InputValue and those nodes are solely induced by learnableParameters),
+                          // it will never be released to memory pool
+private:
     bool m_isPartOfLoop; // true if this loop is part of a recurrent loop
 
-    protected:
-        // owned by FormRecurrentLoops() and stuff it calls, only used from inside there (FormRecurrentLoops() calls PurgeStateForFormingRecurrentLoops() at its end to make that super-clear)
-        void PurgeStateForFormingRecurrentLoops()
-        {
-            m_loopId = -1;
-            m_visitedOrder = -1;
-            m_indexInLoop = 0;
-            m_visited = false;
-            m_index = -1;
-            m_minIndex = -1;
-            m_inStack = false;
-        }
+protected:
+    // owned by FormRecurrentLoops() and stuff it calls, only used from inside there (FormRecurrentLoops() calls PurgeStateForFormingRecurrentLoops() at its end to make that super-clear)
+    void PurgeStateForFormingRecurrentLoops()
+    {
+        m_loopId = -1;
+        m_visitedOrder = -1;
+        m_indexInLoop = 0;
+        m_visited = false;
+        m_index = -1;
+        m_minIndex = -1;
+        m_inStack = false;
+    }
 
     int m_loopId;       // index into m_allSEQNodes array, for use by reordering operation only
     int m_visitedOrder; // remembers order in which nodes were visited by EnumerateNodes(), but gets updated
     bool m_visited;     // note: also used by ValidateSubNetwork()
-        int m_indexInLoop;
-        // only used inside DetermineSCCs():
+    int m_indexInLoop;
+    // only used inside DetermineSCCs():
     int m_index;    // index denoting order in which nodes were visited in DetermineSCCs()
     int m_minIndex; // min of m_index over all nodes within a single loop
-        bool m_inStack;
-    };
+    bool m_inStack;
+};
 
-    // =======================================================================
-    // TimeStamp -- helper class to manage a "time stamp" (unique value) of a computation result to avoid recomputation
-    // =======================================================================
+// =======================================================================
+// TimeStamp -- helper class to manage a "time stamp" (unique value) of a computation result to avoid recomputation
+// =======================================================================
 
-    class TimeStamp
-    {
-    public:
+class TimeStamp
+{
+public:
     TimeStamp()
     {
         ResetEvalTimeStamp();
@@ -242,45 +242,45 @@ class INodeState : public std::enable_shared_from_this<INodeState>
         return m_evalTimeStamp;
     }
 
-        // create a new unique time stamp
+    // create a new unique time stamp
     void BumpEvalTimeStamp()
     {
         m_evalTimeStamp = CreateUniqId();
     }
 
-        // the difference is taken to take into account numeric overflow (which really should never happen for a 64-bit integer... but hey, it's free!)
+    // the difference is taken to take into account numeric overflow (which really should never happen for a 64-bit integer... but hey, it's free!)
     bool IsOlderThan(const TimeStamp& other) const
-        {
-            // BUGBUG: For some reason, we must test equality as well, although that does not indicate being older.
-            return GetEvalTimeStamp() - other.GetEvalTimeStamp() /*<*/ <= 0;
-        }
+    {
+        // BUGBUG: For some reason, we must test equality as well, although that does not indicate being older.
+        return GetEvalTimeStamp() - other.GetEvalTimeStamp() /*<*/ <= 0;
+    }
 
-        int64_t CreateUniqId() const
-        {
-            return /*1 +*/ atomic_fetch_add(&s_timeStampCounter, (unsigned long long int) 1);
-        }
+    int64_t CreateUniqId() const
+    {
+        return /*1 +*/ atomic_fetch_add(&s_timeStampCounter, (unsigned long long int) 1);
+    }
 
-    private:
-        static atomic_ullong s_timeStampCounter;
-        int64_t m_evalTimeStamp; //this is used to reduce unnecessary recomputation when a different node in the model is reevaluated
-    };
+private:
+    static atomic_ullong s_timeStampCounter;
+    int64_t m_evalTimeStamp; //this is used to reduce unnecessary recomputation when a different node in the model is reevaluated
+};
 
-    // =======================================================================
-    // ComputationNodeBase -- abstract base class for all computation nodes
-    // =======================================================================
+// =======================================================================
+// ComputationNodeBase -- abstract base class for all computation nodes
+// =======================================================================
 
 class ComputationNodeBase : public IComputationNode,
                             public /*protected*/ ComputationNetworkOwnedNodeState, // TODO: figure the 'protected' business out, somehow the 'friend' thing does not work
                             public TimeStamp,                                      // for time-stamp management
-        public ScriptableObjects::ComputationNodeObject,
+                            public ScriptableObjects::ComputationNodeObject,
                             public ScriptableObjects::WithTag,
                             public ScriptableObjects::HasName,
                             public ScriptableObjects::HasToString,
-        public std::enable_shared_from_this<ComputationNodeBase>
-    {
-        // note: enable_shared_from_this<> allows to create a shared_ptr from a raw pointer to this that is correctly aware of all other shared_ptrs (same ref count)
-    public:
-        typedef shared_ptr<ComputationNodeBase> ComputationNodeBasePtr;
+                            public std::enable_shared_from_this<ComputationNodeBase>
+{
+    // note: enable_shared_from_this<> allows creating a shared_ptr from a raw 'this' pointer that is correctly aware of all other shared_ptrs (same ref count)
+public:
+    typedef shared_ptr<ComputationNodeBase> ComputationNodeBasePtr;
 
     ComputationNodeBase(DEVICEID_TYPE deviceId, const wstring& name)
         : m_deviceId(deviceId), m_outputNeededDuringBackprop(true), m_parameterUpdateRequired(false), m_gradientInitialized(false), m_nodeName(name == L"" ? CreateUniqNodeName() : name)
@@ -290,147 +290,154 @@ class ComputationNodeBase : public IComputationNode,
     {
     }
 
-        virtual void CopyTo(ComputationNodeBasePtr node, const std::wstring& newName, const CopyNodeFlags flags) const
+    virtual void CopyTo(ComputationNodeBasePtr node, const std::wstring& newName, const CopyNodeFlags flags) const
+    {
+        if (OperationName() != node->OperationName())
+            RuntimeError("Cannot copy from one node type to another node type");
+        if (flags & CopyNodeFlags::copyNodeChildren)
         {
-            if (OperationName() != node->OperationName())
-                RuntimeError("Cannot copy from one node type to another node type");
-            if (flags & CopyNodeFlags::copyNodeChildren)
-            {
-                node->m_inputs = m_inputs;
-            }
-            if (flags & CopyNodeFlags::copyNodeValue)
-            {
-                node->m_deviceId = m_deviceId;
-                node->m_parameterUpdateRequired = m_parameterUpdateRequired;
-                node->m_nodeName = newName;
+            node->m_inputs = m_inputs;
+        }
+        if (flags & CopyNodeFlags::copyNodeValue)
+        {
+            node->m_deviceId = m_deviceId;
+            node->m_parameterUpdateRequired = m_parameterUpdateRequired;
+            node->m_nodeName = newName;
 
-                node->m_sampleLayout = m_sampleLayout;
+            node->m_sampleLayout = m_sampleLayout;
 
-                ComputationNetworkOwnedNodeState::CopyTo(*node);
-                TimeStamp::CopyTo(*node);
-            }
+            ComputationNetworkOwnedNodeState::CopyTo(*node);
+            TimeStamp::CopyTo(*node);
         }
+    }
 
-        virtual ComputationNodeBasePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) = 0;
+    virtual ComputationNodeBasePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) = 0;
 
-        // TODO: make sure this does not get implemented in any of the base classes
+    // TODO: make sure this does not get implemented in any of the base classes
     DEVICEID_TYPE GetDeviceId() const
     {
         return m_deviceId;
     } // TODO: remove, only used from copy constructor which will go away
 
-        virtual void Save(File& fstream) const
-        {
-            fstream << OperationName() << NodeName();
-        }
+    virtual void Save(File& fstream) const
+    {
+        fstream << OperationName() << NodeName();
+    }
 
-        virtual void Load(File& /*fstream*/, size_t /*modelVersion*/)
-        {
-            // it is assumed that OperationName and NodeName have already been consumed--some asymmetry between Save and Load
-            // base class has nothing to load
-        }
+    virtual void Load(File& /*fstream*/, size_t /*modelVersion*/)
+    {
+        // it is assumed that OperationName and NodeName have already been consumed--some asymmetry between Save and Load
+        // base class has nothing to load
+    }
 
-        // dimensions
-
-        // The value of a node is a tensor in one of two variants:
-        //
-        //  - single matrix, vector, tensor
-        //     - m_sampleLayout contains the shape. Accessed through GetSampleLayout().
-        //     - m_pMBLayout is null
-        //  - minibatch data
-        //     - consists of many samples which are all tensors of m_sampleLayout
-        //     - adds two additional tensor dimensions, time step and parallel sequence
-        //       These change for each minibatch and are unknown during validation.
-        //     - m_sampleLayout is the tensor shape of the samples
-        //     - m_pMBLayout defines the number of time steps and parallel sequences (="tensor shape" of the minibatch)
-        //       Accessed through GetMBLayout(); test for through HasMBLayout().
-        //
-        // The values can be accessed in three ways:
-        //
-        //  - as a tensor
-        //     - GetTensorShape() forms the joint tensor that incorporates both m_sampleLayout and, if present, m_pMBLayout
-        //        - Elementwise tensor operations operate on these.
-        //        - If no MBLayout is present in one of multiple elementwise operands, it will be interpreted as a one-sample minibatch that broadcasts to all samples.
-        //     - learnable parameters hold tensors that are not minibatches
-        //  - as a sample matrix
-        //     - many nodes do not care about the specific sample-tensor dimensions
-        //     - but may care about selecting a single time step out of a minibatch
-        //     - minibatch: each matrix column contains a sample tensor flattened, with one column per time step and parallel sequence
-        //     - tensor: one column containing the sample tensor flattened
-        //     - GetSampleMatrixNumRows(), GetSampleMatrixNumCols()
-        //  - as a Matrix reference
-        //     - actual object is a 2D tensor without MB Layout
-        //     - ValueAsMatrix(), GradientAsMatrix() returns tensor as a 2D Matrix object
-        //     - nodes that do this are: TimesNode, DiagTimesNode, ConvolutionNode, NoiseContrastiveEstimationNode, ClassBasedCrossEntropyWithSoftmaxNode, TransposeNode, DiagonalNode
-        //
-        // How values are stored:
-        //
-        //  - minibatch: Matrix of columns, where each column is a sample
-        //  - tensor: Matrix where column dimension contains all but the first dimension
-        //     - This only matters for sparse matrices, which cannot easily be Reshaped().
-        //       For those, we keep the underlying storage identical to the semantic meaning.
-
-        // interpretation as a set of samples
-        const TensorShape& GetSampleLayout() const { return m_sampleLayout; }
-        bool HasSampleLayout() const { return m_sampleLayout.GetRank() != 1; }      // does it have a layout that is not just a vector?
-
-        // interpretation as sample matrix (each column is a sample, individual sample tensor dimensions do not matter for the operation)
-        size_t GetSampleMatrixNumRows() const
-        {
-            return m_sampleLayout.GetNumElements();
-        }
-        size_t GetSampleMatrixNumCols() const
-        {
-            if (HasMBLayout())
-                return GetMBLayout()->GetNumCols();
-            else
-                return 1;   // no layout: treat as 1-sample minibatch that is meant to broadcast
-        }
-        // determine if we are the output of an op over 'other', whether that would be a reduction, so that we need to mask
-        bool ReducesInTimeWrt(const ComputationNodeBasePtr & other) const
-        {
-            return GetSampleMatrixNumCols() < other->GetSampleMatrixNumCols();
-        }
+    // dimensions
 
-        // interpretation as a Matrix reference
-    private:
-        void CheckTensorIsMatrix() const
-        {
-            if (HasMBLayout())
-                LogicError("CheckTensorIsMatrix: Minibatch data cannot be interpreted as a single 2D tensor.");
-            else if (m_sampleLayout.GetRank() < 1 || m_sampleLayout.GetRank() > 2)  // note: scalars are not stored as tensors of rank 0, but rather as 1-dim vectors. TODO: clean this up some day
-                LogicError("CheckTensorIsMatrix: Sample is not a column vector or matrix (1D or 2D tensor).");
-        }
-    public:
-        size_t GetAsMatrixNumRows() const
-        {
-            CheckTensorIsMatrix();
-            return m_sampleLayout[0];
-        }
-        size_t GetAsMatrixNumCols() const
-        {
-            CheckTensorIsMatrix();
-            return m_sampleLayout.GetRank() > 1 ? m_sampleLayout[1] : 1;    // a column vector is also a Matrix
-        }
+    // The value of a node is a tensor in one of two variants:
+    //
+    //  - single matrix, vector, tensor
+    //     - m_sampleLayout contains the shape. Accessed through GetSampleLayout().
+    //     - m_pMBLayout is null
+    //  - minibatch data
+    //     - consists of many samples which are all tensors of m_sampleLayout
+    //     - adds two additional tensor dimensions, time step and parallel sequence
+    //       These change for each minibatch and are unknown during validation.
+    //     - m_sampleLayout is the tensor shape of the samples
+    //     - m_pMBLayout defines the number of time steps and parallel sequences (="tensor shape" of the minibatch)
+    //       Accessed through GetMBLayout(); test for its presence through HasMBLayout().
+    //
+    // The values can be accessed in three ways:
+    //
+    //  - as a tensor
+    //     - GetTensorShape() forms the joint tensor that incorporates both m_sampleLayout and, if present, m_pMBLayout
+    //        - Elementwise tensor operations operate on these.
+    //        - If no MBLayout is present in one of multiple elementwise operands, it will be interpreted as a one-sample minibatch that broadcasts to all samples.
+    //     - learnable parameters hold tensors that are not minibatches
+    //  - as a sample matrix
+    //     - many nodes do not care about the specific sample-tensor dimensions
+    //     - but may care about selecting a single time step out of a minibatch
+    //     - minibatch: each matrix column contains a sample tensor flattened, with one column per time step and parallel sequence
+    //     - tensor: one column containing the sample tensor flattened
+    //     - GetSampleMatrixNumRows(), GetSampleMatrixNumCols()
+    //  - as a Matrix reference
+    //     - actual object is a 2D tensor without MB Layout
+    //     - ValueAsMatrix() and GradientAsMatrix() return the tensor as a 2D Matrix object
+    //     - nodes that do this are: TimesNode, DiagTimesNode, ConvolutionNode, NoiseContrastiveEstimationNode, ClassBasedCrossEntropyWithSoftmaxNode, TransposeNode, DiagonalNode
+    //
+    // How values are stored:
+    //
+    //  - minibatch: Matrix of columns, where each column is a sample
+    //  - tensor: Matrix where column dimension contains all but the first dimension
+    //     - This only matters for sparse matrices, which cannot easily be Reshaped().
+    //       For those, we keep the underlying storage identical to the semantic meaning.
 
-        // set dimensions of the node
-        // The MBLayout must be set first, and 'isMinibatch' will be checked against it.
-        void SetDims(const TensorShape& sampleLayout, bool isMinibatch)
-        {
-            if (HasMBLayout() != isMinibatch)
-                LogicError("SetDims: MBLayout must be set first, before calling this function, for %ls %ls operation.", NodeName().c_str(), OperationName().c_str());
-            m_sampleLayout = sampleLayout;
-        }
-        // copy dimensions (rows, cols, sample layout) from another node
-        void SetDims(const ComputationNodeBasePtr& node)
-        {
-            SetDims(node->GetSampleLayout(), node->HasMBLayout());
-        }
-        // use this only for testing code. Everywhere else, be explicit on the TensorShape.
-        void SetDims1(size_t rows, size_t cols)
-        {
-            SetDims(TensorShape(rows, cols), false);
-        }
+    // interpretation as a set of samples
+    const TensorShape& GetSampleLayout() const
+    {
+        return m_sampleLayout;
+    }
+    bool HasSampleLayout() const
+    {
+        return m_sampleLayout.GetRank() != 1;
+    } // does it have a layout that is not just a vector?
+
+    // interpretation as sample matrix (each column is a sample, individual sample tensor dimensions do not matter for the operation)
+    size_t GetSampleMatrixNumRows() const
+    {
+        return m_sampleLayout.GetNumElements();
+    }
+    size_t GetSampleMatrixNumCols() const
+    {
+        if (HasMBLayout())
+            return GetMBLayout()->GetNumCols();
+        else
+            return 1; // no layout: treat as 1-sample minibatch that is meant to broadcast
+    }
+    // determine whether we, as the output of an op over 'other', would be a reduction (fewer columns than 'other'), so that we need to mask
+    bool ReducesInTimeWrt(const ComputationNodeBasePtr& other) const
+    {
+        return GetSampleMatrixNumCols() < other->GetSampleMatrixNumCols();
+    }
+
+    // interpretation as a Matrix reference
+private:
+    void CheckTensorIsMatrix() const
+    {
+        if (HasMBLayout())
+            LogicError("CheckTensorIsMatrix: Minibatch data cannot be interpreted as a single 2D tensor.");
+        else if (m_sampleLayout.GetRank() < 1 || m_sampleLayout.GetRank() > 2) // note: scalars are not stored as tensors of rank 0, but rather as 1-dim vectors. TODO: clean this up some day
+            LogicError("CheckTensorIsMatrix: Sample is not a column vector or matrix (1D or 2D tensor).");
+    }
+
+public:
+    size_t GetAsMatrixNumRows() const
+    {
+        CheckTensorIsMatrix();
+        return m_sampleLayout[0];
+    }
+    size_t GetAsMatrixNumCols() const
+    {
+        CheckTensorIsMatrix();
+        return m_sampleLayout.GetRank() > 1 ? m_sampleLayout[1] : 1; // a column vector is also a Matrix
+    }
+
+    // set dimensions of the node
+    // The MBLayout must be set first, and 'isMinibatch' will be checked against it.
+    void SetDims(const TensorShape& sampleLayout, bool isMinibatch)
+    {
+        if (HasMBLayout() != isMinibatch)
+            LogicError("SetDims: MBLayout must be set first, before calling this function, for %ls %ls operation.", NodeName().c_str(), OperationName().c_str());
+        m_sampleLayout = sampleLayout;
+    }
+    // copy dimensions (rows, cols, sample layout) from another node
+    void SetDims(const ComputationNodeBasePtr& node)
+    {
+        SetDims(node->GetSampleLayout(), node->HasMBLayout());
+    }
+    // use this only for testing code. Everywhere else, be explicit on the TensorShape.
+    void SetDims1(size_t rows, size_t cols)
+    {
+        SetDims(TensorShape(rows, cols), false);
+    }
 #if 0
         // deprecated functions that did not distinguish the purpose
         size_t GetNumRows() const { return GetSampleMatrixNumRows(); }
@@ -458,66 +465,69 @@ class ComputationNodeBase : public IComputationNode,
             // actual memory allocation happens elsewhere
         }
 #endif
-        // get number of underlying matrix columns for test code only which does not create MBLayouts
-        size_t GetNumCols1() const { return GetSampleMatrixNumCols(); }    // dummy
-        virtual void NotifyFunctionValuesMBSizeModified() = 0;
-        void VerifyDims(const TensorShape & shape, bool isMinibatch)
-        {
-            if (m_sampleLayout.GetDims() != shape.GetDims() || HasMBLayout() != isMinibatch)
-            {
-                LogicError("VerifyDims: %ls %ls operation expected a %s of [%s], but it is a %s of [%s]",
-                           NodeName().c_str(), OperationName().c_str(),
-                           isMinibatch ?   "minibatch" : "tensor", string(shape).c_str(),
-                           HasMBLayout() ? "minibatch" : "tensor", string(m_sampleLayout).c_str());
-            }
-        }
-        virtual void VerifyDims(ComputationNodeBasePtr node)
+    // get number of underlying matrix columns for test code only which does not create MBLayouts
+    size_t GetNumCols1() const
+    {
+        return GetSampleMatrixNumCols();
+    } // dummy
+    virtual void NotifyFunctionValuesMBSizeModified() = 0;
+    void VerifyDims(const TensorShape& shape, bool isMinibatch)
+    {
+        if (m_sampleLayout.GetDims() != shape.GetDims() || HasMBLayout() != isMinibatch)
         {
-            VerifyDims(node->GetSampleLayout(), node->HasMBLayout());
+            LogicError("VerifyDims: %ls %ls operation expected a %s of [%s], but it is a %s of [%s]",
+                       NodeName().c_str(), OperationName().c_str(),
+                       isMinibatch ? "minibatch" : "tensor", string(shape).c_str(),
+                       HasMBLayout() ? "minibatch" : "tensor", string(m_sampleLayout).c_str());
         }
+    }
+    virtual void VerifyDims(ComputationNodeBasePtr node)
+    {
+        VerifyDims(node->GetSampleLayout(), node->HasMBLayout());
+    }
 
     TensorShape GetTensorShape(size_t rank) const; // form the actual tensor that describes the full object
 protected:
     size_t DetermineElementwiseTensorRank() const;                          // determine tensor rank when considering all inputs with padding
     TensorShape GetTensorSliceFor(size_t rank, const FrameRange& fr) const; // form tensor shape of the slice referenced by FrameRange
-    public:
-        // access to element(0,0) without having to type-cast
-        virtual double Get00Element() const = 0;
+public:
+    // access to element(0,0) without having to type-cast
+    virtual double Get00Element() const = 0;
 
-        // validation
-        // This is overridden by every node. This base class just checks for unconnected and empty inputs. Overrides must call their base version first.
-        virtual void Validate(bool isFinalValidationPass) // main base validation function
+    // validation
+    // This is overridden by every node. This base class just checks for unconnected and empty inputs. Overrides must call their base version first.
+    virtual void Validate(bool isFinalValidationPass) // main base validation function
+    {
+        // check for NULL pointers
+        for (size_t i = 0; i < m_inputs.size(); i++)
         {
-            // check for NULL pointers
-            for (size_t i = 0; i < m_inputs.size(); i++)
-            {
-                if (!m_inputs[i])
-                    RuntimeError("Validate: Input [%d] of %ls node '%ls' is empty (NULL, not connected).", (int) i, OperationName().c_str(), NodeName().c_str());
-            }
-            // check for empty inputs
-            if (isFinalValidationPass)
-            {
-                for (const auto& child : m_inputs)
-                    if (child->GetSampleMatrixNumRows() == 0)
-                        RuntimeError("%ls %ls operation: input %ls %ls has 0 elements.", NodeName().c_str(), OperationName().c_str(), child->NodeName().c_str(), child->OperationName().c_str());
-            }
+            if (!m_inputs[i])
+                RuntimeError("Validate: Input [%d] of %ls node '%ls' is empty (NULL, not connected).", (int) i, OperationName().c_str(), NodeName().c_str());
+        }
+        // check for empty inputs
+        if (isFinalValidationPass)
+        {
+            for (const auto& child : m_inputs)
+                if (child->GetSampleMatrixNumRows() == 0)
+                    RuntimeError("%ls %ls operation: input %ls %ls has 0 elements.", NodeName().c_str(), OperationName().c_str(), child->NodeName().c_str(), child->OperationName().c_str());
         }
-        // helper functions for common cases
-    protected:
-        void ValidateUnaryMap(bool isFinalValidationPass);
-        void ValidateUnaryReduce(bool isFinalValidationPass);
-        void ValidateInferBinaryInputDims();
-        void ValidateBinaryZip(bool isFinalValidationPass, bool allowMultiples);
-        void ValidateBinaryReduce(bool isFinalValidationPass);
-
-    public:
+    }
+    // helper functions for common cases
+protected:
+    void ValidateUnaryMap(bool isFinalValidationPass);
+    void ValidateUnaryReduce(bool isFinalValidationPass);
+    void ValidateInferBinaryInputDims();
+    void ValidateBinaryZip(bool isFinalValidationPass, bool allowMultiples);
+    void ValidateBinaryReduce(bool isFinalValidationPass);
+
+public:
     virtual bool UnitTest()
     {
         return true;
     }
 
-        virtual void AttachInputs(const std::vector<ComputationNodeBasePtr>& inputs) = 0;
-        // convenience versions that take individual arguments
+    virtual void AttachInputs(const std::vector<ComputationNodeBasePtr>& inputs) = 0;
+    // convenience versions that take individual arguments
     void AttachInputs(const ComputationNodeBasePtr& singleInput)
     {
         AttachInputs(std::vector<ComputationNodeBasePtr>{singleInput});
@@ -548,28 +558,28 @@ class ComputationNodeBase : public IComputationNode,
         m_inputs.clear();
     }
 
-        // helper for the factory function for ComputationNodes
-        static vector<ComputationNodeBasePtr> GetInputsFromConfig(const ScriptableObjects::IConfigRecordPtr configp)
-        {
-            vector<ComputationNodeBasePtr> inputs;
+    // helper for the factory function for ComputationNodes
+    static vector<ComputationNodeBasePtr> GetInputsFromConfig(const ScriptableObjects::IConfigRecordPtr configp)
+    {
+        vector<ComputationNodeBasePtr> inputs;
         const auto* inputsArg = configp->Find(L"inputs");
-            if (inputsArg)
-            {
+        if (inputsArg)
+        {
             if (inputsArg->Is<ComputationNodeBase>()) // single arg
-                    inputs.push_back(*inputsArg);
+                inputs.push_back(*inputsArg);
             else // a whole vector
-                {
-                    ScriptableObjects::ConfigArrayPtr inputsArray = *inputsArg;
-                    const auto range = inputsArray->GetIndexRange();
+            {
+                ScriptableObjects::ConfigArrayPtr inputsArray = *inputsArg;
+                const auto range = inputsArray->GetIndexRange();
                 for (int i = range.first; i <= range.second; i++) // pull them. This will resolve all of them.
                     inputs.push_back(inputsArray->At(i, [](const wstring&)
                                                      {
                                                          LogicError("GetInputs: out of bounds index while iterating??");
                                                      }));
-                }
             }
-            return inputs;
         }
+        return inputs;
+    }
 
     const std::vector<ComputationNodeBasePtr>& GetInputs() const
     {
@@ -580,21 +590,21 @@ class ComputationNodeBase : public IComputationNode,
         return m_inputs[index];
     }
 
-        //return true if the node's value should be computed before the normal training. e.g., mean and invStd of input features.
+    // returns true if the node's value should be computed before normal training, e.g., mean and invStd of input features.
     virtual bool /*IComputationNode::*/ RequiresPreCompute() const
     {
         return false;
     }
 
-        // casting helpers
+    // casting helpers
     template <typename N>
     N* As()
-        {
-            auto p = dynamic_cast<N*>(this);
-            if (!p)
-                LogicError("Attempted to type-cast node %ls %ls to %s, which is not possible.", NodeName().c_str(), OperationName().c_str(), typeid(N).name());
-            return p;
-        }
+    {
+        auto p = dynamic_cast<N*>(this);
+        if (!p)
+            LogicError("Attempted to type-cast node %ls %ls to %s, which is not possible.", NodeName().c_str(), OperationName().c_str(), typeid(N).name());
+        return p;
+    }
     template <typename N>
     bool Is()
     {
@@ -602,10 +612,10 @@ class ComputationNodeBase : public IComputationNode,
     }
 
     /*HasName::*/ void SetName(const std::wstring& newName) // also for use by ExperimentalNetworkBuilder
-        {
-            m_nodeName = newName;
-            fprintf(stderr, "Node --> %ls = %ls\n", NodeName().c_str(), OperationName().c_str()), fflush(stderr);
-        }
+    {
+        m_nodeName = newName;
+        fprintf(stderr, "Node --> %ls = %ls\n", NodeName().c_str(), OperationName().c_str()), fflush(stderr);
+    }
 
     void LinkToMBLayout(MBLayoutPtr pMBLayout)
     {
@@ -625,69 +635,69 @@ class ComputationNodeBase : public IComputationNode,
         return m_nodeName;
     }
 
-        // temporary function that is called to verify stuff is called as I think it is. Delete if this does not fire for a while.
-        void VerifyNumParallelSequences(size_t bsz)
-        {
-            if (bsz != m_pMBLayout->GetNumParallelSequences())
-                LogicError("VerifyNumParallelSequences: value inconsistent with MB layout");
-        }
+    // temporary function that is called to verify stuff is called as I think it is. Delete if this does not fire for a while.
+    void VerifyNumParallelSequences(size_t bsz)
+    {
+        if (bsz != m_pMBLayout->GetNumParallelSequences())
+            LogicError("VerifyNumParallelSequences: value inconsistent with MB layout");
+    }
 
-    protected:
+protected:
 public: // ...the following should be protected, but nodes inquire about their children, requiring public access
-        size_t GetNumParallelSequences() const
-        {
+    size_t GetNumParallelSequences() const
+    {
 #if 1
         if (!m_pMBLayout) // TODO: temporary workaround to Check_t() calls which call this. TODO: Delete the first arg from Check_t() after memshare merge.
-                return SIZE_MAX;
+            return SIZE_MAX;
 #endif
-            return m_pMBLayout->GetNumParallelSequences();
-        }
+        return m_pMBLayout->GetNumParallelSequences();
+    }
 
-        // get our current number of time steps for this node
-        // This inquires the MB layout.
-        size_t GetNumTimeSteps() const
-        {
-            if (!m_pMBLayout)
-                LogicError("GetNumTimeSteps: invalid to call on a node without MB layout"); // since it has no notion of time
-            return m_pMBLayout->GetNumTimeSteps();
-        }
+    // get our current number of time steps for this node
+    // This inquires the MB layout.
+    size_t GetNumTimeSteps() const
+    {
+        if (!m_pMBLayout)
+            LogicError("GetNumTimeSteps: invalid to call on a node without MB layout"); // since it has no notion of time
+        return m_pMBLayout->GetNumTimeSteps();
+    }
 
 public:
-        // implemented by ComputationNode<ElemType>
-        // for debugging purpose
-        virtual void PrintSelf(bool printMatrices = false) const = 0;
+    // implemented by ComputationNode<ElemType>
+    // for debugging purpose
+    virtual void PrintSelf(bool printMatrices = false) const = 0;
 
-        // called in validation loop right before Validate()
-        virtual void /*IComputationNode::*/ PrintSelfBeforeValidation() const
-        {
-            fprintf(stderr, "\nValidating --> %ls = %ls", NodeName().c_str(), OperationName().c_str());
+    // called in validation loop right before Validate()
+    virtual void /*IComputationNode::*/ PrintSelfBeforeValidation() const
+    {
+        fprintf(stderr, "\nValidating --> %ls = %ls", NodeName().c_str(), OperationName().c_str());
 
-            if (!IsLeaf())
-            {
-                fprintf(stderr, "(");
+        if (!IsLeaf())
+        {
+            fprintf(stderr, "(");
             for (size_t i = 0; i < GetNumInputs(); i++)
-                {
+            {
                 const auto& child = m_inputs[i];
-                    if (i > 0)
-                        fprintf(stderr, ", ");
-
-                    if (child == nullptr)
-                    {
-                        fprintf(stderr, "NULL");
-                        continue;
-                    }
+                if (i > 0)
+                    fprintf(stderr, ", ");
 
-                    const char* mbSizeMark = child->m_pMBLayout ? " x *" : "";
-                    if (child->m_sampleLayout.GetRank() == 3 && (child->m_sampleLayout[1] != 1 || child->m_sampleLayout[0] != 1)) // looks like an image: use WHC notation
-                        fprintf(stderr, "%ls[%s%s {W=%lu, H=%lu, C=%lu}]", child->NodeName().c_str(), string(child->m_sampleLayout).c_str(), mbSizeMark,
-                                child->m_sampleLayout[1], child->m_sampleLayout[2], child->m_sampleLayout[0]);
-                    // BUGBUG: This ^^ will print based on the old legacy layout, and we have no way of knowing here whether that is correct.
-                    else
-                        fprintf(stderr, "%ls[%s%s]", child->NodeName().c_str(), string(child->m_sampleLayout).c_str(), mbSizeMark);
+                if (child == nullptr)
+                {
+                    fprintf(stderr, "NULL");
+                    continue;
                 }
-                fprintf(stderr, ")");
+
+                const char* mbSizeMark = child->m_pMBLayout ? " x *" : "";
+                if (child->m_sampleLayout.GetRank() == 3 && (child->m_sampleLayout[1] != 1 || child->m_sampleLayout[0] != 1)) // looks like an image: use WHC notation
+                    fprintf(stderr, "%ls[%s%s {W=%lu, H=%lu, C=%lu}]", child->NodeName().c_str(), string(child->m_sampleLayout).c_str(), mbSizeMark,
+                            child->m_sampleLayout[1], child->m_sampleLayout[2], child->m_sampleLayout[0]);
+                // BUGBUG: This ^^ will print based on the old legacy layout, and we have no way of knowing here whether that is correct.
+                else
+                    fprintf(stderr, "%ls[%s%s]", child->NodeName().c_str(), string(child->m_sampleLayout).c_str(), mbSizeMark);
             }
+            fprintf(stderr, ")");
         }
+    }
 
     const std::wstring& NodeName() const
     {
@@ -724,251 +734,251 @@ class ComputationNodeBase : public IComputationNode,
     {
         m_outputNeededDuringBackprop = f;
     }
-        bool IsOutputNeededDuringBackprop() const 
-        {
+    bool IsOutputNeededDuringBackprop() const
+    {
         return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop;
-        }
+    }
 
     const size_t GetNumInputs() const
     {
         return m_inputs.size();
     }
 
-        virtual void SetInput(const size_t childIndex, const ComputationNodeBasePtr& node) = 0;
+    virtual void SetInput(const size_t childIndex, const ComputationNodeBasePtr& node) = 0;
 
-        // masking
-        // overridden by <ElemType> variant only
+    // masking
+    // overridden by <ElemType> variant only
     virtual void MaskMissingValueColumnsToZero(const FrameRange&) = 0;
     virtual void MaskMissingGradientColumnsToZero(const FrameRange&) = 0;
     virtual void InvalidateMissingValueColumns(const FrameRange&) = 0;
     virtual void InvalidateMissingGradientColumns(const FrameRange&) = 0;
 
-        virtual void ZeroGradientsOfInputs() = 0;
+    virtual void ZeroGradientsOfInputs() = 0;
 
     virtual void /*IComputationNode::*/ BeginForwardProp() override // called before first iteration step of ForwardProp()
-        {
+    {
 #ifdef TRACK_GAP_NANS
-            fprintf(stderr, "BeginForwardProp: %ls %ls operation\n", NodeName().c_str(), OperationName().c_str());
+        fprintf(stderr, "BeginForwardProp: %ls %ls operation\n", NodeName().c_str(), OperationName().c_str());
 #endif
-        }
+    }
     virtual void /*IComputationNode::*/ EndForwardProp() override // called after last iteration step of ForwardProp()
-        {
+    {
 #ifdef TRACK_GAP_NANS
-            fprintf(stderr, "EndForwardProp: %ls %ls operation\n", NodeName().c_str(), OperationName().c_str());
+        fprintf(stderr, "EndForwardProp: %ls %ls operation\n", NodeName().c_str(), OperationName().c_str());
 #endif
-        }
-        // TODO: the following two are not really utilized yet other than printing trace information
+    }
+    // TODO: the following two are not really utilized yet other than printing trace information
     virtual void /*IComputationNode::*/ BeginBackprop() override // called before first iteration step of ComputeGradient()
-        {
+    {
 #ifdef TRACK_GAP_NANS
-            fprintf(stderr, "BeginBackprop: %ls %ls operation\n", NodeName().c_str(), OperationName().c_str());
+        fprintf(stderr, "BeginBackprop: %ls %ls operation\n", NodeName().c_str(), OperationName().c_str());
 #endif
-        }
+    }
     virtual void /*IComputationNode::*/ EndBackprop() override // called after last iteration step of ComputeGradient()
-        {
+    {
 #ifdef TRACK_GAP_NANS
-            fprintf(stderr, "EndBackprop: %ls %ls operation\n", NodeName().c_str(), OperationName().c_str());
+        fprintf(stderr, "EndBackprop: %ls %ls operation\n", NodeName().c_str(), OperationName().c_str());
 #endif
-        }
+    }
 
-        // Is the output value of the computation node needed for computing 
-        // gradients of any of the input nodes
-        // Base-class version makes conservative assumption that it is. Override if not.
-        virtual bool OutputUsedInComputingInputNodesGradients() const
-        {
-            return true;
-        }
+    // Is the output value of the computation node needed for computing
+    // gradients of any of the input nodes
+    // Base-class version makes conservative assumption that it is. Override if not.
+    virtual bool OutputUsedInComputingInputNodesGradients() const
+    {
+        return true;
+    }
 
-        // Is the output value of the specified  input node needed for computing
-        // gradients of any of the input nodes
-        // Base-class version makes conservative assumption that it is. Override if not.
-        virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const
-        {
-            UNREFERENCED_PARAMETER(childIndex);
-            return true;
-        }
+    // Is the output value of the specified  input node needed for computing
+    // gradients of any of the input nodes
+    // Base-class version makes conservative assumption that it is. Override if not.
+    virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const
+    {
+        UNREFERENCED_PARAMETER(childIndex);
+        return true;
+    }
 
-    public:
-        virtual void ValidateInferInputDimsFrom(const TensorShape &) = 0;
+public:
+    virtual void ValidateInferInputDimsFrom(const TensorShape&) = 0;
 
-    protected:
+protected:
     const TensorShape& GetInputSampleLayout(const size_t index) const
-        {
-            return m_inputs[index]->GetSampleLayout();
-        }
+    {
+        return m_inputs[index]->GetSampleLayout();
+    }
 
-        void InferMBLayoutFromInputsForStandardCase();
+    void InferMBLayoutFromInputsForStandardCase();
 
-    public:
-        bool IsEqualTo(const ComputationNodeBasePtr& other) const //this will be used to determine whehter two nodes are the same
-        {
-            if (OperationName() != other->OperationName() || m_inputs.size() != other->m_inputs.size())
-                return false;
+public:
+    bool IsEqualTo(const ComputationNodeBasePtr& other) const //this will be used to determine whether two nodes are the same
+    {
+        if (OperationName() != other->OperationName() || m_inputs.size() != other->m_inputs.size())
+            return false;
 
         if (NodeName() == other->NodeName()) //assume names are unique in the system
-                return true;
+            return true;
 
         if (IsLeaf() && other->IsLeaf()) //since names are not equal otherwise will return above
-                return false;
+            return false;
 
         for (size_t i = 0; i < m_inputs.size(); i++)
-                if (!(m_inputs[i] == other->m_inputs[i]))
-                    return false;
+            if (!(m_inputs[i] == other->m_inputs[i]))
+                return false;
 
-            return true;
-        }
+        return true;
+    }
 
-        // determine enumeration order for everything needed to evaluate this node (and its children)
-        // This creates a list such that children are evaluated before their parents.
-        // If !forForwardProp then the order will be reversed, suitable for backprop.
-        // The 'recurrent' version is only called from FormRecurrentLoops().
-        // TODO: This should be a method of ComputationNetwork, not ComputationNode.
+    // determine enumeration order for everything needed to evaluate this node (and its children)
+    // This creates a list such that children are evaluated before their parents.
+    // If !forForwardProp then the order will be reversed, suitable for backprop.
+    // The 'recurrent' version is only called from FormRecurrentLoops().
+    // TODO: This should be a method of ComputationNetwork, not ComputationNode.
     static std::list<ComputationNodeBasePtr> EnumerateNodes(const std::vector<ComputationNodeBasePtr>& allRoots, bool skipPairNetwork = false /*legacy*/)
-        {
-            std::list<ComputationNodeBasePtr> nodes;
-            std::unordered_set<ComputationNodeBasePtr> visited;
+    {
+        std::list<ComputationNodeBasePtr> nodes;
+        std::unordered_set<ComputationNodeBasePtr> visited;
 
         for (const auto& root : allRoots)
             root->EnumerateNodesRec(visited, nodes, skipPairNetwork); // call into the recursive portion of this function below
 
-            return nodes;
-        }
+        return nodes;
+    }
 
-        // and a version that does it for only one root 'this'
+    // and a version that does it for only one root 'this'
     std::list<ComputationNodeBasePtr> EnumerateNodes(bool skipPairNetwork) /*const*/
     {
         return EnumerateNodes(std::vector<ComputationNodeBasePtr>{shared_from_this()}, skipPairNetwork);
     }
 
-    private:
-        // Recursive part of EnumerateNodes().
-        void EnumerateNodesRec(std::unordered_set<ComputationNodeBasePtr>& visited, std::list<ComputationNodeBasePtr>& result, bool skipPairNetwork) /*const*/ // const not working due to shared_from_this()
-        {
+private:
+    // Recursive part of EnumerateNodes().
+    void EnumerateNodesRec(std::unordered_set<ComputationNodeBasePtr>& visited, std::list<ComputationNodeBasePtr>& result, bool skipPairNetwork) /*const*/ // const not working due to shared_from_this()
+    {
         if (visited.find(shared_from_this()) == visited.end()) // do not include a node twice
-            {
+        {
             visited.insert(shared_from_this()); // have visited tagged here to avoid infinite loop over children, children's children, etc
 
-                // children first for function evaluation
+            // children first for function evaluation
             if (OperationName() != L"PairNetwork" || !skipPairNetwork) // (don't step through network-pair boundary if called from FormRecurrentLoops())
+            {
+                for (int i = 0; i < m_inputs.size(); i++)
                 {
-                    for (int i = 0; i < m_inputs.size(); i++)
-                    {
-                        if (m_inputs[i])
-                            m_inputs[i]->EnumerateNodesRec(visited, result, skipPairNetwork);
-                    }
+                    if (m_inputs[i])
+                        m_inputs[i]->EnumerateNodesRec(visited, result, skipPairNetwork);
                 }
-
-                // now that all children are in list before us, put ourselves
-                result.push_back(shared_from_this());
             }
+
+            // now that all children are in list before us, put ourselves
+            result.push_back(shared_from_this());
         }
+    }
 
 public:
-        // check whether a node is up-to-date w.r.t. its children, for lazy evaluation
-        // If this returns false, node must be evaluated to update m_value.
-        // BUGBUG: The function name is incorrect. It also returns 'true' if a child has the same time stamp (not older).
-        // This is virtual because it is overridden by traversal nodes.
-        virtual bool IsOutputOlderThanInputs() const
+    // check whether a node is up-to-date w.r.t. its children, for lazy evaluation
+    // If this returns false, node must be evaluated to update m_value.
+    // BUGBUG: The function name is incorrect. It also returns 'true' if a child has the same time stamp (not older).
+    // This is virtual because it is overridden by traversal nodes.
+    virtual bool IsOutputOlderThanInputs() const
+    {
+        // TODO: use range-based for
+        for (size_t i = 0; i < GetNumInputs(); i++)
         {
-            // TODO: use range-based for
-            for (size_t i = 0; i < GetNumInputs(); i++)
-            {
-                if (IsOlderThan(*m_inputs[i]))
-                    return true;
-            }
-
-            return false;
+            if (IsOlderThan(*m_inputs[i]))
+                return true;
         }
 
-        typedef std::pair<ComputationNodeBasePtr, ComputationNodeBasePtr> ComputationArc;
-        // [1/13/2015 erw] add to enumerate all the edges 
-        // enumerate arcs that can be reached starting from the current node's children
-        // [in/out] visited record already visited nodes 
-        // TODO: This should be a method of ComputationNetwork, not ComputationNode.
-        void EnumerateArcs(std::unordered_set<ComputationNodeBasePtr>& visited, std::list<ComputationArc>& arcs)
-        {
+        return false;
+    }
+
+    typedef std::pair<ComputationNodeBasePtr, ComputationNodeBasePtr> ComputationArc;
+    // [1/13/2015 erw] add to enumerate all the edges
+    // enumerate arcs that can be reached starting from the current node's children
+    // [in/out] visited record already visited nodes
+    // TODO: This should be a method of ComputationNetwork, not ComputationNode.
+    void EnumerateArcs(std::unordered_set<ComputationNodeBasePtr>& visited, std::list<ComputationArc>& arcs)
+    {
         std::list<ComputationNodeBasePtr> tovisit;
 
-            if (visited.find(shared_from_this()) == visited.end()) // only do when this node has not been visited before
+        if (visited.find(shared_from_this()) == visited.end()) // only do when this node has not been visited before
+        {
+            tovisit.push_back(shared_from_this());
+
+            while (!tovisit.empty())
             {
-                tovisit.push_back(shared_from_this());
+                ComputationNodeBasePtr curNode = tovisit.front();
+                tovisit.pop_front();
 
-                while (!tovisit.empty())
+                if (visited.find(curNode) == visited.end())
                 {
-                    ComputationNodeBasePtr curNode = tovisit.front();
-                    tovisit.pop_front();
-
-                    if (visited.find(curNode) == visited.end())
+                    for (size_t i = 0; i < curNode->m_inputs.size(); i++)
                     {
-                        for (size_t i = 0; i < curNode->m_inputs.size(); i++)
-                        {
-                            arcs.push_back(ComputationArc(curNode, curNode->m_inputs[i]));
+                        arcs.push_back(ComputationArc(curNode, curNode->m_inputs[i]));
 
-                            if (visited.find(curNode->m_inputs[i]) == visited.end()) // this children has not been visited before 
+                        if (visited.find(curNode->m_inputs[i]) == visited.end()) // this child has not been visited before
                             tovisit.push_front(curNode->m_inputs[i]);            // going to visit each of the children
-                        }
-                        visited.insert(curNode);
                     }
+                    visited.insert(curNode);
                 }
             }
         }
+    }
 
-        std::wstring CreateUniqNodeName() const
-        {
+    std::wstring CreateUniqNodeName() const
+    {
 #ifdef USE_GUID_AS_NAME
-            UUID uuid;
-            ZeroMemory(&uuid, sizeof(UUID));
-            std::wstring name;
+        UUID uuid;
+        ZeroMemory(&uuid, sizeof(UUID));
+        std::wstring name;
 
-            UuidCreate(&uuid);
-            WCHAR* szUuid = nullptr;
+        UuidCreate(&uuid);
+        WCHAR* szUuid = nullptr;
         if (UuidToStringW(&uuid, (RPC_WSTR*) &szUuid) != RPC_S_OK)
-                RuntimeError("Failed to craete unique node name.");
-            else
-            {
-                name = szUuid;
+            RuntimeError("Failed to craete unique node name.");
+        else
+        {
+            name = szUuid;
             RpcStringFreeW((RPC_WSTR*) &szUuid);
-            }
+        }
 #else
-            int64_t id = CreateUniqId();
-            std::wstring base = L"AutoName";
-            std::wstringstream sstm;
-            sstm << base.c_str() << id;
-            std::wstring name = sstm.str();
-            //msra::strfun::wstrprintf name(L"%s%d", L"AutoName", id);
+        int64_t id = CreateUniqId();
+        std::wstring base = L"AutoName";
+        std::wstringstream sstm;
+        sstm << base.c_str() << id;
+        std::wstring name = sstm.str();
+//msra::strfun::wstrprintf name(L"%s%d", L"AutoName", id);
 #endif
 
-            return name;
-        }
+        return name;
+    }
 
-    protected:
+protected:
     DEVICEID_TYPE m_deviceId; // CPU=-1, >=0 GPU
-        std::wstring m_nodeName;
+    std::wstring m_nodeName;
 
-        // inputs
-        std::vector<ComputationNodeBasePtr> m_inputs;
+    // inputs
+    std::vector<ComputationNodeBasePtr> m_inputs;
 
-        // dimensions and layout
-        // Data is stored as a Matrix object, but often it is interpreted as a tensor.
-        // For nodes that carry data (samples), each sample is a column of the matrix, which is interpreted as
-        // a tensor (n-dimensional array) described by m_sampleLayout. The MBLayout describes the meaning
-        // of the column index.
-        // For nodes that do not carry data, the last tensor index of m_sampleLayout is the number of columns.
-        TensorShape m_sampleLayout; // sample layout
-        MBLayoutPtr m_pMBLayout;
+    // dimensions and layout
+    // Data is stored as a Matrix object, but often it is interpreted as a tensor.
+    // For nodes that carry data (samples), each sample is a column of the matrix, which is interpreted as
+    // a tensor (n-dimensional array) described by m_sampleLayout. The MBLayout describes the meaning
+    // of the column index.
+    // For nodes that do not carry data, the last tensor index of m_sampleLayout is the number of columns.
+    TensorShape m_sampleLayout; // sample layout
+    MBLayoutPtr m_pMBLayout;
 
-        // flags related to gradient propagation
+    // flags related to gradient propagation
     bool m_parameterUpdateRequired;    // update parameters? Only used for LearnableParameters.    --TODO: Should we make this a member of LearnableParameters actually? And require a type cast? Currently it is read out for all leaves.
     bool m_gradientInitialized;        // indicates whether the gradient matrix has been resized and initialized to 0
     bool m_outputNeededDuringBackprop; // indicates whether the output value of the node is needed during backprop
-    };
-    typedef ComputationNodeBase::ComputationNodeBasePtr ComputationNodeBasePtr;
+};
+typedef ComputationNodeBase::ComputationNodeBasePtr ComputationNodeBasePtr;
 
-    // =======================================================================
-    // ComputationNode -- abstract base class for computation nodes, deriving from CompuationNodeBase, parameterized by float vs. double
-    // =======================================================================
+// =======================================================================
+// ComputationNode -- abstract base class for computation nodes, deriving from ComputationNodeBase, parameterized by float vs. double
+// =======================================================================
 
-    // little helper class to allow derived Node classes to specify how many inputs they expect
+// little helper class to allow derived Node classes to specify how many inputs they expect
 struct INumInputs
 {
     virtual size_t GetExpectedNumInputs() const = 0;
@@ -983,29 +993,29 @@ struct NumInputs : public INumInputs
 }; // e.g. derive from NumInputs<2>
 
 template <class ElemType>
-    class ComputationNode : public ComputationNodeBase // abstract class that cannot be instantiated
-    {
-        typedef ComputationNodeBase Base;
+class ComputationNode : public ComputationNodeBase // abstract class that cannot be instantiated
+{
+    typedef ComputationNodeBase Base;
 
-    protected:
-        //std containers such as list and map does not support class reference so we need to use pointer
-        typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;
+protected:
+    //std containers such as list and map do not support class reference so we need to use pointer
+    typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;
 
-    public:
+public:
     using ComputationNodeBase::AttachInputs; // import the convenience functions that take 1..6 parameters
-        using ComputationNodeBase::SetDims;
-        typedef ElemType OurElemType;
+    using ComputationNodeBase::SetDims;
+    typedef ElemType OurElemType;
 
-        // public constructor
-        // Note: use the New<> helper function that is declared next, which gives you the convenience of returning a shared_ptr
+    // public constructor
+    // Note: use the New<> helper function that is declared next, which gives you the convenience of returning a shared_ptr
     ComputationNode(DEVICEID_TYPE deviceId, const wstring& name)
         : ComputationNodeBase(deviceId, name)
     {
     }
 
-        // creation from configuration
-        // Nodes with NumInputs<> should say DeclareConstructorFromConfigWithNumInputs(ClassName), and nodes without DeclareConstructorFromConfig(ClassName).
-        // The macro will forward to the regular constructor of the node (which may do more than just calling the base constructor), and then attach the inputs from config.
+// creation from configuration
+// Nodes with NumInputs<> should say DeclareConstructorFromConfigWithNumInputs(ClassName), and nodes without DeclareConstructorFromConfig(ClassName).
+// The macro will forward to the regular constructor of the node (which may do more than just calling the base constructor), and then attach the inputs from config.
 #define DeclareConstructorFromConfig(C)                  \
     C(const ScriptableObjects::IConfigRecordPtr configp) \
         : C(configp->Get(L"deviceId"), L"<placeholder>") \
@@ -1019,259 +1029,261 @@ template <class ElemType>
         AttachInputs(configp, this->GetExpectedNumInputs()); \
     }
 
-        // helper to load m_value from a stream
-        // This function updates the dimensions to a 2D matrix.
-        // If a different tensor layout is associated with this, it must be implanted afterwards.
-        // Nodes that call this never have an MB layout.
-        void LoadValue(File& fstream)
-        {
-            CreateMatrixIfNull(m_value);
-            fstream >> Value();
-            // above reads dimensions, so we must update our own dimensions
-            SetDims(TensorShape(Value().GetNumRows(), Value().GetNumCols()), false);
-        }
+    // helper to load m_value from a stream
+    // This function updates the dimensions to a 2D matrix.
+    // If a different tensor layout is associated with this, it must be implanted afterwards.
+    // Nodes that call this never have an MB layout.
+    void LoadValue(File& fstream)
+    {
+        CreateMatrixIfNull(m_value);
+        fstream >> Value();
+        // above reads dimensions, so we must update our own dimensions
+        SetDims(TensorShape(Value().GetNumRows(), Value().GetNumCols()), false);
+    }
 
-        // reader updated m_functionValue and MBLayout--ensure our internal state is consistent
-        virtual void NotifyFunctionValuesMBSizeModified() override final
-        {
-            if (!HasMBLayout())
-                LogicError("NotifyFunctionValuesMBSizeModified: Must only be called on nodes with MBLayout.");
-            if (GetSampleMatrixNumRows() != Value().GetNumRows())
-                LogicError("NotifyFunctionValuesMBSizeModified: %ls %ls operation had its row dimension %d changed by the reader to %d.", NodeName().c_str(), OperationName().c_str(), (int)GetSampleMatrixNumRows(), (int)Value().GetNumRows());
-            if (GetMBLayout()->GetNumCols() != Value().GetNumCols())
-                LogicError("NotifyFunctionValuesMBSizeModified: %ls %ls operation had its col dimension %d changed by the reader to %d, but different from MBLayout.", NodeName().c_str(), OperationName().c_str(), (int)GetMBLayout()->GetNumCols(), (int)Value().GetNumCols());
-        }
-        virtual double Get00Element() const override final
-        {
-            // TODO: Are all these meant to read out a scalar? Then rename and verify dimensions.
-            return Value().Get00Element();
-        }
+    // reader updated m_value and MBLayout--ensure our internal state is consistent
+    virtual void NotifyFunctionValuesMBSizeModified() override final
+    {
+        if (!HasMBLayout())
+            LogicError("NotifyFunctionValuesMBSizeModified: Must only be called on nodes with MBLayout.");
+        if (GetSampleMatrixNumRows() != Value().GetNumRows())
+            LogicError("NotifyFunctionValuesMBSizeModified: %ls %ls operation had its row dimension %d changed by the reader to %d.", NodeName().c_str(), OperationName().c_str(), (int) GetSampleMatrixNumRows(), (int) Value().GetNumRows());
+        if (GetMBLayout()->GetNumCols() != Value().GetNumCols())
+            LogicError("NotifyFunctionValuesMBSizeModified: %ls %ls operation had its col dimension %d changed by the reader to %d, but different from MBLayout.", NodeName().c_str(), OperationName().c_str(), (int) GetMBLayout()->GetNumCols(), (int) Value().GetNumCols());
+    }
+    virtual double Get00Element() const override final
+    {
+        // TODO: Are all these meant to read out a scalar? Then rename and verify dimensions.
+        return Value().Get00Element();
+    }
 
-        // recover a shared_ptr from ourselves if given a naked pointer
-        ComputationNodePtr shared_from_this()
-        {
-            return dynamic_pointer_cast<ComputationNode<ElemType>>(ComputationNodeBase::shared_from_this());
-        }
+    // recover a shared_ptr from ourselves if given a naked pointer
+    ComputationNodePtr shared_from_this()
+    {
+        return dynamic_pointer_cast<ComputationNode<ElemType>>(ComputationNodeBase::shared_from_this());
+    }
 
-        // recover a ComputationNodePtr (which is a shared_ptr) from a naked pointer to our base type (ComputationNodeBase) stored as a void* (old NDL parser does that)
+    // recover a ComputationNodePtr (which is a shared_ptr) from a naked pointer to our base type (ComputationNodeBase) stored as a void* (old NDL parser does that)
     static ComputationNodePtr FromVoidPtr(void* vp)
-        {
+    {
         auto p = dynamic_cast<ComputationNode<ElemType>*>((ComputationNodeBase*) vp); // TODO: check that all void* casts really come from ComputationNodeBasePtr; or add a method ToVoidPtr(). Or get rid of the void*?!
-            return p->shared_from_this();
-        }
+        return p->shared_from_this();
+    }
 
-        // AttachInputs() -- attach the inputs of a node
-        // This verifies the number of inputs. For that, nodes with fixed number of inputs derive from NumInputs<N>.
-        // This function discovers this through RTTI and performs a runtime check. Nodes should not have additional checks in their implementation (save the code).
-        // Note: Nodes with variable number of inputs will not derive from NumInputs<>, but instead check their inputs in Validate().
-        void AttachInputs(const std::vector<ComputationNodeBasePtr>& inputs)
-        {
+    // AttachInputs() -- attach the inputs of a node
+    // This verifies the number of inputs. For that, nodes with fixed number of inputs derive from NumInputs<N>.
+    // This function discovers this through RTTI and performs a runtime check. Nodes should not have additional checks in their implementation (save the code).
+    // Note: Nodes with variable number of inputs will not derive from NumInputs<>, but instead check their inputs in Validate().
+    void AttachInputs(const std::vector<ComputationNodeBasePtr>& inputs)
+    {
 #ifdef _DEBUG
         wstring name = NodeName();
         name; // (for easier debugging)
 #endif
         const auto* pNumInputs = dynamic_cast<INumInputs*>(this); // if this class also derives from NumInputs<N> then N is the expected number of inputs
-            if (pNumInputs && pNumInputs->GetExpectedNumInputs() != inputs.size())
+        if (pNumInputs && pNumInputs->GetExpectedNumInputs() != inputs.size())
             RuntimeError("%ls operation '%ls' expects %d inputs (given: %d)", OperationName().c_str(), NodeName().c_str(), (int) pNumInputs->GetExpectedNumInputs(), (int) inputs.size());
-            m_inputs.resize(inputs.size());
-            for (size_t i = 0; i < m_inputs.size(); i++)
-                if (inputs[i])
+        m_inputs.resize(inputs.size());
+        for (size_t i = 0; i < m_inputs.size(); i++)
+            if (inputs[i])
                 m_inputs[i] = UpCast(inputs[i]); // (UpCast() checks the type; the assignment then downcasts it again)
-                else
+            else
                 m_inputs[i] = nullptr; // during network creation, nullpts are possible
-        }
+    }
 
-    protected:
-        // AttachInputs() from config
-        void AttachInputs(const ScriptableObjects::IConfigRecordPtr configp, size_t expectedNumInputs = SIZE_MAX)
+protected:
+    // AttachInputs() from config
+    void AttachInputs(const ScriptableObjects::IConfigRecordPtr configp, size_t expectedNumInputs = SIZE_MAX)
+    {
+        const auto inputs = GetInputsFromConfig(configp);
+        if (expectedNumInputs != SIZE_MAX)
         {
-            const auto inputs = GetInputsFromConfig(configp);
-            if (expectedNumInputs != SIZE_MAX)
+            if (inputs.size() != expectedNumInputs)
             {
-                if (inputs.size() != expectedNumInputs)
-                {
-                    // print an error. For that, find at least one argument
+                // print an error. For that, find at least one argument
                 auto* val = configp->Find(L"inputs");
                 if (!val) // if there is no 'inputs' then get the first item of this config record for a Fail() function
-                    {
-                        auto members = configp->GetMemberIds();
-                        if (members.size() > 0)
-                            val = configp->Find(members.front());
-                    }
-                    if (val)
+                {
+                    auto members = configp->GetMemberIds();
+                    if (members.size() > 0)
+                        val = configp->Find(members.front());
+                }
+                if (val)
                     val->Fail(msra::strfun::wstrprintf(L"Expected %d inputs, but %d were given.", (int) expectedNumInputs, (int) inputs.size()));
-                    else
+                else
                     InvalidArgument("Expected %d inputs, but %d were given.", (int) expectedNumInputs, (int) inputs.size());
-                }
             }
-            AttachInputs(inputs);
         }
+        AttachInputs(inputs);
+    }
 
-    public:
-        //request matrices needed to do node function value evaluation
-        virtual void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool)
-        {
-            RequestMatrixFromPool(m_value, matrixPool);
-        }
+public:
+    //request matrices needed to do node function value evaluation
+    virtual void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool)
+    {
+        RequestMatrixFromPool(m_value, matrixPool);
+    }
 
-        //release temp matrices that are only used by forward computation
-        //don't release matrices that need to be used in the gradient computation
-        virtual void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool)
-        {
-            if (!IsOutputNeededDuringBackprop() && (m_value->GetMatrixType() != SPARSE) && isValueSharable())
-                ReleaseMatrixToPool(m_value, matrixPool);
-        }
+    //release temp matrices that are only used by forward computation
+    //don't release matrices that need to be used in the gradient computation
+    virtual void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool)
+    {
+        if (!IsOutputNeededDuringBackprop() && (m_value->GetMatrixType() != SPARSE) && isValueSharable())
+            ReleaseMatrixToPool(m_value, matrixPool);
+    }
 
-        virtual void AllocateGradientMatricesForInputs(MatrixPool& matrixPool) override
+    virtual void AllocateGradientMatricesForInputs(MatrixPool& matrixPool) override
+    {
+        for (int i = 0; i < m_inputs.size(); i++)
         {
-            for (int i = 0; i < m_inputs.size(); i++)
-            {
-                if (m_inputs[i]->NeedGradient())
-                    m_inputs[i]->RequestMatricesBeforeBackprop(matrixPool);
-            }
+            if (m_inputs[i]->NeedGradient())
+                m_inputs[i]->RequestMatricesBeforeBackprop(matrixPool);
         }
+    }
 
-        //request matrices that are needed for gradient computation
-        virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
-        {
-            RequestMatrixFromPool(m_gradient, matrixPool);
-        }
+    //request matrices that are needed for gradient computation
+    virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
+    {
+        RequestMatrixFromPool(m_gradient, matrixPool);
+    }
 
-        //release gradient and temp matrices that no longer needed after all the children's gradients are computed.
-        virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
+    //release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
+    virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
+    {
+        if (!IsLeaf() && !RequiresPreCompute())
         {
-            if (!IsLeaf() && !RequiresPreCompute())
-            {
             if (m_gradient != nullptr && m_gradient->GetMatrixType() != SPARSE) //since we don't have a sparse pool yet
-                    ReleaseMatrixToPool(m_gradient, matrixPool);
+                ReleaseMatrixToPool(m_gradient, matrixPool);
 
-                // Release the Value matrix only if the output value is needed during backprop
-                // since in the case it isn't used, we release it during forward prop itself
-                if (IsOutputNeededDuringBackprop() && m_value->GetMatrixType() != SPARSE && isValueSharable())
-                    ReleaseMatrixToPool(m_value, matrixPool);
-            }
+            // Release the Value matrix only if the output value is needed during backprop
+            // since in the case it isn't used, we release it during forward prop itself
+            if (IsOutputNeededDuringBackprop() && m_value->GetMatrixType() != SPARSE && isValueSharable())
+                ReleaseMatrixToPool(m_value, matrixPool);
         }
+    }
 
-        virtual void DumpNodeInfo(const bool /*printValues*/, File& fstream) const;
+    virtual void DumpNodeInfo(const bool /*printValues*/, File& fstream) const;
 
-        // TODO: similar to DumpInfo; used by ExperimentalNetworkBuilder test implementation
-        /*HasToString::*/ wstring ToString() const
+    // TODO: similar to DumpInfo; used by ExperimentalNetworkBuilder test implementation
+    /*HasToString::*/ wstring ToString() const
+    {
+        // we format it like "name : type rows x cols ( args )"
+        wstring result = /*TidyName*/ (NodeName()) + L" : " + OperationName();
+        result.append(msra::strfun::wstrprintf(L" [%s%s]", string(GetSampleLayout()).c_str(), HasMBLayout() ? " x *" : ""));
+        if (m_inputs.empty())
+            result.append(L" ()");
+        else
         {
-            // we format it like "name : type rows x cols ( args )"
-            wstring result = /*TidyName*/ (NodeName()) + L" : " + OperationName();
-            result.append(msra::strfun::wstrprintf(L" [%s%s]", string(GetSampleLayout()).c_str(), HasMBLayout() ? " x *" : ""));
-            if (m_inputs.empty())
-                result.append(L" ()");
-            else
+            wstring args;
+            bool first = true;
+            for (auto& child : m_inputs)
             {
-                wstring args;
-                bool first = true;
-                for (auto& child : m_inputs)
-                {
-                    if (first)
-                        first = false;
-                    else
-                        args.append(L"\n");
-                    args.append(/*TidyName*/ (child->NodeName()));
-                }
-                result += L" " + NestString(args, L'(', true, ')');
+                if (first)
+                    first = false;
+                else
+                    args.append(L"\n");
+                args.append(/*TidyName*/ (child->NodeName()));
             }
-            return result;
+            result += L" " + NestString(args, L'(', true, ')');
         }
+        return result;
+    }
 
-        // update temporary variables of a node to match MBLayout
-        virtual void UpdateFunctionMBSize() override { }
+    // update temporary variables of a node to match MBLayout
+    virtual void UpdateFunctionMBSize() override
+    {
+    }
 
-        void ValidateInferInputDimsFrom(const TensorShape & otherShape);
+    void ValidateInferInputDimsFrom(const TensorShape& otherShape);
 
-    public:
+public:
     static void MaskMissingColumnsToZero(Matrix<ElemType>& matrixToBeMasked, const MBLayoutPtr& pMBLayout, const FrameRange& fr)
-        {
-            //fprintf(stderr, "masking column range %d\n", (int)fr.timeIdxInSeq);
+    {
+        //fprintf(stderr, "masking column range %d\n", (int)fr.timeIdxInSeq);
         MaskMissingColumnsTo(matrixToBeMasked, pMBLayout, fr, (ElemType) 0);
-        }
+    }
 
     void /*ComputationNodeBase::*/ MaskMissingValueColumnsToZero(const FrameRange& fr) override final
-        {
-            //fprintf(stderr, "%ls %ls m_value ", NodeName().c_str(), OperationName().c_str());
-            MaskMissingColumnsToZero(*m_value, m_pMBLayout, fr);
-        }
+    {
+        //fprintf(stderr, "%ls %ls m_value ", NodeName().c_str(), OperationName().c_str());
+        MaskMissingColumnsToZero(*m_value, m_pMBLayout, fr);
+    }
     void /*ComputationNodeBase::*/ MaskMissingGradientColumnsToZero(const FrameRange& fr) override final
-        {
-            //fprintf(stderr, "%ls %ls m_gradient ", NodeName().c_str(), OperationName().c_str());
-            MaskMissingColumnsToZero(*m_gradient, m_pMBLayout, fr);
-        }
+    {
+        //fprintf(stderr, "%ls %ls m_gradient ", NodeName().c_str(), OperationName().c_str());
+        MaskMissingColumnsToZero(*m_gradient, m_pMBLayout, fr);
+    }
 
-        // for debugging, set the gaps to NaN instead (to track whether it bubbles up somewhere)
+    // for debugging, set the gaps to NaN instead (to track whether it bubbles up somewhere)
     void InvalidateMissingValueColumns(const FrameRange& fr) override final
-        {
-            //fprintf(stderr, "invalidating %ls %ls m_value column range %d\n", NodeName().c_str(), OperationName().c_str(), (int)fr.timeIdxInSeq);
-            MaskMissingColumnsTo(*m_value, m_pMBLayout, fr, Matrix<ElemType>::MakeNan(__LINE__));
-        }
+    {
+        //fprintf(stderr, "invalidating %ls %ls m_value column range %d\n", NodeName().c_str(), OperationName().c_str(), (int)fr.timeIdxInSeq);
+        MaskMissingColumnsTo(*m_value, m_pMBLayout, fr, Matrix<ElemType>::MakeNan(__LINE__));
+    }
     void InvalidateMissingGradientColumns(const FrameRange& fr) override final
-        {
-            //fprintf(stderr, "invalidating %ls %ls m_gradient column range %d\n", NodeName().c_str(), OperationName().c_str(), (int)fr.timeIdxInSeq);
-            MaskMissingColumnsTo(*m_gradient, m_pMBLayout, fr, Matrix<ElemType>::MakeNan(__LINE__));
-        }
+    {
+        //fprintf(stderr, "invalidating %ls %ls m_gradient column range %d\n", NodeName().c_str(), OperationName().c_str(), (int)fr.timeIdxInSeq);
+        MaskMissingColumnsTo(*m_gradient, m_pMBLayout, fr, Matrix<ElemType>::MakeNan(__LINE__));
+    }
 
-        // for debugging purposes
+    // for debugging purposes
     void /*ComputationNodeBase::*/ PrintSelf(bool printMatrices = false) const
-        {
-            fprintf(stderr, "\n%ls[%s%s] = %ls", NodeName().c_str(), string(GetSampleLayout()).c_str(), HasMBLayout() ? " x *" : "", OperationName().c_str());
+    {
+        fprintf(stderr, "\n%ls[%s%s] = %ls", NodeName().c_str(), string(GetSampleLayout()).c_str(), HasMBLayout() ? " x *" : "", OperationName().c_str());
 
-            if (!IsLeaf())
-            {
-                fprintf(stderr, "(");           
+        if (!IsLeaf())
+        {
+            fprintf(stderr, "(");
             for (size_t i = 0; i < GetNumInputs(); i++)
-                {
-                    if (i > 0)
-                        fprintf(stderr, ", ");           
-                    fprintf(stderr, "%ls[%s%s] = %ls", m_inputs[i] ? m_inputs[i]->NodeName().c_str() : L"NULL", string(m_inputs[i]->GetSampleLayout()).c_str(), m_inputs[i]->HasMBLayout() ? " x *" : "", OperationName().c_str());
-                }
-                fprintf(stderr, ")");           
-            }
-
-            if (printMatrices)
             {
-            fprintf(stderr, "\n    $$$$ Function Values\n");
-                Value().Print("FunctionValue");
-
-            fprintf(stderr, "\n    $$$$ Gradient Values\n");
-                Gradient().Print("GradientValue");
+                if (i > 0)
+                    fprintf(stderr, ", ");
+                fprintf(stderr, "%ls[%s%s] = %ls", m_inputs[i] ? m_inputs[i]->NodeName().c_str() : L"NULL", string(m_inputs[i]->GetSampleLayout()).c_str(), m_inputs[i]->HasMBLayout() ? " x *" : "", OperationName().c_str());
             }
+            fprintf(stderr, ")");
         }
 
-        // up-cast to make life easier
-        static ComputationNodePtr UpCast(ComputationNodeBasePtr inode)
+        if (printMatrices)
         {
-            ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(inode);
-            if (!node)
-                InvalidArgument("an ComputationNodeBasePtr of mismatching precision was passed");
-            return node;
+            fprintf(stderr, "\n    $$$$ Function Values\n");
+            Value().Print("FunctionValue");
+
+            fprintf(stderr, "\n    $$$$ Gradient Values\n");
+            Gradient().Print("GradientValue");
         }
+    }
 
-        inline ComputationNodePtr Input(const size_t inputIndex) const
-        {
-            if (inputIndex >= m_inputs.size())
+    // up-cast to make life easier
+    static ComputationNodePtr UpCast(ComputationNodeBasePtr inode)
+    {
+        ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(inode);
+        if (!node)
+            InvalidArgument("an ComputationNodeBasePtr of mismatching precision was passed");
+        return node;
+    }
+
+    inline ComputationNodePtr Input(const size_t inputIndex) const
+    {
+        if (inputIndex >= m_inputs.size())
             LogicError("Inputs: inputIndex %d is out of range for %ls %ls operation.", (int) inputIndex, NodeName().c_str(), OperationName().c_str());
-            return UpCast(m_inputs[inputIndex]);
-        }
+        return UpCast(m_inputs[inputIndex]);
+    }
 
     void /*ComputationNodeBase::*/ SetInput(const size_t childIndex, const ComputationNodeBasePtr& inode) override
-        {
-            const ComputationNodePtr node = UpCast(inode);
+    {
+        const ComputationNodePtr node = UpCast(inode);
 
-            //require first nodes specified before the second to avoid null nodes condition.
-            if (childIndex > m_inputs.size())
-                InvalidArgument("SetInput: You must specify the input for children with index less than this one first.");
+        //require inputs with lower indices to be specified before this one, to avoid leaving null entries in between.
+        if (childIndex > m_inputs.size())
+            InvalidArgument("SetInput: You must specify the input for children with index less than this one first.");
 
-            // expand the inputs to exist up to the desired index
-            while (childIndex >= m_inputs.size())
-                m_inputs.push_back(nullptr);
+        // expand the inputs to exist up to the desired index
+        while (childIndex >= m_inputs.size())
+            m_inputs.push_back(nullptr);
 
-            // set the input value
-            m_inputs[childIndex] = node;
-        }
+        // set the input value
+        m_inputs[childIndex] = node;
+    }
 
     const Matrix<ElemType>& Value() const
     {
@@ -1290,10 +1302,11 @@ template <class ElemType>
     {
         return *m_gradient;
     }
+
 private:
     // map a tensor to a matrix
     // The leading dimension maps to rows, the rest to columns, for compat with sparse matrix lib.
-    Matrix<ElemType> & TensorAsMatrix(Matrix<ElemType> & data)
+    Matrix<ElemType>& TensorAsMatrix(Matrix<ElemType>& data)
     {
         size_t numRows = GetAsMatrixNumRows();
         size_t numCols = GetAsMatrixNumCols();
@@ -1301,18 +1314,19 @@ template <class ElemType>
         data.VerifySize(numRows, numCols);
         return data;
     }
+
 public:
-    Matrix<ElemType> & ValueAsMatrix()
+    Matrix<ElemType>& ValueAsMatrix()
     {
         return TensorAsMatrix(*m_value);
     }
-    Matrix<ElemType> & GradientAsMatrix()
+    Matrix<ElemType>& GradientAsMatrix()
     {
         return TensorAsMatrix(*m_gradient);
     }
 
-    public:
-#if 0   // only used for old implementation of PlusNode
+public:
+#if 0 // only used for old implementation of PlusNode
         // Function to return the number of columns for whole batch or single frame
     size_t GetNumColsFor(const FrameRange& fr /*select frame or entire batch*/)
         {
@@ -1327,138 +1341,135 @@ template <class ElemType>
         }
 #endif
 
-        // function to access any input and output, value and gradient, whole batch or single frame
-        // Note: This returns a reference into 'data' in the form of a column slice, i.e. a small matrix object that just points into 'data'.
+    // function to access any input and output, value and gradient, whole batch or single frame
+    // Note: This returns a reference into 'data' in the form of a column slice, i.e. a small matrix object that just points into 'data'.
     Matrix<ElemType> DataFor(Matrix<ElemType>& data, const FrameRange& fr /*select frame or entire batch*/)
+    {
+        try
         {
-            try
-            {
-                return DataWithMBLayoutFor(data, fr, m_pMBLayout);
-            }
+            return DataWithMBLayoutFor(data, fr, m_pMBLayout);
+        }
         catch (const logic_error& e) // catch the error and rethrow it with the node name attached
-            {
-                LogicError("%s, for %ls %ls operation.", e.what(), NodeName().c_str(), OperationName().c_str());
-            }
+        {
+            LogicError("%s, for %ls %ls operation.", e.what(), NodeName().c_str(), OperationName().c_str());
         }
+    }
 
     Matrix<ElemType> ValueFor(const FrameRange& fr /*select frame or entire batch*/)
-        {
-            return DataFor(Value(), fr);
-        }
+    {
+        return DataFor(Value(), fr);
+    }
     Matrix<ElemType> GradientFor(const FrameRange& fr /*select frame or entire batch*/)
-        {
-            return DataFor(Gradient(), fr);
-        }
-        // use the following two versions if you assume the inputs may contain gaps that must be set to zero because you want to reduce over frames with a BLAS operation
+    {
+        return DataFor(Gradient(), fr);
+    }
+    // use the following two versions if you assume the inputs may contain gaps that must be set to zero because you want to reduce over frames with a BLAS operation
     Matrix<ElemType> MaskedValueFor(const FrameRange& fr /*select frame or entire batch*/)
-        {
-            MaskMissingValueColumnsToZero(fr);
-            return ValueFor(fr);
-        }
+    {
+        MaskMissingValueColumnsToZero(fr);
+        return ValueFor(fr);
+    }
     Matrix<ElemType> MaskedGradientFor(const FrameRange& fr /*select frame or entire batch*/)
-        {
-            MaskMissingGradientColumnsToZero(fr);
-            return GradientFor(fr);
-        }
-        // tensor version of the above functions
+    {
+        MaskMissingGradientColumnsToZero(fr);
+        return GradientFor(fr);
+    }
+    // tensor version of the above functions
     TensorView<ElemType> DataTensorFor(Matrix<ElemType>& data, size_t rank, const FrameRange& fr)
+    {
+        try
         {
-            try
-            {
-                return TensorView<ElemType>(data, GetTensorSliceFor(rank, fr));
-            }
-        catch (const logic_error& e) // catch the error and rethrow it with the node name attached
-            {
-                LogicError("%s, for %ls %ls operation.", e.what(), NodeName().c_str(), OperationName().c_str());
-            }
+            return TensorView<ElemType>(data, GetTensorSliceFor(rank, fr));
         }
-    TensorView<ElemType> ValueTensorFor(size_t rank, const FrameRange& fr)
+        catch (const logic_error& e) // catch the error and rethrow it with the node name attached
         {
-            return DataTensorFor(Value(), rank, fr);
+            LogicError("%s, for %ls %ls operation.", e.what(), NodeName().c_str(), OperationName().c_str());
         }
+    }
+    TensorView<ElemType> ValueTensorFor(size_t rank, const FrameRange& fr)
+    {
+        return DataTensorFor(Value(), rank, fr);
+    }
     TensorView<ElemType> GradientTensorFor(size_t rank, const FrameRange& fr)
-        {
-            return DataTensorFor(Gradient(), rank, fr);
-        }
-
-    private:
-
-        // determine the size that we should set our Matrix storage to
-        void DetermineDataSize(size_t & rows, size_t & cols) const
-        {
-            if (HasMBLayout())
-            {
-                rows = GetSampleMatrixNumRows();
-                cols = GetSampleMatrixNumCols();
-            }
-            else
-            {
-                const auto & shape = GetSampleLayout();
-                rows = shape.GetRank() > 0 ? shape[0] : 0;
-                cols = rows > 0 ? shape.GetNumElements() / rows : 0;
-            }
-        }
-
-    protected:
+    {
+        return DataTensorFor(Gradient(), rank, fr);
+    }
 
-        // set the size of the underlying Matrix object to match node dimensions
-        void UpdateDataSize(Matrix<ElemType>& m)
+private:
+    // determine the size that we should set our Matrix storage to
+    void DetermineDataSize(size_t& rows, size_t& cols) const
+    {
+        if (HasMBLayout())
         {
-            size_t rows, cols;
-            DetermineDataSize(rows, cols);
-            m.Resize(rows, cols);
+            rows = GetSampleMatrixNumRows();
+            cols = GetSampleMatrixNumCols();
         }
-        // and verify the condition that UpdateDataSize() creates (used for sanity checking after loading parameters)
-        void VerifyDataSize(Matrix<ElemType>& m)
+        else
         {
-            size_t rows, cols;
-            DetermineDataSize(rows, cols);
-            m.VerifySize(rows, cols);
+            const auto& shape = GetSampleLayout();
+            rows = shape.GetRank() > 0 ? shape[0] : 0;
+            cols = rows > 0 ? shape.GetNumElements() / rows : 0;
         }
+    }
 
-    public:
+protected:
+    // set the size of the underlying Matrix object to match node dimensions
+    void UpdateDataSize(Matrix<ElemType>& m)
+    {
+        size_t rows, cols;
+        DetermineDataSize(rows, cols);
+        m.Resize(rows, cols);
+    }
+    // and verify the condition that UpdateDataSize() creates (used for sanity checking after loading parameters)
+    void VerifyDataSize(Matrix<ElemType>& m)
+    {
+        size_t rows, cols;
+        DetermineDataSize(rows, cols);
+        m.VerifySize(rows, cols);
+    }
 
-        // update the actual matrix allocation for m_value based on the node dimension
-        void UpdateFunctionValuesSize()
-        {
-            UpdateDataSize(Value());
-        }
+public:
+    // update the actual matrix allocation for m_value based on the node dimension
+    void UpdateFunctionValuesSize()
+    {
+        UpdateDataSize(Value());
+    }
 
-        // this is called before a node's ForwardProp() function is called (in loops: for the first time)
-        // This is where we
-        //  - update the node dimension based on actual MB size
-        //  - (re-)allocate the m_value matrix, which may be shared across nodes and thus have changed dimensions
-        virtual void /*IComputationNode::*/ BeginForwardProp() override // called before first iteration step of ForwardProp()
-        {
-            Base::BeginForwardProp();
+    // this is called before a node's ForwardProp() function is called (in loops: for the first time)
+    // This is where we
+    //  - update the node dimension based on actual MB size
+    //  - (re-)allocate the m_value matrix, which may be shared across nodes and thus have changed dimensions
+    virtual void /*IComputationNode::*/ BeginForwardProp() override // called before first iteration step of ForwardProp()
+    {
+        Base::BeginForwardProp();
 
-            // update the actual m_value allocation
-            if (!IsLeaf() && !RequiresPreCompute()) // TODO: guard this through overrides instead
-                UpdateFunctionValuesSize();
+        // update the actual m_value allocation
+        if (!IsLeaf() && !RequiresPreCompute()) // TODO: guard this through overrides instead
+            UpdateFunctionValuesSize();
 
-            // give nodes a chance to update their internal state that may also have to match MB size
-            UpdateFunctionMBSize();
+        // give nodes a chance to update their internal state that may also have to match MB size
+        UpdateFunctionMBSize();
 
-            // and make sure dimensions are what we expect
-            VerifyDataSize(Value());
-        }
+        // and make sure dimensions are what we expect
+        VerifyDataSize(Value());
+    }
 
 #ifdef _DEBUG
-        // NaN checks
+    // NaN checks
     virtual void /*IComputationNode::*/ EndForwardProp() override
-        {
-            Base::EndForwardProp();
+    {
+        Base::EndForwardProp();
 #ifdef TRACK_GAP_NANS
         MaskMissingValueColumnsToZero(FrameRange(m_pMBLayout)); // HasNaN() operates on a whole matrix, so first flatten all gaps to 0
-            if (Value().HasNan("EndForwardProp"))
-                LogicError("%ls %ls operation unexpectedly produced NaN values.", NodeName().c_str(), OperationName().c_str());
+        if (Value().HasNan("EndForwardProp"))
+            LogicError("%ls %ls operation unexpectedly produced NaN values.", NodeName().c_str(), OperationName().c_str());
 #endif
 #if 0
             MaskMissingValueColumnsToZero(FrameRange(m_pMBLayout)); // HasNaN() operates on a whole matrix, so first flatten all gaps to 0
             Value().Print(msra::strfun::utf8(NodeName()), 0, min(Value().GetNumRows()-1, 4), 0, min(Value().GetNumCols()-1, 4));
 #endif
         InvalidateMissingValueColumns(FrameRange(m_pMBLayout)); // blast NaNs into columns that are gaps in a packed layout
-        }
+    }
 #endif
 
 #if 0 // (keep it around in case we need to add stuff in the future)
@@ -1470,189 +1481,189 @@ template <class ElemType>
 
 #ifdef _DEBUG
     virtual void /*IComputationNode::*/ EndBackprop() override
-        {
-            Base::EndBackprop();
+    {
+        Base::EndBackprop();
 #ifdef TRACK_GAP_NANS
-            for (size_t i = 0; i < m_inputs.size(); i++)
+        for (size_t i = 0; i < m_inputs.size(); i++)
+        {
+            ComputationNodePtr child = Input(i);
+            if (child->m_needsGradient)
             {
-                ComputationNodePtr child = Input(i);
-                if (child->m_needsGradient)
-                {
                 child->MaskMissingGradientColumnsToZero(FrameRange(child->GetMBLayout())); // HasNaN() operates on a whole matrix, so first flatten all gaps to 0
-                    if (child->Gradient().HasNan("EndBackprop"))
-                        LogicError("%ls %ls operation unexpectedly produced NaN gradients.", child->NodeName().c_str(), child->OperationName().c_str());
-                }
+                if (child->Gradient().HasNan("EndBackprop"))
+                    LogicError("%ls %ls operation unexpectedly produced NaN gradients.", child->NodeName().c_str(), child->OperationName().c_str());
             }
-#endif
         }
 #endif
+    }
+#endif
 
-        // this is the entry point from Network; while it will call virtual BackpropTo() into the actual node implementation
-        // TODO: move to -Base (or -Network?)
+    // this is the entry point from Network; while it will call virtual BackpropTo() into the actual node implementation
+    // TODO: move to -Base (or -Network?)
     void Backprop(const FrameRange& fr, bool childrenInThisLoop, bool childrenInOuterLoop) override
-        {
-            if (fr.IsAllFrames() && IsPartOfLoop() && childrenInThisLoop)
-                LogicError("%ls %ls operation: Backprop called with whole-batch FrameRange on node that participates in a loop", NodeName().c_str(), OperationName().c_str());
+    {
+        if (fr.IsAllFrames() && IsPartOfLoop() && childrenInThisLoop)
+            LogicError("%ls %ls operation: Backprop called with whole-batch FrameRange on node that participates in a loop", NodeName().c_str(), OperationName().c_str());
 
-            for (size_t i = 0; i < m_inputs.size(); i++)
-            {
-                ComputationNodePtr child = Input(i);
-                if (child->m_needsGradient &&
+        for (size_t i = 0; i < m_inputs.size(); i++)
+        {
+            ComputationNodePtr child = Input(i);
+            if (child->m_needsGradient &&
                 (childrenInThisLoop && child->IsPartOfLoop() == IsPartOfLoop() ||
                  childrenInOuterLoop && child->IsPartOfLoop() != IsPartOfLoop()))
-                {
-                    //fprintf(stderr, "Backprop: %ls %ls operation -> child %d %ls %ls\n", NodeName().c_str(), OperationName().c_str(), (int)i, child->NodeName().c_str(), child->OperationName().c_str());
-                    if (!m_needsGradient)
-                        LogicError("%ls %ls operation has m_needsGradient set to false but children require it.", NodeName().c_str(), OperationName().c_str());
+            {
+                //fprintf(stderr, "Backprop: %ls %ls operation -> child %d %ls %ls\n", NodeName().c_str(), OperationName().c_str(), (int)i, child->NodeName().c_str(), child->OperationName().c_str());
+                if (!m_needsGradient)
+                    LogicError("%ls %ls operation has m_needsGradient set to false but children require it.", NodeName().c_str(), OperationName().c_str());
 #ifdef DISPLAY_DEBUG
                 fprintf(stderr, "    [%lu]: %ls(%ls)\n", i, child->OperationName().c_str(), child->NodeName().c_str());
 #endif
 #if DUMPOUTPUT
-                    fprintf(stderr, "Backprop%d_%ls\n", i, NodeName().c_str());
+                fprintf(stderr, "Backprop%d_%ls\n", i, NodeName().c_str());
 #endif
                 child->LazyZeroGradient(); // set gradient to 0 if this is the first time
 
-                    // If we propagate from a loop to a node that is outside the loop, we are not efficient.
-                    // This case is handled by SEQTraversalFlowControlNode::Backprop().
-                    // The check below is to verify that.
-                    if (IsPartOfLoop() && !child->IsPartOfLoop() && !fr.IsAllFrames())
-                    {
-                        LogicError("Backprop: Inefficiency: %ls %ls operation in loop propagates gradient to non-loop %ls %ls\n",
-                                   NodeName().c_str(), OperationName().c_str(), child->NodeName().c_str(), child->OperationName().c_str());
-                    }
+                // If we propagate from a loop to a node that is outside the loop, we are not efficient.
+                // This case is handled by SEQTraversalFlowControlNode::Backprop().
+                // The check below is to verify that.
+                if (IsPartOfLoop() && !child->IsPartOfLoop() && !fr.IsAllFrames())
+                {
+                    LogicError("Backprop: Inefficiency: %ls %ls operation in loop propagates gradient to non-loop %ls %ls\n",
+                               NodeName().c_str(), OperationName().c_str(), child->NodeName().c_str(), child->OperationName().c_str());
+                }
 
-                    //fprintf(stderr, "BackpropTo %d %d %ls %ls\n", (int)fr.timeIdxInSeq, (int)i, NodeName().c_str(), OperationName().c_str());
+                //fprintf(stderr, "BackpropTo %d %d %ls %ls\n", (int)fr.timeIdxInSeq, (int)i, NodeName().c_str(), OperationName().c_str());
                 BackpropTo(i, fr); // this computes partial wrt to the child and sums the gradient value in the child
-                }
+            }
 #ifdef DISPLAY_DEBUG
             else
                 fprintf(stderr, "    [%lu]: %s(%s) (no gradient needed so don't compute for)\n", i, child->OperationName().c_str(), child->NodeName().c_str());
 #endif
-            }
         }
+    }
 
-        // TODO: why of the inputs, and not the node itself?
+    // TODO: why of the inputs, and not the node itself?
     void /*ComputationNodeBase::*/ ZeroGradientsOfInputs() override // clears the lazy-init flags (LazyZeroGradient() actually clears the values lazily)
-        {
-            for (size_t i = 0; i < m_inputs.size(); i++)
-                Input(i)->m_gradientInitialized = false;
-        }
+    {
+        for (size_t i = 0; i < m_inputs.size(); i++)
+            Input(i)->m_gradientInitialized = false;
+    }
 
-        // lazy resetting of gradient
-        void LazyZeroGradient()
-        {
-            if (!m_needsGradient)
-                LogicError("%ls %ls operation: LazyZeroGradient() called although this node needs no gradient.", NodeName().c_str(), OperationName().c_str());
+    // lazy resetting of gradient
+    void LazyZeroGradient()
+    {
+        if (!m_needsGradient)
+            LogicError("%ls %ls operation: LazyZeroGradient() called although this node needs no gradient.", NodeName().c_str(), OperationName().c_str());
 
-            if (m_gradientInitialized)
-                return;
+        if (m_gradientInitialized)
+            return;
 
-            UpdateDataSize(Gradient());
-            Gradient().SetValue(0);
+        UpdateDataSize(Gradient());
+        Gradient().SetValue(0);
 
-            m_gradientInitialized = true;
-        }
+        m_gradientInitialized = true;
+    }
 
-        // NOTE: we should reimplement this to be thread-safe and use a larger than requested initialized memory block
-        // we can then just wrap that memory block in a matrix of the correct dimensions since it will be const no one can change it
-        // should only need one memory block per device
-        // Thread-safety could be achieved by changing this to a shared_ptr.
-        // When using the TensorView interface, one could instead just use a 1x1 matrix with a view that broadcasts its columns (stride 0).
-        static const Matrix<ElemType>& ConstOnes(const size_t rows, const size_t cols, const DEVICEID_TYPE deviceId)
+    // NOTE: we should reimplement this to be thread-safe and use a larger than requested initialized memory block
+    // we can then just wrap that memory block in a matrix of the correct dimensions since it will be const no one can change it
+    // should only need one memory block per device
+    // Thread-safety could be achieved by changing this to a shared_ptr.
+    // When using the TensorView interface, one could instead just use a 1x1 matrix with a view that broadcasts its columns (stride 0).
+    static const Matrix<ElemType>& ConstOnes(const size_t rows, const size_t cols, const DEVICEID_TYPE deviceId)
+    {
+        if (s_constOnes.find(rows) == s_constOnes.end() ||
+            s_constOnes[rows].find(cols) == s_constOnes[rows].end()) //not found
         {
-            if (s_constOnes.find(rows) == s_constOnes.end() ||
-                s_constOnes[rows].find(cols) == s_constOnes[rows].end()) //not found
-            {
             Matrix<ElemType>* matrix = new Matrix<ElemType>(rows, cols, (DEVICEID_TYPE) deviceId);
-                matrix->SetValue(1);
-                s_constOnes[rows][cols] = matrix;
-            }
+            matrix->SetValue(1);
+            s_constOnes[rows][cols] = matrix;
+        }
 
-            Matrix<ElemType>* m = s_constOnes[rows][cols];
-            m->TransferFromDeviceToDevice(m->GetDeviceId(), deviceId);
+        Matrix<ElemType>* m = s_constOnes[rows][cols];
+        m->TransferFromDeviceToDevice(m->GetDeviceId(), deviceId);
 
-            return *m;
-        }
+        return *m;
+    }
 
-        void CreateGradientMatrixIfNull()
-        {
-            CreateMatrixIfNull(m_gradient);
-        }
+    void CreateGradientMatrixIfNull()
+    {
+        CreateMatrixIfNull(m_gradient);
+    }
 
-        void MarkValueNonSharable() override
-        {
-            m_valueSharable = false; 
-            CreateMatrixIfNull(m_value);
-        }
+    void MarkValueNonSharable() override
+    {
+        m_valueSharable = false;
+        CreateMatrixIfNull(m_value);
+    }
 
-    protected:
-        // this function is used to create matrices for those needed before matrix pool is available
-        // e.g., for model parameters and input nodes you will need to resize the functions based on NDL
-        // and before matrix pool is available
-        void CreateMatrixIfNull(shared_ptr<Matrix<ElemType>>& matrixPtr)
-        {
-            if (!matrixPtr)
-                matrixPtr = make_shared<Matrix<ElemType>>(m_deviceId);
-        }
+protected:
+    // this function is used to create matrices for those needed before matrix pool is available
+    // e.g., for model parameters and input nodes you will need to resize the functions based on NDL
+    // and before matrix pool is available
+    void CreateMatrixIfNull(shared_ptr<Matrix<ElemType>>& matrixPtr)
+    {
+        if (!matrixPtr)
+            matrixPtr = make_shared<Matrix<ElemType>>(m_deviceId);
+    }
 
-        void RequestMatrixFromPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool)
+    void RequestMatrixFromPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool)
+    {
+        if (matrixPtr == nullptr)
         {
-            if (matrixPtr == nullptr)
-            {
-                matrixPtr = matrixPool.Request<ElemType>(m_deviceId);
-            }
+            matrixPtr = matrixPool.Request<ElemType>(m_deviceId);
         }
+    }
 
-        void ReleaseMatrixToPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool)
-        {
-            assert(matrixPtr != nullptr);
-            matrixPool.Release<ElemType>(matrixPtr);
-        }
+    void ReleaseMatrixToPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool)
+    {
+        assert(matrixPtr != nullptr);
+        matrixPool.Release<ElemType>(matrixPtr);
+    }
 
-        // print node values
-        void PrintNodeValuesToFile(const bool printValues, File& fstream) const
+    // print node values
+    void PrintNodeValuesToFile(const bool printValues, File& fstream) const
+    {
+        if (printValues)
         {
-            if (printValues)
-            {
-                fstream << wstring(L"\n");
+            fstream << wstring(L"\n");
             const Matrix<ElemType>& m = Value();
             for (size_t i = 0; i < m.GetNumRows(); i++)
-                {
+            {
                 for (size_t j = 0; j < m.GetNumCols(); j++)
-                    {
+                {
                     fstream << m(i, j);
-                    }
-                    fstream << wstring(L"\n");
                 }
-                fstream << wstring(L"####################################################################");
+                fstream << wstring(L"\n");
             }
+            fstream << wstring(L"####################################################################");
         }
+    }
 
-    public:
-        virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
+public:
+    virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
+    {
+        Base::CopyTo(nodeP, newName, flags);
+        if (flags & CopyNodeFlags::copyNodeValue)
         {
-            Base::CopyTo(nodeP, newName, flags);
-            if (flags & CopyNodeFlags::copyNodeValue)
-            {
-                auto node = UpCast(nodeP);
-                *node->m_value = *m_value;
-                if (m_gradient)
-                    *node->m_gradient = *m_gradient;
-                else
-                    node->m_gradient = nullptr;
-            }
+            auto node = UpCast(nodeP);
+            *node->m_value = *m_value;
+            if (m_gradient)
+                *node->m_gradient = *m_gradient;
+            else
+                node->m_gradient = nullptr;
         }
+    }
 
-        // duplicate a node
-        ComputationNodeBasePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags)
-        {
-            const std::wstring& name = (newName == L"") ? NodeName() : newName;
-            ComputationNodeBasePtr node(NewThis(m_deviceId, name)); // NewThis() is a virtual function that creates a new node of the actual type of 'this'
-            node->CopyTo(shared_from_this(), newName, flags);       // note: shared_from_this() is the base class, but CopyTo() up-casts it as needed
-            return node;
-        }
+    // duplicate a node
+    ComputationNodeBasePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags)
+    {
+        const std::wstring& name = (newName == L"") ? NodeName() : newName;
+        ComputationNodeBasePtr node(NewThis(m_deviceId, name)); // NewThis() is a virtual function that creates a new node of the actual type of 'this'
+        node->CopyTo(shared_from_this(), newName, flags);       // note: shared_from_this() is the base class, but CopyTo() up-casts it as needed
+        return node;
+    }
 
-        // these are used to export hidden state activations
+    // these are used to export hidden state activations
     virtual bool GetHistory(Matrix<ElemType>&, bool)
     {
         return false;
@@ -1661,7 +1672,7 @@ template <class ElemType>
     {
     }
 
-        /// these two are used to pass gradients from future minibatch
+    /// these two are used to pass gradients from future minibatch
     virtual void GetErrorsToPreviousMinibatch(Matrix<ElemType>&)
     {
     }
@@ -1669,75 +1680,75 @@ template <class ElemType>
     {
     }
 
-    protected:
-        shared_ptr<Matrix<ElemType>> m_value, m_gradient;
+protected:
+    shared_ptr<Matrix<ElemType>> m_value, m_gradient;
 
-        static std::map<size_t, std::map<size_t, Matrix<ElemType>*>> s_constOnes;
-    };
+    static std::map<size_t, std::map<size_t, Matrix<ElemType>*>> s_constOnes;
+};
 
-    // convenience wrapper for ComputationNode::New()
+// convenience wrapper for ComputationNode::New()
 template <class C, class... _Types>
 inline shared_ptr<C> New(_Types&&... _Args)
-    {
-        return make_shared<C>(forward<_Types>(_Args)...);
-    }
+{
+    return make_shared<C>(forward<_Types>(_Args)...);
+}
 
-    // =======================================================================
-    // ComputationNodeNonLooping -- abstract base class for computation nodes that do not implement eval/partial for individual frames
-    // Such as CRFNode, LSTMNode, ParallelNode, SequenceDecoderNode, TimeReverseNode (BatchModeNode), and TransposeNode.
-    // =======================================================================
+// =======================================================================
+// ComputationNodeNonLooping -- abstract base class for computation nodes that do not implement eval/partial for individual frames
+// Such as CRFNode, LSTMNode, ParallelNode, SequenceDecoderNode, TimeReverseNode (BatchModeNode), and TransposeNode.
+// =======================================================================
 
-    // This will provide default implementations for those two functions that will fail at runtime with a meaningful error.
-    // TODO: Most of these are reduce nodes that output a single number, no MBLayout. Maybe abstract those out further
+// This will provide default implementations for those two functions that will fail at runtime with a meaningful error.
+// TODO: Most of these are reduce nodes that output a single number, no MBLayout. Maybe abstract those out further
 template <class ElemType>
-    class ComputationNodeNonLooping : public ComputationNode<ElemType>
-    {
-        typedef ComputationNode<ElemType> Base;
+class ComputationNodeNonLooping : public ComputationNode<ElemType>
+{
+    typedef ComputationNode<ElemType> Base;
 
-    public:
+public:
     ComputationNodeNonLooping(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name)
     {
     }
 
-        // these two implement the ComputationNode<> interface
+    // these two implement the ComputationNode<> interface
     void ForwardProp(const FrameRange& fr) override final
-        {
-            if (fr.IsAllFrames())
-                ForwardPropNonLooping();
-            else
-                LogicError("%s node should never be in a loop.", typeid(*this).name());
-        }
+    {
+        if (fr.IsAllFrames())
+            ForwardPropNonLooping();
+        else
+            LogicError("%s node should never be in a loop.", typeid(*this).name());
+    }
     void BackpropTo(const size_t inputIndex, const FrameRange& fr) override final
-        {
-            if (fr.IsAllFrames())
-                BackpropToNonLooping(inputIndex);
-            else
-                LogicError("%s node should never be in a loop.", typeid(*this).name());
-        }
+    {
+        if (fr.IsAllFrames())
+            BackpropToNonLooping(inputIndex);
+        else
+            LogicError("%s node should never be in a loop.", typeid(*this).name());
+    }
 
-        // non-looping node types instead implement these functions
-        virtual void ForwardPropNonLooping() = 0;
-        virtual void BackpropToNonLooping(size_t inputIndex) = 0;
-    };
+    // non-looping node types instead implement these functions
+    virtual void ForwardPropNonLooping() = 0;
+    virtual void BackpropToNonLooping(size_t inputIndex) = 0;
+};
 
-    // =======================================================================
-    // FlowControlNode -- special wrapper node for use by ComputationNetwork only
-    // =======================================================================
+// =======================================================================
+// FlowControlNode -- special wrapper node for use by ComputationNetwork only
+// =======================================================================
 
-    class FlowControlNode : public ComputationNodeBase
-    {
-        typedef ComputationNodeBase Base;
+class FlowControlNode : public ComputationNodeBase
+{
+    typedef ComputationNodeBase Base;
 
-    public:
+public:
     FlowControlNode()
         : ComputationNodeBase(DEVICEID_NOTYETDETERMINED /*we don't own matrices*/, L"" /*name: we don't care*/)
     {
     }
 
 #pragma warning(disable : 4100)
-        // these are meant to be implemented by ComputationNode<ElemType> but should never be called on traversal nodes
-        // TODO: There are too many of these. This indicates improper class hierarchies.
+    // these are meant to be implemented by ComputationNode<ElemType> but should never be called on traversal nodes
+    // TODO: There are too many of these. This indicates improper class hierarchies.
     virtual ComputationNodeBase* NewThis(DEVICEID_TYPE deviceId, const wstring& name) override
     {
         NOT_IMPLEMENTED;
@@ -1778,7 +1789,7 @@ template <class ElemType>
     {
         NOT_IMPLEMENTED;
     }
-    virtual void ValidateInferInputDimsFrom(const TensorShape &) override
+    virtual void ValidateInferInputDimsFrom(const TensorShape&) override
     {
         NOT_IMPLEMENTED;
     }
@@ -1806,12 +1817,15 @@ template <class ElemType>
     {
         NOT_IMPLEMENTED;
     }
-    virtual void NotifyFunctionValuesMBSizeModified(void) override { NOT_IMPLEMENTED; }
+    virtual void NotifyFunctionValuesMBSizeModified(void) override
+    {
+        NOT_IMPLEMENTED;
+    }
     virtual std::wstring ToString(void) const override
     {
         NOT_IMPLEMENTED;
     }
-        // these are meant to be called during computation, so provide dummy implementations
+    // these are meant to be called during computation, so provide dummy implementations
     virtual bool RequiresPreCompute() const override
     {
         return false;
@@ -1823,49 +1837,49 @@ template <class ElemType>
     {
     }
 
-    protected:
+protected:
 public:                                                // needed in ComputationNetwork::FindInRecurrentLoops(), which really should be part of SEQTraversalFlowControlNode
     std::vector<ComputationNodeBasePtr> m_nestedNodes; // nodes tucked away in this node, in evaluation order
-    };
+};
 
-    // =======================================================================
-    // ILateAttachingNode -- helper wrapper class for ComputationNodes that must AttachInputs() late due to circular references
-    // =======================================================================
+// =======================================================================
+// ILateAttachingNode -- helper wrapper class for ComputationNodes that must AttachInputs() late due to circular references
+// =======================================================================
 
-    // Instantiate with LateAttachingNode<node type>(lambda, args for node constructor).
-    // To resolve, call AttachInputs()
-    // TODO: This is a bit indirect. Can it be done more nicely?
+// Instantiate with LateAttachingNode<node type>(lambda, args for node constructor).
+// To resolve, call AttachInputs()
+// TODO: This is a bit indirect. Can it be done more nicely?
 struct ILateAttachingNode
 {
     virtual void LateAttachInputs() = 0;
 };
 template <class N>
-    class LateAttachingNode : public N, public ILateAttachingNode
-    {
-        typedef typename N::OurElemType ElemType;
-        function<void(ComputationNode<ElemType>*)> attachInputs;
+class LateAttachingNode : public N, public ILateAttachingNode
+{
+    typedef typename N::OurElemType ElemType;
+    function<void(ComputationNode<ElemType>*)> attachInputs;
 
-    public:
-        // constructor
+public:
+    // constructor
     template <class... _Types>
     LateAttachingNode(DEVICEID_TYPE deviceId, const wstring& name, const function<void(ComputationNode<ElemType>*)>& attachInputs, _Types&&... _Args)
         : attachInputs(attachInputs), N(deviceId, name, forward<_Types>(_Args)...)
     {
     }
-        // the one member that does the work
+    // the one member that does the work
     void /*ILateAttachingNode::*/ LateAttachInputs()
-        {
-            attachInputs(dynamic_cast<N*>(this));
+    {
+        attachInputs(dynamic_cast<N*>(this));
         attachInputs = [](ComputationNode<ElemType>*)
         {
             LogicError("LateAttachingNode::AttachInputs: must only be called once");
         };
-        }
-    };
+    }
+};
 
-    // =======================================================================
-    // IRecurrentNode -- helper wrapper class for ComputationNodes that can be recurrent
-    // =======================================================================
+// =======================================================================
+// IRecurrentNode -- helper wrapper class for ComputationNodes that can be recurrent
+// =======================================================================
 
 struct IRecurrentNode
 {
@@ -1893,10 +1907,10 @@ protected:
     using Base::GetDeviceId;                                                                                                                             \
     using Base::SetDims;                                                                                                                                 \
     using Base::SetDims1;                                                                                                                                \
-    using Base::GetSampleMatrixNumRows;                                                                                                                              \
-    using Base::GetSampleMatrixNumCols;                                                                                                                              \
-    using Base::GetAsMatrixNumRows;                                                                                                                              \
-    using Base::GetAsMatrixNumCols;                                                                                                                              \
+    using Base::GetSampleMatrixNumRows;                                                                                                                  \
+    using Base::GetSampleMatrixNumCols;                                                                                                                  \
+    using Base::GetAsMatrixNumRows;                                                                                                                      \
+    using Base::GetAsMatrixNumCols;                                                                                                                      \
     using Base::GetTensorShape;                                                                                                                          \
     using Base::UpdateFunctionValuesSize;                                                                                                                \
     using Base::LoadValue;                                                                                                                               \
@@ -1910,7 +1924,8 @@ protected:
     using Base::InvalidateMissingGradientColumns;                                                                                                        \
     using Base::DataFor;                                                                                                                                 \
     using Base::ValueFor;                                                                                                                                \
-    using Base::GradientAsMatrix; using Base::Gradient;                                                                                                                                \
+    using Base::GradientAsMatrix;                                                                                                                        \
+    using Base::Gradient;                                                                                                                                \
     using Base::GradientFor;                                                                                                                             \
     using Base::MaskedValueFor;                                                                                                                          \
     using Base::MaskedGradientFor;                                                                                                                       \
@@ -1931,7 +1946,8 @@ protected:
     using Base::CreateUniqId;                                                                                                                            \
     using Base::GetNumInputs;                                                                                                                            \
     using Base::ZeroGradientsOfInputs;                                                                                                                   \
-    using Base::VerifyDims; using Base::VerifyDataSize;                                                                                                                              \
+    using Base::VerifyDims;                                                                                                                              \
+    using Base::VerifyDataSize;                                                                                                                          \
     using Base::ConstOnes;                                                                                                                               \
     using Base::DetermineElementwiseTensorRank;                                                                                                          \
     using Base::GetSampleLayout;                                                                                                                         \
@@ -1962,21 +1978,23 @@ protected:
     using Base::RequestMatricesBeforeBackprop;                                                                                                           \
     using Base::ReleaseMatricesAfterBackprop;                                                                                                            \
     using Base::InputUsedInComputingInputNodesGradients;                                                                                                 \
-    using Base::OutputUsedInComputingInputNodesGradients; using Base::m_valueSharable;                                                                                              \
+    using Base::OutputUsedInComputingInputNodesGradients;                                                                                                \
+    using Base::m_valueSharable;                                                                                                                         \
     using Base::Validate;                                                                                                                                \
     using Base::ValidateUnaryMap;                                                                                                                        \
     using Base::ValidateBinaryZip;                                                                                                                       \
     using Base::ValidateUnaryReduce;                                                                                                                     \
     using Base::ValidateBinaryReduce;                                                                                                                    \
     using Base::ValidateInferBinaryInputDims;                                                                                                            \
-    using Base::ValidateInferInputDimsFrom;                                                                                                                  \
+    using Base::ValidateInferInputDimsFrom;                                                                                                              \
     \
 public:                                                                                                                                                  \
     using Base::RequiresPreCompute;                                                                                                                      \
     using Base::AttachInputs;                                                                                                                            \
     using Base::CreateGradientMatrixIfNull;                                                                                                              \
     using Base::NodeName;                                                                                                                                \
-    using Base::ValueAsMatrix; using Base::Value;
+    using Base::ValueAsMatrix;                                                                                                                           \
+    using Base::Value;
 
 #define ComputationNodeBoilerplate                                                             \
     \
@@ -1994,89 +2012,89 @@ public:
     ComputationNodeBoilerplate;                \
     UsingComputationNodeMembers
 
-    // =======================================================================
-    // a few standard base classes for N-nary operations
-    // =======================================================================
+// =======================================================================
+// a few standard base classes for N-nary operations
+// =======================================================================
 
-    // -----------------------------------------------------------------------
-    // UnaryElementWiseNode (operand)
-    //
-    // unary elementwise operations that are implemented with the tensor lib
-    //
-    // Derived clases only need to override ForwardProp() and BackpropTo().
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// UnaryElementWiseNode (operand)
+//
+// unary elementwise operations that are implemented with the tensor lib
+//
+// Derived classes only need to override ForwardProp() and BackpropTo().
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    class UnaryElementWiseNode : public ComputationNode<ElemType>, public NumInputs<1>
-    {
+class UnaryElementWiseNode : public ComputationNode<ElemType>, public NumInputs<1>
+{
     typedef ComputationNode<ElemType> Base;
     UsingComputationNodeMembers;
 
-    public:
+public:
     UnaryElementWiseNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name)
     {
     }
 
     virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
-        {
-            ValidateUnaryMap(isFinalValidationPass);
-        }
-    };
+    {
+        ValidateUnaryMap(isFinalValidationPass);
+    }
+};
 
 #define UsingUnaryElementwiseNodeBaseMembers UsingComputationNodeMembersBoilerplate;
 
-    // -----------------------------------------------------------------------
-    // BinaryElementWiseNode (operand1, operand2)
-    //
-    // binary elementwise operations that are implemented with the tensor lib
-    //
-    // Derived clases only need to override ForwardProp() and BackpropTo().
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// BinaryElementWiseNode (operand1, operand2)
+//
+// binary elementwise operations that are implemented with the tensor lib
+//
+// Derived classes only need to override ForwardProp() and BackpropTo().
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    class BinaryElementWiseNode : public ComputationNode<ElemType>, public NumInputs<2>
-    {
+class BinaryElementWiseNode : public ComputationNode<ElemType>, public NumInputs<2>
+{
     typedef ComputationNode<ElemType> Base;
     UsingComputationNodeMembers;
 
-    public:
+public:
     BinaryElementWiseNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name)
     {
     }
 
-        virtual bool OutputUsedInComputingInputNodesGradients() const override
-        {
+    virtual bool OutputUsedInComputingInputNodesGradients() const override
+    {
 #if DUMPOUTPUT
-            return true;
+        return true;
 #else
-            // By default, the BinaryElementWiseNode does not require its output value for computing
-            // the gradients of its input nodes
-            return false;
+        // By default, the BinaryElementWiseNode does not require its output value for computing
+        // the gradients of its input nodes
+        return false;
 #endif
-        }
+    }
 
-        // By default, the BinaryElementWiseNode does not require any of it's input's values for computing
-        // the gradients of its input nodes
+    // By default, the BinaryElementWiseNode does not require any of its inputs' values for computing
+    // the gradients of its input nodes
     virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override
     {
         return false;
     }
 
     virtual void /*IComputationNode::*/ BeginForwardProp() override // called before first iteration step of ForwardProp()
-        {
-            Base::BeginForwardProp();
-            // we switch result to dense as a work-around because ColumnSlice doesn't support all the sparse formats
-            // TODO: This is a stopgap. Is this the right thing to do? It changes the matrix type in-place.
-            Value().SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, false);
-        }
+    {
+        Base::BeginForwardProp();
+        // we switch result to dense as a work-around because ColumnSlice doesn't support all the sparse formats
+        // TODO: This is a stopgap. Is this the right thing to do? It changes the matrix type in-place.
+        Value().SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, false);
+    }
 
     virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
-        {
+    {
         ValidateBinaryZip(isFinalValidationPass, true /*allowMultiples*/);
-        }
-    };
+    }
+};
 
 #define UsingBinaryElementwiseNodeBaseMembers UsingComputationNodeMembersBoilerplate;
 
diff --git a/Source/ComputationNetworkLib/ConvolutionalNodes.h b/Source/ComputationNetworkLib/ConvolutionalNodes.h
index d7822062f5fe..9028e94f689f 100644
--- a/Source/ComputationNetworkLib/ConvolutionalNodes.h
+++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h
@@ -159,12 +159,12 @@ class ConvolutionNode : public ComputationNode<ElemType>, public NumInputs<2>
         assert(m_convEng != nullptr);
         if (inputIndex == 0) // derivative with respect to the weight matrix
         {
-            auto & grad = Input(0)->GradientAsMatrix();
+            auto& grad = Input(0)->GradientAsMatrix();
             m_convEng->BackwardFilter(*m_outT, sliceOutputGrad, *m_inT, sliceInput1Value, *m_convDesc, *m_filterT, grad, fr.IsAllFrames(), *m_tempMatrix);
         }
         else if (inputIndex == 1) // derivative with respect to the input feature
         {
-            auto & input0 = Input(0)->ValueAsMatrix();
+            auto& input0 = Input(0)->ValueAsMatrix();
             auto sliceInput1Grad = Input(1)->GradientFor(fr);
             m_convEng->BackwardData(*m_outT, sliceOutputGrad, *m_filterT, input0, *m_convDesc, *m_inT, sliceInput1Grad, *m_tempMatrix);
         }
@@ -507,7 +507,7 @@ class PoolingNodeBase : public ComputationNode<ElemType>, public NumInputs<1>
 // See #define of 'UsingComputationNodeMembersBoilerplate' for more explanation.
 #define UsingPoolingNodeBaseMembers         \
     UsingComputationNodeMembersBoilerplate; \
-                                            \
+    \
 protected:                                  \
     using Base::m_factory;                  \
     using Base::m_poolDesc;                 \
@@ -517,7 +517,7 @@ protected:                                  \
     using Base::m_verticalSubsample;        \
     using Base::m_inputSizePerSample;       \
     using Base::m_outputSizePerSample;      \
-                                            \
+    \
 public:
 
 // -----------------------------------------------------------------------
diff --git a/Source/ComputationNetworkLib/EsotericNodes.h b/Source/ComputationNetworkLib/EsotericNodes.h
index 1ca60d180767..1a57b93d9855 100644
--- a/Source/ComputationNetworkLib/EsotericNodes.h
+++ b/Source/ComputationNetworkLib/EsotericNodes.h
@@ -14,17 +14,17 @@
 
 namespace Microsoft { namespace MSR { namespace CNTK {
 
-    // This header collects special-purpose nodes.
-    // It is likely that these are no longer functional.
+// This header collects special-purpose nodes.
+// It is likely that these are no longer functional.
 
 #ifndef ENABLE_BROADCASTING_ELEMENTTIMES
-    // -----------------------------------------------------------------------
-    // PlusNode (summand1, summand2)
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// PlusNode (summand1, summand2)
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    class PlusNode : public BinaryElementWiseNode<ElemType>
-    {
+class PlusNode : public BinaryElementWiseNode<ElemType>
+{
     typedef BinaryElementWiseNode<ElemType> Base;
     UsingBinaryElementwiseNodeBaseMembers;
     static const std::wstring TypeName()
@@ -32,154 +32,154 @@ template <class ElemType>
         return L"Plus";
     }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(PlusNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(PlusNode);
     PlusNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name)
     {
     }
 
     virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
-        {
-            Matrix<ElemType> gradientValues = GradientFor(fr);
-            Matrix<ElemType> functionValues = ValueFor(fr);
-            Matrix<ElemType> inputGradientValues = Input(inputIndex)->GradientFor(fr.AllowBroadcast());
+    {
+        Matrix<ElemType> gradientValues = GradientFor(fr);
+        Matrix<ElemType> functionValues = ValueFor(fr);
+        Matrix<ElemType> inputGradientValues = Input(inputIndex)->GradientFor(fr.AllowBroadcast());
 
 #if DUMPOUTPUT
-            functionValues.Print("PlusNode");
+        functionValues.Print("PlusNode");
 #endif
-            size_t rowsc = Input(inputIndex)->GetNumRows(), colsc = Input(inputIndex)->GetNumColsFor(fr.AllowBroadcast());
-            size_t rowsp = this->GetNumRows(), colsp = this->GetNumColsFor(fr);
+        size_t rowsc = Input(inputIndex)->GetNumRows(), colsc = Input(inputIndex)->GetNumColsFor(fr.AllowBroadcast());
+        size_t rowsp = this->GetNumRows(), colsp = this->GetNumColsFor(fr);
 #if DUMPOUTPUT
-            fprintf(stderr, "input dimensions %lld x %lld,  this node dimensions %lld x %lld\n", rowsc, colsc, rowsp, colsp);
-            gradientValues.Print("Gradient-in");
-            inputGradientValues.Print("child Gradient-in/out");
+        fprintf(stderr, "input dimensions %lld x %lld,  this node dimensions %lld x %lld\n", rowsc, colsc, rowsp, colsp);
+        gradientValues.Print("Gradient-in");
+        inputGradientValues.Print("child Gradient-in/out");
 #endif
 
         if (colsc == colsp && rowsc == rowsp) // matching dimensions  --this may also trigger for column vector added to a frame, if fr denotes a single frame
-            {
-                // BUGBUG: if we reduce from a frame of a MB into a one-column vector, then we must also mask gaps
-                inputGradientValues += gradientValues;
-            }
+        {
+            // BUGBUG: if we reduce from a frame of a MB into a one-column vector, then we must also mask gaps
+            inputGradientValues += gradientValues;
+        }
         else if (colsc == 1 && rowsc == 1) // child is a scalar
-            {
+        {
             MaskMissingGradientColumnsToZero(fr); // reducing over frames, so we must zero out the gaps
-                inputGradientValues += gradientValues.SumOfElements();
-            }
+            inputGradientValues += gradientValues.SumOfElements();
+        }
         else if (colsc == 1 && colsp != 1) // child is a broadcasting column vector
-            {
+        {
             MaskMissingGradientColumnsToZero(fr); // reducing over frames, so we must zero out the gaps
-                // Special case for convolution node bias. See comment in EvaluateThisNode for more details.
-                // BUGBUG: This is not composable. For example, MinusNode does not allow this.
-                auto convNode = dynamic_pointer_cast<ConvolutionNode<ElemType>>(m_inputs[0]);
-                if (convNode != nullptr || (convNode = dynamic_pointer_cast<ConvolutionNode<ElemType>>(m_inputs[1])) != nullptr)
-                    convNode->BackwardBias(gradientValues, inputGradientValues);
-                else
-                {
+            // Special case for convolution node bias. See comment in EvaluateThisNode for more details.
+            // BUGBUG: This is not composable. For example, MinusNode does not allow this.
+            auto convNode = dynamic_pointer_cast<ConvolutionNode<ElemType>>(m_inputs[0]);
+            if (convNode != nullptr || (convNode = dynamic_pointer_cast<ConvolutionNode<ElemType>>(m_inputs[1])) != nullptr)
+                convNode->BackwardBias(gradientValues, inputGradientValues);
+            else
+            {
                 size_t colspExpand = rowsp * colsp / rowsc;
-                    Matrix<ElemType>::MultiplyAndAdd(gradientValues.Reshaped(rowsc, colspExpand), false, ConstOnes(colspExpand, 1, functionValues.GetDeviceId()), false, inputGradientValues);
-                }
+                Matrix<ElemType>::MultiplyAndAdd(gradientValues.Reshaped(rowsc, colspExpand), false, ConstOnes(colspExpand, 1, functionValues.GetDeviceId()), false, inputGradientValues);
             }
+        }
         else if (rowsc == 1 && rowsp != 1) // child is a broadcasting row vector
+        {
+            Matrix<ElemType>::MultiplyAndAdd(ConstOnes(1, rowsp, functionValues.GetDeviceId()), false, gradientValues, false, inputGradientValues);
+        }
+        else if (colsc != 1 && colsp % colsc == 0)
+        {
+            // the children matrix is [a b] and the parent considers it as [a a a b b b]
+            // Note: There is no need to mask gaps here because this operation is only allowed on non-MBLayout inputs
+            size_t ratio = colsp / colsc;
+            for (size_t i = 0; i < colsc; i++)
             {
-                Matrix<ElemType>::MultiplyAndAdd(ConstOnes(1, rowsp, functionValues.GetDeviceId()), false, gradientValues, false, inputGradientValues);
-            }
-            else if (colsc != 1 && colsp % colsc == 0)
-            {
-                // the children matrix is [a b] and the parent considers it as [a a a b b b]
-                // Note: There is no need to mask gaps here because this operation is only allowed on non-MBLayout inputs
-                size_t ratio = colsp / colsc; 
-                for (size_t i = 0; i < colsc; i++)
-                {
                 size_t colspExpand = rowsp * colsp / rowsc / colsc;
-                    Matrix<ElemType> tmp = gradientValues.ColumnSlice(i * ratio, ratio);
-                    tmp.Reshape(rowsc, colspExpand);
-                    Matrix<ElemType> res = inputGradientValues.ColumnSlice(i, 1);
-                    Matrix<ElemType>::MultiplyAndAdd(tmp, false, ConstOnes(colspExpand, 1, functionValues.GetDeviceId()), false, res);
-                    inputGradientValues.ColumnSlice(i, 1).SetValue(res);
-                }
+                Matrix<ElemType> tmp = gradientValues.ColumnSlice(i * ratio, ratio);
+                tmp.Reshape(rowsc, colspExpand);
+                Matrix<ElemType> res = inputGradientValues.ColumnSlice(i, 1);
+                Matrix<ElemType>::MultiplyAndAdd(tmp, false, ConstOnes(colspExpand, 1, functionValues.GetDeviceId()), false, res);
+                inputGradientValues.ColumnSlice(i, 1).SetValue(res);
             }
-            else
-                RuntimeError("Plus partial: unexpected condition.");
+        }
+        else
+            RuntimeError("Plus partial: unexpected condition.");
 #if DUMPOUTPUT
-            inputGradientValues.Print("child Gradient-out");
+        inputGradientValues.Print("child Gradient-out");
 #endif
-        }
+    }
 
     virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
-        {
-            Matrix<ElemType> functionValues = ValueFor(fr);
-            Matrix<ElemType> inputFunctionValues0 = Input(0)->ValueFor(fr.AllowBroadcast());
-            Matrix<ElemType> inputFunctionValues1 = Input(1)->ValueFor(fr.AllowBroadcast());
-            // Note: If one input is a column vector (no MBLayout) and the other a sequence of frames (MBLayout), then the above will be a slice for the other only.
+    {
+        Matrix<ElemType> functionValues = ValueFor(fr);
+        Matrix<ElemType> inputFunctionValues0 = Input(0)->ValueFor(fr.AllowBroadcast());
+        Matrix<ElemType> inputFunctionValues1 = Input(1)->ValueFor(fr.AllowBroadcast());
+        // Note: If one input is a column vector (no MBLayout) and the other a sequence of frames (MBLayout), then the above will be a slice for the other only.
 
-            size_t rows0 = inputFunctionValues0.GetNumRows(), cols0 = inputFunctionValues0.GetNumCols();
-            size_t rows1 = inputFunctionValues1.GetNumRows(), cols1 = inputFunctionValues1.GetNumCols();
+        size_t rows0 = inputFunctionValues0.GetNumRows(), cols0 = inputFunctionValues0.GetNumCols();
+        size_t rows1 = inputFunctionValues1.GetNumRows(), cols1 = inputFunctionValues1.GetNumCols();
 
         if ((rows0 == rows1 && cols0 == cols1 /*matching dimensions*/) || ((rows0 == 1 || rows1 == 1) /*one is a broadcasting row vector*/ && cols0 == cols1))
+        {
+            functionValues.AssignSumOf(inputFunctionValues0, inputFunctionValues1);
+        }
+        else if (cols0 == 1 && rows1 % rows0 == 0 || cols1 == 1 && rows0 % rows1 == 0) // one is col vec with divisable rows, including scalar   --allowing divisable rows can be useful for images
+        {
+            // REVIEW alexeyk: this hack is required to handle bias in convolution node which may
+            // use a format (e.g. NCHW) where bias addition cannot be represented as adding column/row vector to matrix.
+            // Bias does NOT have to be a vector of size equal to number of output feature map (though it's a common case).
+            auto convNode = dynamic_pointer_cast<ConvolutionNode<ElemType>>(m_inputs[0]);
+            if (convNode != nullptr || (convNode = dynamic_pointer_cast<ConvolutionNode<ElemType>>(m_inputs[1])) != nullptr)
             {
-                functionValues.AssignSumOf(inputFunctionValues0, inputFunctionValues1);
+                convNode->AddBias(cols0 == 1 ? inputFunctionValues1 : inputFunctionValues0,
+                                  cols0 == 1 ? inputFunctionValues0 : inputFunctionValues1, functionValues);
             }
-            else if (cols0 == 1 && rows1 % rows0 == 0 || cols1 == 1 && rows0 % rows1 == 0) // one is col vec with divisable rows, including scalar   --allowing divisable rows can be useful for images
+            else
             {
-                // REVIEW alexeyk: this hack is required to handle bias in convolution node which may
-                // use a format (e.g. NCHW) where bias addition cannot be represented as adding column/row vector to matrix.
-                // Bias does NOT have to be a vector of size equal to number of output feature map (though it's a common case).
-                auto convNode = dynamic_pointer_cast<ConvolutionNode<ElemType>>(m_inputs[0]);
-                if (convNode != nullptr || (convNode = dynamic_pointer_cast<ConvolutionNode<ElemType>>(m_inputs[1])) != nullptr)
+                // None of the input nodes are convolutional.
+                if (cols0 == 1)
                 {
-                    convNode->AddBias(cols0 == 1 ? inputFunctionValues1 : inputFunctionValues0, 
-                        cols0 == 1 ? inputFunctionValues0 : inputFunctionValues1, functionValues);
+                    functionValues.Reshape(rows0, rows1 * cols1 / rows0);
+                    functionValues.AssignSumOf(inputFunctionValues1.Reshaped(rows0, rows1 * cols1 / rows0), inputFunctionValues0);
                 }
                 else
                 {
-                    // None of the input nodes are convolutional.
-                    if (cols0 == 1)
-                    {
-                        functionValues.Reshape(rows0, rows1 * cols1 / rows0);
-                        functionValues.AssignSumOf(inputFunctionValues1.Reshaped(rows0, rows1 * cols1 / rows0), inputFunctionValues0);
-                    }
-                    else
-                    {
-                        functionValues.Reshape(rows1, rows0 * cols0 / rows1);
-                        functionValues.AssignSumOf(inputFunctionValues0.Reshaped(rows1, rows0 * cols0 / rows1), inputFunctionValues1);
-                    }
+                    functionValues.Reshape(rows1, rows0 * cols0 / rows1);
+                    functionValues.AssignSumOf(inputFunctionValues0.Reshaped(rows1, rows0 * cols0 / rows1), inputFunctionValues1);
                 }
-                functionValues.Reshape(max(rows0, rows1), max(cols0, cols1));
             }
+            functionValues.Reshape(max(rows0, rows1), max(cols0, cols1));
+        }
         else if (cols1 < cols0 && rows0 == rows1 && cols0 % cols1 == 0) // first summand is a matrix with number of columns that is a multiple of the column number of the second matrix
+        {
+            if (m_pMBLayout)
+                InvalidArgument("%ls %ls operation applied to mismatching number of columns when columns are samples of a minibatch", NodeName().c_str(), OperationName().c_str());
+            // the children matrix is [a b] and the parent considers it as [a a a b b b]
+            // This can be useful for dealing with images.
+            Matrix<ElemType> tmpMat(inputFunctionValues1.GetDeviceId());
+            size_t ratio = cols0 / cols1;
+            // TODO: Why is this different from MinusNode?
+            for (size_t i = 0; i < cols1; i++)
             {
-                if (m_pMBLayout)
-                    InvalidArgument("%ls %ls operation applied to mismatching number of columns when columns are samples of a minibatch", NodeName().c_str(), OperationName().c_str());
-                // the children matrix is [a b] and the parent considers it as [a a a b b b]
-                // This can be useful for dealing with images.
-                Matrix<ElemType> tmpMat(inputFunctionValues1.GetDeviceId());
-                size_t ratio = cols0 / cols1;
-                // TODO: Why is this different from MinusNode?
-                for (size_t i = 0; i < cols1; i++)
-                {
-                    tmpMat = Matrix<ElemType>::RepMat(inputFunctionValues1.ColumnSlice(i, 1), 1, ratio);
+                tmpMat = Matrix<ElemType>::RepMat(inputFunctionValues1.ColumnSlice(i, 1), 1, ratio);
                 functionValues.ColumnSlice(i * ratio, ratio).SetValue(tmpMat + inputFunctionValues0.ColumnSlice(i * ratio, ratio));
-                }
             }
-            else
-                LogicError("%ls %ls operation's Validate() function let invalid dimensions slip by.", NodeName().c_str(), OperationName().c_str());
+        }
+        else
+            LogicError("%ls %ls operation's Validate() function let invalid dimensions slip by.", NodeName().c_str(), OperationName().c_str());
 #if DUMPOUTPUT
-            functionValues.Print("PlusNode");
+        functionValues.Print("PlusNode");
 #endif
-        }
-    };
+    }
+};
 
-    template class PlusNode<float>; 
-    template class PlusNode<double>;
+template class PlusNode<float>;
+template class PlusNode<double>;
 
-    // -----------------------------------------------------------------------
-    // MinusNode (minuend, subtrahend)
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// MinusNode (minuend, subtrahend)
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    class MinusNode : public BinaryElementWiseNode<ElemType>
-    {
+class MinusNode : public BinaryElementWiseNode<ElemType>
+{
     typedef BinaryElementWiseNode<ElemType> Base;
     UsingBinaryElementwiseNodeBaseMembers;
     static const std::wstring TypeName()
@@ -187,94 +187,94 @@ template <class ElemType>
         return L"Minus";
     }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(MinusNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(MinusNode);
     MinusNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name)
     {
     }
 
     virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
-        {
-            ElemType sign = inputIndex == 0 ? 1.0f : -1.0f;
-            Matrix<ElemType> gradientValues = GradientFor(fr);
+    {
+        ElemType sign = inputIndex == 0 ? 1.0f : -1.0f;
+        Matrix<ElemType> gradientValues = GradientFor(fr);
 
-            Matrix<ElemType> childGradientValues = Input(inputIndex)->GradientFor(fr.AllowBroadcast());
+        Matrix<ElemType> childGradientValues = Input(inputIndex)->GradientFor(fr.AllowBroadcast());
 
-            size_t rowsc = Input(inputIndex)->GetNumRows(), colsc = Input(inputIndex)->GetNumColsFor(fr.AllowBroadcast());
-            size_t rowsp = this->GetNumRows(), colsp = this->GetNumColsFor(fr);
+        size_t rowsc = Input(inputIndex)->GetNumRows(), colsc = Input(inputIndex)->GetNumColsFor(fr.AllowBroadcast());
+        size_t rowsp = this->GetNumRows(), colsp = this->GetNumColsFor(fr);
 
         if (colsc == colsp && rowsc == rowsp) // matching dimensions
-            {
-                // BUGBUG: if we reduce from a frame of a MB into a one-column vector, then we must also mask gaps
-                if (sign > 0)
-                    childGradientValues += gradientValues;
-                else
-                    childGradientValues -= gradientValues;
-            }
+        {
+            // BUGBUG: if we reduce from a frame of a MB into a one-column vector, then we must also mask gaps
+            if (sign > 0)
+                childGradientValues += gradientValues;
+            else
+                childGradientValues -= gradientValues;
+        }
         else if (colsc == 1 && rowsc == 1) // child is a scalar (1 x 1)
-            {
+        {
             MaskMissingGradientColumnsToZero(fr); // reducing over frames, so we must zero out the gaps
-                if (sign > 0)
-                    childGradientValues += gradientValues.SumOfElements();
-                else
-                    childGradientValues -= gradientValues.SumOfElements();
-            }
+            if (sign > 0)
+                childGradientValues += gradientValues.SumOfElements();
+            else
+                childGradientValues -= gradientValues.SumOfElements();
+        }
         else if (colsc == 1 && colsp != 1) // child is broadcasting column vector
-            {
-                size_t colspExpand = rowsp * colsp / rowsc;
+        {
+            size_t colspExpand = rowsp * colsp / rowsc;
             MaskMissingGradientColumnsToZero(fr); // reducing over frames, so we must zero out the gaps
-                Matrix<ElemType>::MultiplyAndWeightedAdd(sign, gradientValues.Reshaped(rowsc, colspExpand), false, ConstOnes(colspExpand, 1, Value().GetDeviceId()), false, 1, childGradientValues);
-            }
+            Matrix<ElemType>::MultiplyAndWeightedAdd(sign, gradientValues.Reshaped(rowsc, colspExpand), false, ConstOnes(colspExpand, 1, Value().GetDeviceId()), false, 1, childGradientValues);
+        }
         else if (rowsc == 1 && rowsp != 1) // child is a broadcasting row vector
-            {
-                Matrix<ElemType>::MultiplyAndWeightedAdd(sign, ConstOnes(1, rowsp, Value().GetDeviceId()), false, gradientValues, false, 1, childGradientValues);
-            }
-            else
-                LogicError("%ls %ls operation's Validate() function let invalid dimensions slip by.", NodeName().c_str(), OperationName().c_str());
+        {
+            Matrix<ElemType>::MultiplyAndWeightedAdd(sign, ConstOnes(1, rowsp, Value().GetDeviceId()), false, gradientValues, false, 1, childGradientValues);
         }
+        else
+            LogicError("%ls %ls operation's Validate() function let invalid dimensions slip by.", NodeName().c_str(), OperationName().c_str());
+    }
 
     virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
-        {
-            Matrix<ElemType> functionValues = ValueFor(fr);
-            Matrix<ElemType> inputFunctionValues0 = Input(0)->ValueFor(fr.AllowBroadcast());
-            Matrix<ElemType> inputFunctionValues1 = Input(1)->ValueFor(fr.AllowBroadcast());
+    {
+        Matrix<ElemType> functionValues = ValueFor(fr);
+        Matrix<ElemType> inputFunctionValues0 = Input(0)->ValueFor(fr.AllowBroadcast());
+        Matrix<ElemType> inputFunctionValues1 = Input(1)->ValueFor(fr.AllowBroadcast());
 
-            size_t rows0 = inputFunctionValues0.GetNumRows(), cols0 = inputFunctionValues0.GetNumCols();
-            size_t rows1 = inputFunctionValues1.GetNumRows(), cols1 = inputFunctionValues1.GetNumCols();
+        size_t rows0 = inputFunctionValues0.GetNumRows(), cols0 = inputFunctionValues0.GetNumCols();
+        size_t rows1 = inputFunctionValues1.GetNumRows(), cols1 = inputFunctionValues1.GetNumCols();
         functionValues.VerifySize(max(rows0, rows1), max(cols0, cols1));
 
         if ((rows0 == rows1 && cols0 == cols1 /*match*/) || ((rows0 == 1 || rows1 == 1) /*one is a broadcasting row vector*/ && cols0 == cols1))
-            {
-                functionValues.AssignDifferenceOf(inputFunctionValues0, inputFunctionValues1);
-            }
+        {
+            functionValues.AssignDifferenceOf(inputFunctionValues0, inputFunctionValues1);
+        }
         else if (cols0 == 1 && rows1 % rows0 == 0) // one is col vec with divisable rows, including scalar
-            {
-                functionValues.AssignDifferenceOf(inputFunctionValues0, inputFunctionValues1.Reshaped(rows0, rows1 * cols1 / rows0));
+        {
+            functionValues.AssignDifferenceOf(inputFunctionValues0, inputFunctionValues1.Reshaped(rows0, rows1 * cols1 / rows0));
             functionValues.Reshape(max(rows0, rows1), max(cols0, cols1));
-            }
+        }
         else if (cols1 == 1 && rows0 % rows1 == 0) // one is col vec with divisable rows, including scalar
-            {
-                functionValues.AssignDifferenceOf(inputFunctionValues0.Reshaped(rows1, rows0 * cols0 / rows1), inputFunctionValues1);
-                functionValues.Reshape(max(rows0, rows1), max(cols0, cols1));
-            }
-            else
-                LogicError("%ls %ls operation's Validate() function let invalid dimensions slip by.", NodeName().c_str(), OperationName().c_str());
+        {
+            functionValues.AssignDifferenceOf(inputFunctionValues0.Reshaped(rows1, rows0 * cols0 / rows1), inputFunctionValues1);
+            functionValues.Reshape(max(rows0, rows1), max(cols0, cols1));
         }
-    };
+        else
+            LogicError("%ls %ls operation's Validate() function let invalid dimensions slip by.", NodeName().c_str(), OperationName().c_str());
+    }
+};
 
-    template class MinusNode<float>; 
-    template class MinusNode<double>;
+template class MinusNode<float>;
+template class MinusNode<double>;
 
-    // -----------------------------------------------------------------------
-    // ElementTimesNode (factor1, factor2)
-    //
-    // This allows broadcasting, and can thus also scale with a row, a column, or a scalar.
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// ElementTimesNode (factor1, factor2)
+//
+// This allows broadcasting, and can thus also scale with a row, a column, or a scalar.
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    class ElementTimesNode : public BinaryElementWiseNode<ElemType>
-    {
+class ElementTimesNode : public BinaryElementWiseNode<ElemType>
+{
     typedef BinaryElementWiseNode<ElemType> Base;
     UsingBinaryElementwiseNodeBaseMembers;
     static const std::wstring TypeName()
@@ -282,24 +282,24 @@ template <class ElemType>
         return L"ElementTimes";
     }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(ElementTimesNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(ElementTimesNode);
     ElementTimesNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name)
     {
     }
 
     virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
-        {
-            Matrix<ElemType> sliceInput0Grad = Input(inputIndex)->GradientFor(fr);
-            Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
+    { // accumulates into the gradient of input 'inputIndex', restricted to frame range 'fr'
+        Matrix<ElemType> sliceInput0Grad = Input(inputIndex)->GradientFor(fr); // gradient of the input being updated (input 0 or 1, despite the name)
+        Matrix<ElemType> sliceOutputGrad = GradientFor(fr); // this node's own gradient
         Matrix<ElemType> sliceInput1Value = Input(1 - inputIndex)->ValueFor(fr);
 
-            // depending on inputIndex, all the input variables change meaning
-            // inputIndex == 0 (left) -  inputGradientValues[0], inputFunctionValues[1]
-            // inputIndex == 1 (right) - inputGradientValues[1], inputFunctionValues[0]
-            sliceInput0Grad.AddElementProductOf(sliceOutputGrad, sliceInput1Value);
-        }
+        // The local names are written for inputIndex == 0; for inputIndex == 1 the roles swap:
+        // inputIndex == 0 (left)  - updates the gradient of input 0 from the value of input 1
+        // inputIndex == 1 (right) - updates the gradient of input 1 from the value of input 0
+        sliceInput0Grad.AddElementProductOf(sliceOutputGrad, sliceInput1Value);
+    }
 
     virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
     {
@@ -307,28 +307,28 @@ template <class ElemType>
     }
 
     virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
-        {
-            Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
-            Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
-            Matrix<ElemType> sliceOutputValue = ValueFor(fr);
+    { // computes this node's value for frame range 'fr' from the matching slices of both inputs
+        Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
+        Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
+        Matrix<ElemType> sliceOutputValue = ValueFor(fr); // NOTE(review): presumably a view onto Value(), since the result is written through it -- confirm
 
-            //ForwardPropS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
-            sliceOutputValue.AssignElementProductOf(sliceInput0Value, sliceInput1Value);
-        }
-    };
+        // disabled call: ForwardPropS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
+        sliceOutputValue.AssignElementProductOf(sliceInput0Value, sliceInput1Value);
+    }
+};
 
-    template class ElementTimesNode<float>; 
-    template class ElementTimesNode<double>;
+template class ElementTimesNode<float>;
+template class ElementTimesNode<double>;
 
-    // -----------------------------------------------------------------------
-    // ScaleNode (scalar scaling factor, matrix)
-    //
-    // Identical to ElementTimesNode with tensor lib (broadcasting). Can be removed.
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// ScaleNode (scalar scaling factor, matrix)
+//
+// Identical to ElementTimesNode with tensor lib (broadcasting). Can be removed.
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    class ScaleNode : public ComputationNode<ElemType>, public NumInputs<2>
-    {
+class ScaleNode : public ComputationNode<ElemType>, public NumInputs<2>
+{
     typedef ComputationNode<ElemType> Base;
     UsingComputationNodeMembersBoilerplate;
     static const std::wstring TypeName()
@@ -336,92 +336,92 @@ template <class ElemType>
         return L"Scale";
     }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(ScaleNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(ScaleNode);
     ScaleNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name)
     {
     }
 
     virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
-        {
+    { // accumulates into the gradient of input 'inputIndex'; tensor code if ENABLE_TENSORVIEW is defined, matrix code otherwise
 #ifdef ENABLE_TENSORVIEW // This takes a big perf hit since our reduction uses only a single thread in this case. Needs to be fixed.
-            size_t rank = DetermineElementwiseTensorRank();
-            auto gradient = GradientTensorFor(rank, fr);
-            auto inputGradient = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast());
-            auto otherInputValue = Input(1 - inputIndex)->ValueTensorFor(rank, fr.AllowBroadcast());
-
-            // if reduction then mask the respective input(s) (zero out the gaps)
-            if (Input(inputIndex)->GetNumCols() < GetNumCols())
-                MaskMissingGradientColumnsToZero(fr);
-            if (Input(inputIndex)->GetNumCols() < Input(1 - inputIndex)->GetNumCols())
-                Input(1 - inputIndex)->MaskMissingValueColumnsToZero(fr);
-
-            inputGradient.AddElementwiseProductOf(gradient, otherInputValue);
+        size_t rank = DetermineElementwiseTensorRank();
+        auto gradient = GradientTensorFor(rank, fr); // this node's gradient
+        auto inputGradient = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast()); // the gradient being updated
+        auto otherInputValue = Input(1 - inputIndex)->ValueTensorFor(rank, fr.AllowBroadcast());
+
+        // if the update is a reduction (the input has fewer columns than its counterpart), mask the respective operand(s) first (zero out the gaps)
+        if (Input(inputIndex)->GetNumCols() < GetNumCols())
+            MaskMissingGradientColumnsToZero(fr);
+        if (Input(inputIndex)->GetNumCols() < Input(1 - inputIndex)->GetNumCols())
+            Input(1 - inputIndex)->MaskMissingValueColumnsToZero(fr);
+
+        inputGradient.AddElementwiseProductOf(gradient, otherInputValue);
 #else
         if (inputIndex == 0) // left derivative
-            {
-                // this is a reduction over frames, so we must mask gaps to zero
-                Input(0)->Gradient() += Matrix<ElemType>::InnerProductOfMatrices(MaskedGradientFor(fr), Input(1)->MaskedValueFor(fr)); // element-wise product summed up over all
-            }
+        {
+            // this is a reduction over frames, so we must mask gaps to zero
+            Input(0)->Gradient() += Matrix<ElemType>::InnerProductOfMatrices(MaskedGradientFor(fr), Input(1)->MaskedValueFor(fr)); // element-wise product summed up over all elements
+        }
         else if (inputIndex == 1) // right derivative
-            {
-                Matrix<ElemType> sliceInput1Grad = Input(1)->GradientFor(fr);
+        {
+            Matrix<ElemType> sliceInput1Grad = Input(1)->GradientFor(fr); // slice of input 1's gradient that is updated below
             Matrix<ElemType>::Multiply1x1AndWeightedAdd(+1.0f, Input(0)->Value() /*1x1*/, GradientFor(fr), 1.0f, sliceInput1Grad);
-            }
-#endif
         }
+#endif // ENABLE_TENSORVIEW
+    }
 
-        virtual bool OutputUsedInComputingInputNodesGradients() const override
-        {
-            // The ScaleNode does not require its output value for computing
-            // the gradients of its input nodes
-            return false;
-        }
+    virtual bool OutputUsedInComputingInputNodesGradients() const override
+    {
+        // BackpropTo() of this node reads only this node's gradient and the inputs' values,
+        // never this node's own output value, hence false.
+        return false;
+    }
 
     virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
-        {
+    { // value for frame range 'fr' = input 0 (required by Validate() to be 1 x 1) times input 1
 #ifdef ENABLE_TENSORVIEW
         static int c = 0;
         if (c++ == 0)
         {
             fprintf(stderr, "#SCALE#\n");
         }
-            size_t rank = DetermineElementwiseTensorRank();
-            auto result = ValueTensorFor(rank, fr);
-            auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
-            auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
-            result.AssignElementwiseProductOf(input0, input1);
+        size_t rank = DetermineElementwiseTensorRank();
+        auto result = ValueTensorFor(rank, fr);
+        auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast()); // NOTE(review): AllowBroadcast() presumably lets the 1 x 1 operand be broadcast -- confirm
+        auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
+        result.AssignElementwiseProductOf(input0, input1);
 #else
         ValueFor(fr).Assign1x1ProductOf(Input(0)->Value() /*1x1*/, Input(1)->ValueFor(fr));
 #endif
-        }
+    }
 
     virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
-        {
-            Base::Validate(isFinalValidationPass);
-            InferMBLayoutFromInputsForStandardCase();
+    { // checks that input 0 is a scalar (final pass only) and takes the output dimensions from input 1
+        Base::Validate(isFinalValidationPass);
+        InferMBLayoutFromInputsForStandardCase();
 
-            // left node must be a scalar
-            if (isFinalValidationPass && (Input(0)->GetNumRows() != 1 || Input(0)->GetNumCols() != 1))
-                RuntimeError("The left value of ScaleNode must be a scalar value.");
+        // left node must be a scalar (1 x 1); this is only enforced in the final validation pass
+        if (isFinalValidationPass && (Input(0)->GetNumRows() != 1 || Input(0)->GetNumCols() != 1))
+            RuntimeError("The left value of ScaleNode must be a scalar value.");
 
-            SetDims(Input(1));
-        }
-    };
+        SetDims(Input(1)); // dimensions are taken from the right (non-scalar) input
+    }
+};
 
-    template class ScaleNode<float>; 
-    template class ScaleNode<double>;
+template class ScaleNode<float>;
+template class ScaleNode<double>;
 
-    // -----------------------------------------------------------------------
-    // RowElementTimesNode (left, right)  --TODO: what are left and right?
-    //
-    // TODO: This is subsumed by ElementTimes with tensor lib.
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// RowElementTimesNode (left, right)  -- left: matrix; right: row vector with as many columns as left (see Validate())
+//
+// TODO: This is subsumed by ElementTimes with tensor lib.
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    class RowElementTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
-    {
+class RowElementTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
+{
     typedef ComputationNode<ElemType> Base;
     UsingComputationNodeMembersBoilerplate;
     static const std::wstring TypeName()
@@ -429,154 +429,154 @@ template <class ElemType>
         return L"RowElementTimes";
     }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(RowElementTimesNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(RowElementTimesNode);
     RowElementTimesNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name)
     {
     }
 
-        void BackpropToMap(const size_t inputIndex)
-        {
-            if (inputIndex > 1)
-                InvalidArgument("RowElementTimes operation only takes two inputs.");
+    void BackpropToMap(const size_t inputIndex) // whole-minibatch backprop; BackpropTo() calls this when fr.IsAllFrames()
+    {
+        if (inputIndex > 1)
+            InvalidArgument("RowElementTimes operation only takes two inputs.");
 
-            if (inputIndex == 0)
-            {
-                BackpropToLeftS(Input(1)->Value(), Input(0)->Gradient(), Gradient(), *m_tempMatrix);
-            }
-            else
-            {
-                BackpropToRightS(Input(0)->Value(), Input(1)->Gradient(), Gradient(), *m_tempMatrix);
-            }
+        if (inputIndex == 0) // left input: needs the right input's value
+        {
+            BackpropToLeftS(Input(1)->Value(), Input(0)->Gradient(), Gradient(), *m_tempMatrix);
         }
+        else // right input: needs the left input's value
+        {
+            BackpropToRightS(Input(0)->Value(), Input(1)->Gradient(), Gradient(), *m_tempMatrix);
+        }
+    }
 
     virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
-        {
+    { // accumulates into the gradient of input 'inputIndex'; the all-frames case is delegated to BackpropToMap()
         if (fr.IsAllFrames())
         {
             BackpropToMap(inputIndex);
             return;
         } // TODO: remove these one by one
-            Matrix<ElemType> sliceInput0Grad = Input(inputIndex)->GradientFor(fr);
-            Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
+        Matrix<ElemType> sliceInput0Grad = Input(inputIndex)->GradientFor(fr); // gradient of the input being updated (input 0 or 1, despite the name)
+        Matrix<ElemType> sliceOutputGrad = GradientFor(fr); // this node's own gradient
 
-            Matrix<ElemType> sliceInput1Value = Input(1 - inputIndex)->ValueFor(fr);
+        Matrix<ElemType> sliceInput1Value = Input(1 - inputIndex)->ValueFor(fr); // value of the other input (input 1 or 0, despite the name)
 
-            if (inputIndex == 0)
-            {
-                BackpropToLeftS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
-            }
-            else
-            {
-                BackpropToRightS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
-            }
+        if (inputIndex == 0)
+        {
+            BackpropToLeftS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
         }
-
-        virtual bool OutputUsedInComputingInputNodesGradients() const override
+        else
         {
-            // The RowElementTimesNode does not require its output value for computing
-            // the gradients of its input nodes
-            return false;
+            BackpropToRightS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad, *m_tempMatrix); // here the two slices hold input 0's value and input 1's gradient
         }
+    }
+
+    virtual bool OutputUsedInComputingInputNodesGradients() const override
+    {
+        // BackpropTo()/BackpropToMap() of this node pass only the inputs' values and gradients
+        // to the helpers, never this node's own output value, hence false.
+        return false;
+    }
 
-        //left (input 0) is a matrix
+    // Gradient for the left input (input 0, a matrix): input0GradientValues += gradientValues after RowElementMultiplyWith(input 1's value)
     /*TODO: merge with call site*/ void BackpropToLeftS(Matrix<ElemType>& input1FunctionValues,
-            Matrix<ElemType>& input0GradientValues, 
-            const Matrix<ElemType>& gradientValues, 
-            Matrix<ElemType>& tempMatrix)
-        {
-            tempMatrix.SetValue(gradientValues);
-            tempMatrix.RowElementMultiplyWith(input1FunctionValues);
-            input0GradientValues += tempMatrix;
+                                                        Matrix<ElemType>& input0GradientValues,
+                                                        const Matrix<ElemType>& gradientValues,
+                                                        Matrix<ElemType>& tempMatrix)
+    {
+        tempMatrix.SetValue(gradientValues); // tempMatrix is scratch space; its previous content is overwritten
+        tempMatrix.RowElementMultiplyWith(input1FunctionValues);
+        input0GradientValues += tempMatrix; // accumulate, do not overwrite
 
 #if NANCHECK
-            input0GradientValues.HasNan("RowElementTimes");
+        input0GradientValues.HasNan("RowElementTimes"); // the return value is not used here
 #endif
-        }
+    }
 
-        //right (input 1) is a row vector
+    // Gradient for the right input (input 1, a row vector): input1GradientValues += AssignInnerProductOf(gradientValues, input 0's value, true)
     /*TODO: merge with call site*/ void BackpropToRightS(Matrix<ElemType>& input0FunctionValues,
-            Matrix<ElemType>& input1GradientValues, 
-            const Matrix<ElemType>& gradientValues, 
-            Matrix<ElemType>& tempMatrix)
-        {
-            tempMatrix.AssignInnerProductOf(gradientValues, input0FunctionValues, true);
-            input1GradientValues += tempMatrix;
+                                                         Matrix<ElemType>& input1GradientValues,
+                                                         const Matrix<ElemType>& gradientValues,
+                                                         Matrix<ElemType>& tempMatrix)
+    {
+        tempMatrix.AssignInnerProductOf(gradientValues, input0FunctionValues, true); // NOTE(review): 'true' presumably selects a per-column inner product (ColumnElementTimesNode passes false) -- confirm
+        input1GradientValues += tempMatrix; // accumulate, do not overwrite
 
 #if NANCHECK
-            input1GradientValues.HasNan("RowElementTimes");
+        input1GradientValues.HasNan("RowElementTimes"); // the return value is not used here
 #endif
-        }
+    }
     void ForwardPropMap() // TODO: This is a stop-gap; in most cases, we should just be able to delete this (but need to review one by one)
-        {
-            ForwardPropS(Value(), Input(0)->Value(), Input(1)->Value());
-        }
+    { // whole-minibatch forward pass; the only call to it in this class (in ForwardProp()) is commented out
+        ForwardPropS(Value(), Input(0)->Value(), Input(1)->Value());
+    }
 
     virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
-        {
-            //if (fr.IsAllFrames()) { ForwardPropMap(); return; }
-            Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
-            Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
-            Matrix<ElemType> sliceOutputValue = ValueFor(fr);
+    { // computes this node's value for frame range 'fr' via ForwardPropS()
+        // disabled shortcut: if (fr.IsAllFrames()) { ForwardPropMap(); return; }
+        Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
+        Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr); // both inputs are sliced by 'fr'
+        Matrix<ElemType> sliceOutputValue = ValueFor(fr);
 
-            ForwardPropS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
-        }
+        ForwardPropS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
+    }
 
     /*TODO: merge with call site*/ void ForwardPropS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0, const Matrix<ElemType>& input1)
-        {
-            functionValues.SetValue(input0);
-            functionValues.RowElementMultiplyWith(input1);
+    { // functionValues = input0, then multiplied in place via RowElementMultiplyWith(input1)
+        functionValues.SetValue(input0);
+        functionValues.RowElementMultiplyWith(input1);
 
 #if NANCHECK
-            functionValues.HasNan("RowElementTimes");
+        functionValues.HasNan("RowElementTimes"); // the return value is not used here
 #endif
-        }
+    }
 
     virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
-        {
-            Base::Validate(isFinalValidationPass);
-            InferMBLayoutFromInputsForStandardCase();
+    { // checks the operand shapes (final validation pass only) and takes the output dimensions from input 0
+        Base::Validate(isFinalValidationPass);
+        InferMBLayoutFromInputsForStandardCase();
 
-            size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols();
+        size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols(); // rows0 is not checked; it is only referenced below as a bare statement
         size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols();
         rows0;
-            if (isFinalValidationPass && cols0 != cols1 || rows1 != 1)
-                LogicError("RowElementTimes: Either the second operand is not a row vector or the number of columns of operands does not match.");
+        if (isFinalValidationPass && (cols0 != cols1 || rows1 != 1)) // parenthesized like ColumnElementTimesNode: '&&' binds tighter than '||', so the rows1 check used to fire in non-final passes too
+            LogicError("RowElementTimes: Either the second operand is not a row vector or the number of columns of operands does not match.");
 
-            SetDims(Input(0));
-        }
+        SetDims(Input(0)); // dimensions are taken from the left (matrix) input
+    }
 
-        //request matrices that are needed for gradient computation
-        virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
-        {
-            Base::RequestMatricesBeforeBackprop(matrixPool);
-            RequestMatrixFromPool(m_tempMatrix, matrixPool);
-        }
+    // request the matrices that are needed for gradient computation: the base class's, plus m_tempMatrix
+    virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
+    {
+        Base::RequestMatricesBeforeBackprop(matrixPool);
+        RequestMatrixFromPool(m_tempMatrix, matrixPool); // scratch matrix handed to BackpropToLeftS()/BackpropToRightS()
+    }
 
-        //release gradient and temp matrices that no longer needed after all the children's gradients are computed.
-        virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
-        {
-            Base::ReleaseMatricesAfterBackprop(matrixPool);
-            ReleaseMatrixToPool(m_tempMatrix, matrixPool);
-        }
+    // release the gradient and temp matrices that are no longer needed after all the children's gradients are computed.
+    virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
+    {
+        Base::ReleaseMatricesAfterBackprop(matrixPool);
+        ReleaseMatrixToPool(m_tempMatrix, matrixPool); // counterpart of the request in RequestMatricesBeforeBackprop()
+    }
 
-    private:
-        shared_ptr<Matrix<ElemType>> m_tempMatrix;
-    };
+private:
+    shared_ptr<Matrix<ElemType>> m_tempMatrix;
+};
 
-    template class RowElementTimesNode<float>;
-    template class RowElementTimesNode<double>;
+template class RowElementTimesNode<float>;
+template class RowElementTimesNode<double>;
 
-    // -----------------------------------------------------------------------
-    // ColumnElementTimesNode (left, right)  --TODO: what are left and right?
-    //
-    // TODO: This is subsumed by ElementTimes with tensor lib.
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// ColumnElementTimesNode (left, right)  -- left: matrix; right: column vector with as many rows as left (see Validate())
+//
+// TODO: This is subsumed by ElementTimes with tensor lib.
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    class ColumnElementTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
-    {
+class ColumnElementTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
+{
     typedef ComputationNode<ElemType> Base;
     UsingComputationNodeMembersBoilerplate;
     static const std::wstring TypeName()
@@ -584,159 +584,159 @@ template <class ElemType>
         return L"ColumnElementTimes";
     }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(ColumnElementTimesNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(ColumnElementTimesNode);
     ColumnElementTimesNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name)
     {
     }
 
-        void BackpropToMap(const size_t inputIndex)
-        {
-            if (inputIndex > 1)
-                InvalidArgument("ColumnElementTimes operation only takes two inputs.");
+    void BackpropToMap(const size_t inputIndex) // whole-minibatch backprop; BackpropTo() calls this when fr.IsAllFrames()
+    {
+        if (inputIndex > 1)
+            InvalidArgument("ColumnElementTimes operation only takes two inputs.");
 
-            if (inputIndex == 0)
-            {
-                BackpropToLeftS(Input(1)->Value(), Input(0)->Gradient(), Gradient(), *m_tempMatrix);
-            }
-            else
-            {
-                BackpropToRightS(Input(0)->Value(), Input(1)->Gradient(), Gradient(), *m_tempMatrix);
-            }
+        if (inputIndex == 0) // left input: needs the right input's value
+        {
+            BackpropToLeftS(Input(1)->Value(), Input(0)->Gradient(), Gradient(), *m_tempMatrix);
+        }
+        else // right input: needs the left input's value
+        {
+            BackpropToRightS(Input(0)->Value(), Input(1)->Gradient(), Gradient(), *m_tempMatrix);
         }
+    }
 
     virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
-        {
+    { // accumulates into the gradient of input 'inputIndex'; the all-frames case is delegated to BackpropToMap()
         if (fr.IsAllFrames())
         {
             BackpropToMap(inputIndex);
             return;
         } // TODO: remove these one by one
-            Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
+        Matrix<ElemType> sliceOutputGrad = GradientFor(fr); // this node's own gradient
 
-            if (inputIndex == 0)
-            {
-                Matrix<ElemType> sliceInput0Grad = Input(0)->GradientFor(fr);
+        if (inputIndex == 0)
+        {
+            Matrix<ElemType> sliceInput0Grad = Input(0)->GradientFor(fr);
 
-                BackpropToLeftS(Input(1)->Value(), sliceInput0Grad, sliceOutputGrad, *m_tempMatrix);
-            }
-            else
-            {
-                Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
-                BackpropToRightS(sliceInput0Value, Input(1)->Gradient(), sliceOutputGrad, *m_tempMatrix);
-            }
+            BackpropToLeftS(Input(1)->Value(), sliceInput0Grad, sliceOutputGrad, *m_tempMatrix); // input 1 is passed whole (Value()), not sliced by 'fr'
         }
-
-        virtual bool OutputUsedInComputingInputNodesGradients() const override
+        else
         {
-            // The ColumnElementTimesNode does not require its output value for computing
-            // the gradients of its input nodes
-            return false;
+            Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
+            BackpropToRightS(sliceInput0Value, Input(1)->Gradient(), sliceOutputGrad, *m_tempMatrix); // input 1's gradient is passed whole (Gradient()), not sliced by 'fr'
         }
+    }
 
-        //left (input 0) is a matrix
+    virtual bool OutputUsedInComputingInputNodesGradients() const override
+    {
+        // BackpropTo()/BackpropToMap() of this node pass only the inputs' values and gradients
+        // to the helpers, never this node's own output value, hence false.
+        return false;
+    }
+
+    // Gradient for the left input (input 0, a matrix): input0GradientValues += gradientValues after ColumnElementMultiplyWith(input 1's value)
     /*TODO: merge with call site*/ void BackpropToLeftS(Matrix<ElemType>& input1FunctionValues,
-            Matrix<ElemType>& input0GradientValues,
-            const Matrix<ElemType>& gradientValues,
-            Matrix<ElemType>& tempMatrix)
-        {
-            tempMatrix.SetValue(gradientValues);
-            tempMatrix.ColumnElementMultiplyWith(input1FunctionValues);
-            input0GradientValues += tempMatrix;
+                                                        Matrix<ElemType>& input0GradientValues,
+                                                        const Matrix<ElemType>& gradientValues,
+                                                        Matrix<ElemType>& tempMatrix)
+    {
+        tempMatrix.SetValue(gradientValues); // tempMatrix is scratch space; its previous content is overwritten
+        tempMatrix.ColumnElementMultiplyWith(input1FunctionValues);
+        input0GradientValues += tempMatrix; // accumulate, do not overwrite
 
 #if NANCHECK
-            input0GradientValues.HasNan("ColumnElementTimes");
+        input0GradientValues.HasNan("ColumnElementTimes"); // the return value is not used here
 #endif
-        }
+    }
 
-        //right (input 1) is a col vector
+    // Gradient for the right input (input 1, a column vector): input1GradientValues += AssignInnerProductOf(gradientValues, input 0's value, false)
     /*TODO: merge with call site*/ void BackpropToRightS(Matrix<ElemType>& input0FunctionValues,
-            Matrix<ElemType>& input1GradientValues,
-            const Matrix<ElemType>& gradientValues,
-            Matrix<ElemType>& tempMatrix)
-        {
-            tempMatrix.AssignInnerProductOf(gradientValues, input0FunctionValues, false);
-            input1GradientValues += tempMatrix;
+                                                         Matrix<ElemType>& input1GradientValues,
+                                                         const Matrix<ElemType>& gradientValues,
+                                                         Matrix<ElemType>& tempMatrix)
+    {
+        tempMatrix.AssignInnerProductOf(gradientValues, input0FunctionValues, false); // NOTE(review): 'false' presumably selects a per-row inner product (RowElementTimesNode passes true) -- confirm
+        input1GradientValues += tempMatrix; // accumulate, do not overwrite
 
 #if NANCHECK
-            input1GradientValues.HasNan("ColumnElementTimes");
+        input1GradientValues.HasNan("ColumnElementTimes"); // the return value is not used here
 #endif
-        }
+    }
     void ForwardPropMap() // TODO: This is a stop-gap; in most cases, we should just be able to delete this (but need to review one by one)
-        {
-            ForwardPropS(Value(), Input(0)->Value(), Input(1)->Value());
-        }
+    { // whole-minibatch forward pass; the only call to it in this class (in ForwardProp()) is commented out
+        ForwardPropS(Value(), Input(0)->Value(), Input(1)->Value());
+    }
 
     virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
-        {
-            //if (fr.IsAllFrames()) { ForwardPropMap(); return; }
-            Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
-            Matrix<ElemType> sliceOutputValue = ValueFor(fr);
+    { // computes this node's value for frame range 'fr' via ForwardPropS()
+        // disabled shortcut: if (fr.IsAllFrames()) { ForwardPropMap(); return; }
+        Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
+        Matrix<ElemType> sliceOutputValue = ValueFor(fr);
 
-            ForwardPropS(sliceOutputValue, sliceInput0Value, Input(1)->Value());
-        }
+        ForwardPropS(sliceOutputValue, sliceInput0Value, Input(1)->Value()); // only input 0 is sliced by 'fr'; input 1 is passed whole
+    }
 
     /*TODO: merge with call site*/ void ForwardPropS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0, const Matrix<ElemType>& input1)
-        {
-            functionValues.SetValue(input0);
-            functionValues.ColumnElementMultiplyWith(input1);
+    { // functionValues = input0, then multiplied in place via ColumnElementMultiplyWith(input1)
+        functionValues.SetValue(input0);
+        functionValues.ColumnElementMultiplyWith(input1);
 
 #if NANCHECK
-            functionValues.HasNan("ColumnElementTimes");
+        functionValues.HasNan("ColumnElementTimes"); // the return value is not used here
 #endif
-        }
+    }
 
     virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
-        {
-            Base::Validate(isFinalValidationPass);
-            InferMBLayoutFromInputsForStandardCase();
+    { // infers missing input dimensions, checks the operand shapes (final pass only), takes the output dimensions from input 0
+        Base::Validate(isFinalValidationPass);
+        InferMBLayoutFromInputsForStandardCase();
 
-            //derive number of rows if possible
-            for (size_t index = 0; index < 2; index++)
-            {
-                size_t rows = Input(index)->GetNumRows() == 0 ? Input(1 - index)->GetNumRows() : Input(index)->GetNumRows();
-                size_t cols = Input(index)->GetNumCols() == 0 ? Input(1 - index)->GetNumCols() : Input(index)->GetNumCols();
-                ValidateInferInputDimsFrom(index, rows, cols);
-            }
+        // a dimension (rows or columns) that is still 0 on one input is taken from the other input
+        for (size_t index = 0; index < 2; index++)
+        {
+            size_t rows = Input(index)->GetNumRows() == 0 ? Input(1 - index)->GetNumRows() : Input(index)->GetNumRows();
+            size_t cols = Input(index)->GetNumCols() == 0 ? Input(1 - index)->GetNumCols() : Input(index)->GetNumCols(); // NOTE(review): for input 1 this copies input 0's column count, yet cols1 must be 1 below -- confirm intent
+            ValidateInferInputDimsFrom(index, rows, cols);
+        }
 
-            size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols();
+        size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols(); // cols0 is not checked; it is only referenced below as a bare statement
         size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols();
         cols0;
-            if (isFinalValidationPass && (rows0 != rows1 || cols1 != 1))
-                LogicError("ColumnElementTimes: Either the second operand is not a column vector or the number of rows of operands does not match.");
+        if (isFinalValidationPass && (rows0 != rows1 || cols1 != 1)) // shape errors are only reported in the final validation pass
+            LogicError("ColumnElementTimes: Either the second operand is not a column vector or the number of rows of operands does not match.");
 
-            SetDims(Input(0));
-        }
+        SetDims(Input(0)); // dimensions are taken from the left (matrix) input
+    }
 
-        //request matrices that are needed for gradient computation
-        virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
-        {
-            Base::RequestMatricesBeforeBackprop(matrixPool);
-            RequestMatrixFromPool(m_tempMatrix, matrixPool);
-        }
+    // request the matrices that are needed for gradient computation: the base class's, plus m_tempMatrix
+    virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
+    {
+        Base::RequestMatricesBeforeBackprop(matrixPool);
+        RequestMatrixFromPool(m_tempMatrix, matrixPool); // scratch matrix handed to BackpropToLeftS()/BackpropToRightS()
+    }
 
-        //release gradient and temp matrices that no longer needed after all the children's gradients are computed.
-        virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
-        {
-            Base::ReleaseMatricesAfterBackprop(matrixPool);
-            ReleaseMatrixToPool(m_tempMatrix, matrixPool);
-        }
+    // release the gradient and temp matrices that are no longer needed after all the children's gradients are computed.
+    virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
+    {
+        Base::ReleaseMatricesAfterBackprop(matrixPool);
+        ReleaseMatrixToPool(m_tempMatrix, matrixPool); // counterpart of the request in RequestMatricesBeforeBackprop()
+    }
 
-    private:
-        shared_ptr<Matrix<ElemType>> m_tempMatrix;
-    };
+private:
+    shared_ptr<Matrix<ElemType>> m_tempMatrix;
+};
 
-    template class ColumnElementTimesNode<float>;
-    template class ColumnElementTimesNode<double>;
+template class ColumnElementTimesNode<float>;
+template class ColumnElementTimesNode<double>;
 
-    // -----------------------------------------------------------------------
-    // RectifiedLinearNode (input) -- ReLU non-linearity
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// RectifiedLinearNode (input) -- ReLU non-linearity
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    class RectifiedLinearNode : public SoftmaxNodeBase<ElemType>
-    {
+class RectifiedLinearNode : public SoftmaxNodeBase<ElemType>
+{
     typedef SoftmaxNodeBase<ElemType> Base;
     UsingSoftmaxNodeBaseMembers;
     static const std::wstring TypeName()
@@ -744,51 +744,51 @@ template <class ElemType>
         return L"RectifiedLinear";
     }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(RectifiedLinearNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(RectifiedLinearNode);
     RectifiedLinearNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name)
     {
     }
 
-        void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues) override
-        {
-            gradient.AssignLinearRectifierDerivativeOf(inputFunctionValues);
+    void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues) override
+    {
+        gradient.AssignLinearRectifierDerivativeOf(inputFunctionValues);
 #if DUMPOUTPUT
-            inputGradientValues.Print("RecitifiedLinearNode-Partial-in");
+        inputGradientValues.Print("RecitifiedLinearNode-Partial-in");
 #endif
-            inputGradientValues.AddElementProductOf(gradientValues, gradient);
+        inputGradientValues.AddElementProductOf(gradientValues, gradient);
 #if DUMPOUTPUT
-            inputGradientValues.Print("RecitifiedLinearNode-Partial-out");
+        inputGradientValues.Print("RecitifiedLinearNode-Partial-out");
 #endif
-        }
+    }
 
-        virtual bool OutputUsedInComputingInputNodesGradients() const override
-        {
-            // The ReLU node does not require its output value for computing
-            // the gradients of its input nodes
-            return false;
-        }
+    virtual bool OutputUsedInComputingInputNodesGradients() const override
+    {
+        // The ReLU node does not require its output value for computing
+        // the gradients of its input nodes
+        return false;
+    }
 
-        void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
-        {
-            functionValues.AssignTruncateBottomOf(inputFunctionValues, 0);
+    void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
+    {
+        functionValues.AssignTruncateBottomOf(inputFunctionValues, 0);
 #if DUMPOUTPUT
-            functionValues.Print("RectifiedLinearNode");
+        functionValues.Print("RectifiedLinearNode");
 #endif
-        }
-    };
+    }
+};
 
-    template class RectifiedLinearNode<float>;
-    template class RectifiedLinearNode<double>;
+template class RectifiedLinearNode<float>;
+template class RectifiedLinearNode<double>;
 
-    // -----------------------------------------------------------------------
-    // SigmoidNode (input) -- sigmoid non-linearity
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// SigmoidNode (input) -- sigmoid non-linearity
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    class SigmoidNode : public SoftmaxNodeBase<ElemType>
-    {
+class SigmoidNode : public SoftmaxNodeBase<ElemType>
+{
     typedef SoftmaxNodeBase<ElemType> Base;
     UsingSoftmaxNodeBaseMembers;
     static const std::wstring TypeName()
@@ -796,43 +796,43 @@ template <class ElemType>
         return L"Sigmoid";
     }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(SigmoidNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(SigmoidNode);
     SigmoidNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name)
     {
     }
 
-        virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
-        {
-            // The Sigmoid node does not require any of it's input's values for computing
-            // the gradients of its input nodes
-            UNREFERENCED_PARAMETER(childIndex);
-            return false;
-        }
+    virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
+    {
+        // The Sigmoid node does not require any of its inputs' values for computing
+        // the gradients of its input nodes
+        UNREFERENCED_PARAMETER(childIndex);
+        return false;
+    }
 
-        /*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
-        {
-            gradient.AssignSigmoidDerivativeOf(functionValues);
-            inputGradientValues.AddElementProductOf(gradientValues, gradient);
-        }
+    /*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
+    {
+        gradient.AssignSigmoidDerivativeOf(functionValues);
+        inputGradientValues.AddElementProductOf(gradientValues, gradient);
+    }
 
-        /*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
-        {
-            functionValues.AssignSigmoidOf(inputFunctionValues);
-        }
-    };
+    /*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
+    {
+        functionValues.AssignSigmoidOf(inputFunctionValues);
+    }
+};
 
-    template class SigmoidNode<float>;
-    template class SigmoidNode<double>;
+template class SigmoidNode<float>;
+template class SigmoidNode<double>;
 
-    // -----------------------------------------------------------------------
-    // TanhNode (input) -- tanh non-linearity
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// TanhNode (input) -- tanh non-linearity
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    class TanhNode : public SoftmaxNodeBase<ElemType>
-    {
+class TanhNode : public SoftmaxNodeBase<ElemType>
+{
     typedef SoftmaxNodeBase<ElemType> Base;
     UsingSoftmaxNodeBaseMembers;
     static const std::wstring TypeName()
@@ -840,45 +840,45 @@ template <class ElemType>
         return L"Tanh";
     }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(TanhNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(TanhNode);
     TanhNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name)
     {
     }
 
-        virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
-        {
-            // The plus node does not require any of it's input's values for computing
-            // the gradients of its input nodes
-            UNREFERENCED_PARAMETER(childIndex);
-            return false;
-        }
+    virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
+    {
+        // The Tanh node does not require any of its inputs' values for computing
+        // the gradients of its input nodes
+        UNREFERENCED_PARAMETER(childIndex);
+        return false;
+    }
 
-        /*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
-        {
-            gradient.AssignElementProductOf(functionValues, functionValues); // v .* v
+    /*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
+    {
+        gradient.AssignElementProductOf(functionValues, functionValues); // v .* v
         gradient.AssignDifferenceOf(1, gradient);                        // 1-v^2
 
-            inputGradientValues.AddElementProductOf(gradientValues, gradient); // += d .* ((1-v) .* v))
-        }
+        inputGradientValues.AddElementProductOf(gradientValues, gradient); // += d .* (1 - v .* v)
+    }
 
-        /*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
-        {
-            functionValues.AssignTanhOf(inputFunctionValues);
-        }
-    };
+    /*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
+    {
+        functionValues.AssignTanhOf(inputFunctionValues);
+    }
+};
 
-    template class TanhNode<float>;
-    template class TanhNode<double>;
+template class TanhNode<float>;
+template class TanhNode<double>;
 
-    // -----------------------------------------------------------------------
-    // LogNode (input) -- component-wise log() of input
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// LogNode (input) -- component-wise log() of input
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    class LogNode : public SoftmaxNodeBase<ElemType>
-    {
+class LogNode : public SoftmaxNodeBase<ElemType>
+{
     typedef SoftmaxNodeBase<ElemType> Base;
     UsingSoftmaxNodeBaseMembers;
     static const std::wstring TypeName()
@@ -886,44 +886,44 @@ template <class ElemType>
         return L"Log";
     }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(LogNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(LogNode);
     LogNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name)
     {
     }
 
-        virtual bool OutputUsedInComputingInputNodesGradients() const override
-        {
-            // The plus node does not require its output value for computing
-            // the gradients of its input nodes
-            return false;
-        }
+    virtual bool OutputUsedInComputingInputNodesGradients() const override
+    {
+        // The Log node does not require its output value for computing
+        // the gradients of its input nodes
+        return false;
+    }
 
-        /*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
-        {
-            gradient.AssignElementInverseOf(inputFunctionValues); // 1/x (x is input to log(x))
-            inputGradientValues.AddElementProductOf(gradientValues, gradient);
-            // TODO: with tensor lib:
-            //inputGradientValues.AddElementDivisionOf(gradientValues, inputFunctionValues); // 1/x (x is input to log(x))
-        }
+    /*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
+    {
+        gradient.AssignElementInverseOf(inputFunctionValues); // 1/x (x is input to log(x))
+        inputGradientValues.AddElementProductOf(gradientValues, gradient);
+        // TODO: with tensor lib:
+        //inputGradientValues.AddElementDivisionOf(gradientValues, inputFunctionValues); // 1/x (x is input to log(x))
+    }
 
-        /*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
-        {
-            functionValues.AssignLogOf(inputFunctionValues);
-        }
-    };
+    /*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
+    {
+        functionValues.AssignLogOf(inputFunctionValues);
+    }
+};
 
-    template class LogNode<float>;
-    template class LogNode<double>;
+template class LogNode<float>;
+template class LogNode<double>;
 
-    // -----------------------------------------------------------------------
-    // ExpNode (input) -- component-wise exp() of input
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// ExpNode (input) -- component-wise exp() of input
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    class ExpNode : public SoftmaxNodeBase<ElemType>
-    {
+class ExpNode : public SoftmaxNodeBase<ElemType>
+{
     typedef SoftmaxNodeBase<ElemType> Base;
     UsingSoftmaxNodeBaseMembers;
     static const std::wstring TypeName()
@@ -931,57 +931,57 @@ template <class ElemType>
         return L"Exp";
     }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(ExpNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(ExpNode);
     ExpNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name)
     {
     }
 
     virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
-        {
+    {
         assert(inputIndex == 0);
         inputIndex;
 
-            Matrix<ElemType> sliceInputGrad = Input(0)->GradientFor(fr);
-            Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
-            Matrix<ElemType> sliceInputValue = Input(0)->ValueFor(fr);
+        Matrix<ElemType> sliceInputGrad = Input(0)->GradientFor(fr);
+        Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
+        Matrix<ElemType> sliceInputValue = Input(0)->ValueFor(fr);
 
-            m_gradientTemp->AssignExpOf(sliceInputValue); // Exp(x) is its own partial
-            sliceInputGrad.AddElementProductOf(sliceOutputGrad, *m_gradientTemp);
-            // TODO: with tensor lib:
-            // sliceInputGrad.AddElementProductOf(sliceOutputGrad, functionValues);
-            // and set OutputUsed
-        }
+        m_gradientTemp->AssignExpOf(sliceInputValue); // Exp(x) is its own partial
+        sliceInputGrad.AddElementProductOf(sliceOutputGrad, *m_gradientTemp);
+        // TODO: with tensor lib:
+        // sliceInputGrad.AddElementProductOf(sliceOutputGrad, functionValues);
+        // and set OutputUsed
+    }
 
-        virtual bool OutputUsedInComputingInputNodesGradients() const override
-        {
-            // The ExpNode does not require its output value for computing
-            // the gradients of its input nodes
-            return false;
-        }
+    virtual bool OutputUsedInComputingInputNodesGradients() const override
+    {
+        // The ExpNode does not require its output value for computing
+        // the gradients of its input nodes
+        return false;
+    }
 
     virtual void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues) override
     {
         NOT_IMPLEMENTED;
     } // not needed
 
-        void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
-        {
-            functionValues.AssignExpOf(inputFunctionValues);
-        }
-    };
+    void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
+    {
+        functionValues.AssignExpOf(inputFunctionValues);
+    }
+};
 
-    template class ExpNode<float>;
-    template class ExpNode<double>;
+template class ExpNode<float>;
+template class ExpNode<double>;
 
-    // -----------------------------------------------------------------------
-    // CosineNode (input) -- component-wise cos() of input
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// CosineNode (input) -- component-wise cos() of input
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    class CosineNode : public SoftmaxNodeBase<ElemType>
-    {
+class CosineNode : public SoftmaxNodeBase<ElemType>
+{
     typedef SoftmaxNodeBase<ElemType> Base;
     UsingSoftmaxNodeBaseMembers;
     static const std::wstring TypeName()
@@ -989,54 +989,54 @@ template <class ElemType>
         return L"Cosine";
     }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(CosineNode);
-    CosineNode(DEVICEID_TYPE deviceId, const wstring& name)
-        : Base(deviceId, name)
+public:
+    DeclareConstructorFromConfigWithNumInputs(CosineNode);
+    CosineNode(DEVICEID_TYPE deviceId, const wstring& name)
+        : Base(deviceId, name)
+    {
+    }
+
+    virtual bool OutputUsedInComputingInputNodesGradients() const override
+    {
+        // The CosineNode does not require its output value for computing
+        // the gradients of its input nodes
+        return false;
+    }
+
+    /*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
     {
+        gradient.AssignNegativeSineOf(inputFunctionValues); // -sin(x) (x is input to Cosine(x))
+        inputGradientValues.AddElementProductOf(gradientValues, gradient);
+        // TODO: tensor lib: make a joint kernel, since neg sin is never used for anything else
     }
 
-        virtual bool OutputUsedInComputingInputNodesGradients() const override
-        {
-            // The CosineNode does not require its output value for computing
-            // the gradients of its input nodes
-            return false;
-        }
-
-        /*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues)
-        {
-            gradient.AssignNegativeSineOf(inputFunctionValues); // -sin(x) (x is input to Cosine(x))
-            inputGradientValues.AddElementProductOf(gradientValues, gradient);
-            // TODO: tensor lib: make a joint kernel, since neg sin is never used for anything else
-        }
-
-        /*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
-        {
-            functionValues.AssignCosineOf(inputFunctionValues);
-        }
-    };
+    /*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
+    {
+        functionValues.AssignCosineOf(inputFunctionValues);
+    }
+};
 
-    template class CosineNode<float>;
-    template class CosineNode<double>;
+template class CosineNode<float>;
+template class CosineNode<double>;
 #endif
 
-    // -----------------------------------------------------------------------
-    /// DummyCriterionNode (objectives, derivatives, prediction)
-    // -----------------------------------------------------------------------
-
-    // This training criterion node needs derivatives and objectives to be
-    // computed out of the node. Derivatives and objectives will be fed to the
-    // node as input features. It has 3 inputs:
-    // 1. feature node that feeds objectives
-    // 2. feature node that feeds derivatives
-    // 3. neural network output
-    //
-    // This node is useful in sequence training for speech recognition, so that
-    // we can separate lattice computation (which may rely other softwares, such
-    // as Kaldi) with the neural network training.
+// -----------------------------------------------------------------------
+/// DummyCriterionNode (objectives, derivatives, prediction)
+// -----------------------------------------------------------------------
+
+// This training criterion node needs derivatives and objectives to be
+// computed outside of the node. Derivatives and objectives will be fed to the
+// node as input features. It has 3 inputs:
+// 1. feature node that feeds objectives
+// 2. feature node that feeds derivatives
+// 3. neural network output
+//
+// This node is useful in sequence training for speech recognition, so that
+// we can separate lattice computation (which may rely on other software, such
+// as Kaldi) from the neural network training.
 template <class ElemType>
 class DummyCriterionNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>, public NumInputs<3>
-    {
+{
     typedef ComputationNodeNonLooping<ElemType> Base;
     UsingComputationNodeMembersBoilerplate;
     static const std::wstring TypeName()
@@ -1044,27 +1044,27 @@ class DummyCriterionNode : public ComputationNodeNonLooping /*ComputationNode*/<
         return L"DummyCriterion";
     }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(DummyCriterionNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(DummyCriterionNode);
     DummyCriterionNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name)
     {
     }
 
-        virtual void BackpropToNonLooping(size_t inputIndex) override
-        {
-            FrameRange fr(Input(0)->GetMBLayout());
-            if (inputIndex == 0)
-                LogicError("DummyCriterionNode: derivatives with respect to objective features are not necessary, not implemented yet.\n");
-            else if (inputIndex == 1)
-                LogicError("DummyCriterionNode: derivatives with respect to derivative features are not necessary, not implemented yet.\n");
-            else if (inputIndex == 2)
-            {
-                auto gradient = Input(2)->GradientFor(fr);
-                //Matrix<ElemType>::ScaleAndAdd(Gradient().Get00Element(), Input(1)->ValueFor(fr), gradient);
+    virtual void BackpropToNonLooping(size_t inputIndex) override
+    {
+        FrameRange fr(Input(0)->GetMBLayout());
+        if (inputIndex == 0)
+            LogicError("DummyCriterionNode: derivatives with respect to objective features are not necessary, not implemented yet.\n");
+        else if (inputIndex == 1)
+            LogicError("DummyCriterionNode: derivatives with respect to derivative features are not necessary, not implemented yet.\n");
+        else if (inputIndex == 2)
+        {
+            auto gradient = Input(2)->GradientFor(fr);
+            //Matrix<ElemType>::ScaleAndAdd(Gradient().Get00Element(), Input(1)->ValueFor(fr), gradient);
             Matrix<ElemType>::Multiply1x1AndWeightedAdd(+1.0f, Gradient() /*1x1*/, Input(1)->ValueFor(fr), 1.0f, gradient);
-            }
         }
+    }
 
     virtual bool OutputUsedInComputingInputNodesGradients() const override
     {
@@ -1072,55 +1072,55 @@ class DummyCriterionNode : public ComputationNodeNonLooping /*ComputationNode*/<
     }
 
     virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
-        {
-            Value().VerifySize(1, 1);
-            Input(0)->Value().VerifySize(1, 1);
-            Value().SetValue(Input(0)->Value());
+    {
+        Value().VerifySize(1, 1);
+        Input(0)->Value().VerifySize(1, 1);
+        Value().SetValue(Input(0)->Value());
 #if NANCHECK
-            Value().HasNan("DummyCriterionNode");
+        Value().HasNan("DummyCriterionNode");
 #endif
-        }
+    }
 
     virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
-        {
-            Base::Validate(isFinalValidationPass);
+    {
+        Base::Validate(isFinalValidationPass);
         m_pMBLayout = nullptr; // this node does not hold mini-batch data
 
-            if (Input(0)->OperationName() != L"InputValue")
-                LogicError("DummyCriterionNode criterion requires the first input to be computed objectives.");
-            if (Input(0)->OperationName() != L"InputValue")
-                LogicError("DummyCriterionNode criterion requires the first input to be computed derivatives.");
-            if (isFinalValidationPass)
-            {
-                if (Input(0)->GetSampleMatrixNumRows() != 1)
-                    LogicError("DummyCriterionNode criterion requires the first input to have dimension 1.");
-                if (Input(0)->GetSampleMatrixNumRows() == 0 || Input(1)->GetSampleMatrixNumRows() == 0 || Input(2)->GetSampleMatrixNumRows() == 0)
-                    LogicError("DummyCriterionNode operation: one of the operands has 0 elements.");
-                if (Input(1)->GetSampleMatrixNumRows() != Input(2)->GetSampleMatrixNumRows())
+        if (Input(0)->OperationName() != L"InputValue")
+            LogicError("DummyCriterionNode criterion requires the first input to be computed objectives.");
+        if (Input(0)->OperationName() != L"InputValue")
+            LogicError("DummyCriterionNode criterion requires the first input to be computed derivatives.");
+        if (isFinalValidationPass)
+        {
+            if (Input(0)->GetSampleMatrixNumRows() != 1)
+                LogicError("DummyCriterionNode criterion requires the first input to have dimension 1.");
+            if (Input(0)->GetSampleMatrixNumRows() == 0 || Input(1)->GetSampleMatrixNumRows() == 0 || Input(2)->GetSampleMatrixNumRows() == 0)
+                LogicError("DummyCriterionNode operation: one of the operands has 0 elements.");
+            if (Input(1)->GetSampleMatrixNumRows() != Input(2)->GetSampleMatrixNumRows())
                 LogicError("The Matrix dimension in the DummyCriterionNode operation does not match.");
-            }
-
-            SetDims(TensorShape(1), false);
         }
-    };
 
-    template class DummyCriterionNode<float>; 
-    template class DummyCriterionNode<double>;
-
-    // -----------------------------------------------------------------------
-    // SequenceDecoderNode (label, position_dependent_score, transition_score)
-    // this node does sequence decoding only
-    // it corresponds to a decoder
-    //  - label : output label vector of [0:T-1]
-    //  - position_dependent_score : score from position dependent node,
-    //    in the R-CRF case, it is the RNN output score before softmax
-    //  - transition score : score from the transition node, 
-    //    in the R-CRF case, it is the transition probability between labels
-    // -----------------------------------------------------------------------
+        SetDims(TensorShape(1), false);
+    }
+};
+
+template class DummyCriterionNode<float>;
+template class DummyCriterionNode<double>;
+
+// -----------------------------------------------------------------------
+// SequenceDecoderNode (label, position_dependent_score, transition_score)
+// this node does sequence decoding only
+// it corresponds to a decoder
+//  - label : output label vector of [0:T-1]
+//  - position_dependent_score : score from position dependent node,
+//    in the R-CRF case, it is the RNN output score before softmax
+//  - transition score : score from the transition node,
+//    in the R-CRF case, it is the transition probability between labels
+// -----------------------------------------------------------------------
 
 template <class ElemType>
 class SequenceDecoderNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>, public NumInputs<3>
-    {
+{
     typedef ComputationNodeNonLooping<ElemType> Base;
     UsingComputationNodeMembersBoilerplate;
     static const std::wstring TypeName()
@@ -1128,17 +1128,17 @@ class SequenceDecoderNode : public ComputationNodeNonLooping /*ComputationNode*/
         return L"SequenceDecoderNode";
     }
 
-    private:
-        // TODO: member variables go to the end
-        Matrix<ElemType> mAlpha;
-        Matrix<ElemType> mBacktrace;
+private:
+    // TODO: member variables go to the end
+    Matrix<ElemType> mAlpha;
+    Matrix<ElemType> mBacktrace;
 
-        int mStartLab; // the starting output label
-        int mEndLab;   // the ending output label, if avaliable
+    int mStartLab; // the starting output label
+    int mEndLab;   // the ending output label, if available
     ElemType m_default_activity;
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(SequenceDecoderNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(SequenceDecoderNode);
     SequenceDecoderNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name),
           mAlpha(deviceId),
@@ -1149,36 +1149,36 @@ class SequenceDecoderNode : public ComputationNodeNonLooping /*ComputationNode*/
     }
 
     static void DecideStartEndingOutputLab(const Matrix<ElemType>& lbls, int& stt, int& stp)
-        {
-            if (stt != -1 && stp != -1)
-                return; /// have computed before
+    {
+        if (stt != -1 && stp != -1)
+            return; /// have computed before
 
-            int iNumPos = lbls.GetNumCols();
+        int iNumPos = lbls.GetNumCols();
 
-            int firstLbl = -1;
-            for (int ik = 0; ik < lbls.GetNumRows(); ik++)
+        int firstLbl = -1;
+        for (int ik = 0; ik < lbls.GetNumRows(); ik++)
             if (lbls(ik, 0) != 0)
             {
                 firstLbl = ik;
                 break;
-                }
+            }
 
-            int lastLbl = -1;
-            for (int ik = 0; ik < lbls.GetNumRows(); ik++)
+        int lastLbl = -1;
+        for (int ik = 0; ik < lbls.GetNumRows(); ik++)
             if (lbls(ik, iNumPos - 1) != 0)
             {
                 lastLbl = ik;
                 break;
-                }
+            }
 
-            stt = firstLbl;
-            stp = lastLbl;
-        };
+        stt = firstLbl;
+        stp = lastLbl;
+    };
 
     virtual void BackpropToNonLooping(size_t /*inputIndex*/) override //scaled by 2*number of elements in the Matrix<ElemType>
-        {
-            LogicError("SequenceDecoder is used for evaluation only.");
-        }
+    {
+        LogicError("SequenceDecoder is used for evaluation only.");
+    }
 
     virtual bool OutputUsedInComputingInputNodesGradients() const override
     {
@@ -1189,131 +1189,131 @@ class SequenceDecoderNode : public ComputationNodeNonLooping /*ComputationNode*/
         return false;
     }
 
-        /// compute posterior probability of label y at position t
+    /// compute posterior probability of label y at position t
     virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
-        {
-            DecideStartEndingOutputLab(Input(0)->Value(), mStartLab, mEndLab);
-            ForwardPropS(mAlpha, mBacktrace, Value(), Input(1)->Value(),
-                              Input(2)->Value(), mStartLab, mEndLab);
-        }
+    {
+        DecideStartEndingOutputLab(Input(0)->Value(), mStartLab, mEndLab);
+        ForwardPropS(mAlpha, mBacktrace, Value(), Input(1)->Value(),
+                     Input(2)->Value(), mStartLab, mEndLab);
+    }
 
-        // compute forward backward algorithm
-        void ForwardPropS(Matrix<ElemType>& alpha, Matrix<ElemType>& backtrace, Matrix<ElemType>& functionValues, const Matrix<ElemType>& pos_scores, const Matrix<ElemType>& pair_scores, const size_t stt, const size_t stp)
-        {
-            /// to-do, each slice is for one sentence
-            /// to-do, number of slices correspond to number of frames 
-            /// this implementation only supports one sentence per minibatch
-
-            /// change to other values so can support multiple sentences in each minibatch
-            ForwardCompute(alpha, backtrace, pos_scores, pair_scores, stt);
-            BackwardCompute(functionValues, backtrace, stp);
-        };
-
-        /// compute forward backward algorithm
-        static void ForwardCompute(Matrix<ElemType>& alpha,
-            Matrix<ElemType>& backtrace,
-            const Matrix<ElemType>& pos_scores, const Matrix<ElemType>& pair_scores,
-            const size_t stt)
-        {
-            /// to-do, shift more than 1 to support muliple sentences per minibatch
-            int iNumPos = pos_scores.GetNumCols();
-            int iNumLab = pos_scores.GetNumRows();
-            size_t iTmp = 0;
+    // compute forward backward algorithm
+    void ForwardPropS(Matrix<ElemType>& alpha, Matrix<ElemType>& backtrace, Matrix<ElemType>& functionValues, const Matrix<ElemType>& pos_scores, const Matrix<ElemType>& pair_scores, const size_t stt, const size_t stp)
+    {
+        /// to-do, each slice is for one sentence
+        /// to-do, number of slices corresponds to number of frames
+        /// this implementation only supports one sentence per minibatch
 
-            /// need to have 
-            alpha.Resize(iNumLab, iNumPos);
-            backtrace.Resize(iNumLab, iNumPos);
+        /// change to other values so can support multiple sentences in each minibatch
+        ForwardCompute(alpha, backtrace, pos_scores, pair_scores, stt);
+        BackwardCompute(functionValues, backtrace, stp);
+    };
+
+    /// compute forward backward algorithm
+    static void ForwardCompute(Matrix<ElemType>& alpha,
+                               Matrix<ElemType>& backtrace,
+                               const Matrix<ElemType>& pos_scores, const Matrix<ElemType>& pair_scores,
+                               const size_t stt)
+    {
+        /// to-do, shift more than 1 to support multiple sentences per minibatch
+        int iNumPos = pos_scores.GetNumCols();
+        int iNumLab = pos_scores.GetNumRows();
+        size_t iTmp = 0;
+
+        /// need to have
+        alpha.Resize(iNumLab, iNumPos);
+        backtrace.Resize(iNumLab, iNumPos);
 
-            for (int t = 0; t < iNumPos; t++)
+        for (int t = 0; t < iNumPos; t++)
+        {
+            for (int k = 0; k < iNumLab; k++)
             {
-                for (int k = 0; k < iNumLab; k++)
-                {
                 ElemType fTmp = (ElemType) LZERO;
                 if (t > 1)
                 {
-                        for (int j = 0; j < iNumLab; j++)
-                        {
-                            ElemType fAlpha = alpha(j, t - 1) + pair_scores(k, j);
+                    for (int j = 0; j < iNumLab; j++)
+                    {
+                        ElemType fAlpha = alpha(j, t - 1) + pair_scores(k, j);
                         if (fAlpha > fTmp)
                         {
-                                fTmp = fAlpha;
-                                iTmp = j;
-                            }
+                            fTmp = fAlpha;
+                            iTmp = j;
                         }
-                    fTmp += pos_scores(k, t); /// include position dependent score
                     }
-                    else
-                    {
-                        /// with constrain that the first word is labeled as a given symbol
-                        iTmp = stt;
-                        fTmp = 0;
+                    fTmp += pos_scores(k, t); /// include position dependent score
+                }
+                else
+                {
+                    /// with the constraint that the first word is labeled as a given symbol
+                    iTmp = stt;
+                    fTmp = 0;
                     if (t == 1)
                     {
-                            fTmp = alpha(iTmp, t - 1);
-                            fTmp += pair_scores(k, iTmp);
-                            fTmp += pos_scores(k, t);
-                        }
+                        fTmp = alpha(iTmp, t - 1);
+                        fTmp += pair_scores(k, iTmp);
+                        fTmp += pos_scores(k, t);
+                    }
                     else
                     {
                         fTmp = (k == stt) ? pos_scores(k, t) : (ElemType) LZERO;
-                        }
                     }
-                    alpha(k, t) = fTmp;
-                backtrace(k, t) = (ElemType) iTmp;
                 }
+                alpha(k, t) = fTmp;
+                backtrace(k, t) = (ElemType) iTmp;
             }
-        };
-
-        /// compute backward algorithm
-        static void BackwardCompute(
-            Matrix<ElemType>& decodedpath,
-            const Matrix<ElemType>& backtrace, const size_t stp)
-        {
-            int iNumPos = backtrace.GetNumCols();
-            int iNumLab = backtrace.GetNumRows();
+        }
+    };
 
-            decodedpath.Resize(iNumLab, iNumPos);
-            decodedpath.SetValue(0);
+    /// compute backward algorithm
+    static void BackwardCompute(
+        Matrix<ElemType>& decodedpath,
+        const Matrix<ElemType>& backtrace, const size_t stp)
+    {
+        int iNumPos = backtrace.GetNumCols();
+        int iNumLab = backtrace.GetNumRows();
 
-            size_t lastlbl = stp;
-            decodedpath(lastlbl, iNumPos - 1) = 1;
+        decodedpath.Resize(iNumLab, iNumPos);
+        decodedpath.SetValue(0);
 
-            for (int t = iNumPos - 1; t > 0; t--)
-            {
-            lastlbl = (size_t) backtrace(lastlbl, t);
-                decodedpath(lastlbl, t - 1) = 1;
-            }
-        };
+        size_t lastlbl = stp;
+        decodedpath(lastlbl, iNumPos - 1) = 1;
 
-        /// need to feed in pseudo label data, which tells the decoder what is the beginning
-        /// and ending output symbol. these symbols will constrain the search space
-    virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
+        for (int t = iNumPos - 1; t > 0; t--)
         {
-            Base::Validate(isFinalValidationPass);
-            InferMBLayoutFromInputsForStandardCase();
-
-            if (isFinalValidationPass)
-                if (!(Input(1)->GetSampleMatrixNumRows() == Input(2)->GetSampleMatrixNumRows() && // position dependent and pair scores have same number of labels
-                    Input(0)->GetSampleMatrixNumRows() == Input(1)->GetSampleMatrixNumRows() &&
-                    Input(0)->GetSampleMatrixNumCols() == Input(1)->GetSampleMatrixNumCols() && // position dependent and pair scores have the same observation numbers
-                    Input(2)->GetSampleMatrixNumCols() == Input(2)->GetSampleMatrixNumRows()))
-                {
-                    LogicError("The Matrix<ElemType>  dimension in the SequenceDecoderNode operation does not match.");
-                }
-            // BUGBUG: No SetDims()?
-            m_sampleLayout = TensorShape();
+            lastlbl = (size_t) backtrace(lastlbl, t);
+            decodedpath(lastlbl, t - 1) = 1;
         }
     };
 
-    template class SequenceDecoderNode<float>;
-    template class SequenceDecoderNode<double>;
+    /// need to feed in pseudo label data, which tells the decoder what is the beginning
+    /// and ending output symbol. these symbols will constrain the search space
+    virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
+    {
+        Base::Validate(isFinalValidationPass);
+        InferMBLayoutFromInputsForStandardCase();
+
+        if (isFinalValidationPass)
+            if (!(Input(1)->GetSampleMatrixNumRows() == Input(2)->GetSampleMatrixNumRows() && // position dependent and pair scores have same number of labels
+                  Input(0)->GetSampleMatrixNumRows() == Input(1)->GetSampleMatrixNumRows() &&
+                  Input(0)->GetSampleMatrixNumCols() == Input(1)->GetSampleMatrixNumCols() && // position dependent and pair scores have the same observation numbers
+                  Input(2)->GetSampleMatrixNumCols() == Input(2)->GetSampleMatrixNumRows()))
+            {
+                LogicError("The Matrix<ElemType>  dimension in the SequenceDecoderNode operation does not match.");
+            }
+        // BUGBUG: No SetDims()?
+        m_sampleLayout = TensorShape();
+    }
+};
 
-    // -----------------------------------------------------------------------
-    // StrideTimesNode (left, right, stride/*0=row, 1=col*/)
-    // TODO: why is 'stride' an Input and not just an initialization parameter?
-    // -----------------------------------------------------------------------
+template class SequenceDecoderNode<float>;
+template class SequenceDecoderNode<double>;
 
-    /**
+// -----------------------------------------------------------------------
+// StrideTimesNode (left, right, stride/*0=row, 1=col*/)
+// TODO: why is 'stride' an Input and not just an initialization parameter?
+// -----------------------------------------------------------------------
+
+/**
     Has a stride in particular dimensions of left matrix when doing times operation. 
     Example 1: column stride s
     A in d x [s x T1] 
@@ -1332,8 +1332,8 @@ class SequenceDecoderNode : public ComputationNodeNonLooping /*ComputationNode*/
     Notice that s is equal to k. 
     */
 template <class ElemType>
-    class StrideTimesNode : public ComputationNode<ElemType>, public NumInputs<3>
-    {
+class StrideTimesNode : public ComputationNode<ElemType>, public NumInputs<3>
+{
     typedef ComputationNode<ElemType> Base;
     UsingComputationNodeMembersBoilerplate;
     static const std::wstring TypeName()
@@ -1341,22 +1341,22 @@ template <class ElemType>
         return L"StrideTimes";
     }
 
-        size_t m_strideDim; // the dimension index on which stride works 
-        size_t m_stride;    // the stride 
-    private:
-        void UpdateStride(const Matrix<ElemType>& input1) 
-        {
-            m_stride = input1.GetNumCols();
-        }
+    size_t m_strideDim; // the dimension index on which stride works
+    size_t m_stride;    // the stride
+private:
+    void UpdateStride(const Matrix<ElemType>& input1)
+    {
+        m_stride = input1.GetNumCols();
+    }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(StrideTimesNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(StrideTimesNode);
     StrideTimesNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name),
-            m_stride(1)
+          m_stride(1)
     {
     }
-        // BUGBUG: This node needs to serialize and CopyTo m_stride
+    // BUGBUG: This node needs to serialize and CopyTo m_stride
 
     virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
     {
@@ -1365,165 +1365,165 @@ template <class ElemType>
             NOT_IMPLEMENTED;
             return;
         } // TODO: remove these one by one. And why is this not implemented?
-            if (inputIndex > 2)
-                InvalidArgument("StrideTimes operation only takes three inputs.");
-            else if (inputIndex == 2)
+        if (inputIndex > 2)
+            InvalidArgument("StrideTimes operation only takes three inputs.");
+        else if (inputIndex == 2)
             return; // that's a constant
 
-            Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
+        Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
 
-            if (m_strideDim == 1) // column stride
-            {
+        if (m_strideDim == 1) // column stride
+        {
             if (inputIndex == 0) //left derivative
-                {
-                    Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
+            {
+                Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
 
-                    //BackpropToLeft1(sliceInput1Value, Input(0)->Gradient(), sliceOutputGrad);
+                //BackpropToLeft1(sliceInput1Value, Input(0)->Gradient(), sliceOutputGrad);
 
-                    size_t r = Input(0)->GetSampleMatrixNumRows();
-                    size_t T1 = Input(0)->GetSampleMatrixNumCols() / GetNumParallelSequences(); // TODO: if T1 == GetNumTimeSteps() then we can simplify code below.
-                    Matrix<ElemType> mTmp1(r, T1, sliceInput1Value.GetDeviceId());
+                size_t r = Input(0)->GetSampleMatrixNumRows();
+                size_t T1 = Input(0)->GetSampleMatrixNumCols() / GetNumParallelSequences(); // TODO: if T1 == GetNumTimeSteps() then we can simplify code below.
+                Matrix<ElemType> mTmp1(r, T1, sliceInput1Value.GetDeviceId());
 
-                    // process sequence by sequence
-                    for (size_t k = 0; k < GetNumParallelSequences(); k++)
-                    {
-                        mTmp1.SetValue(0);
-                        auto mTmp2 = sliceInput1Value.ColumnSlice(k, 1);
-                        auto mTmp3 = sliceOutputGrad.ColumnSlice(k, 1);
+                // process sequence by sequence
+                for (size_t k = 0; k < GetNumParallelSequences(); k++)
+                {
+                    mTmp1.SetValue(0);
+                    auto mTmp2 = sliceInput1Value.ColumnSlice(k, 1);
+                    auto mTmp3 = sliceOutputGrad.ColumnSlice(k, 1);
 
-                        BackpropToLeft1(mTmp2, mTmp1, mTmp3);
+                    BackpropToLeft1(mTmp2, mTmp1, mTmp3);
 
-                        for (size_t t = 0; t < T1; t++)
-                        {
+                    for (size_t t = 0; t < T1; t++)
+                    {
                         Input(0)->Gradient().ColumnSlice(t * GetNumParallelSequences() + k, 1) += mTmp1.ColumnSlice(t, 1);
-                        }
                     }
                 }
+            }
             else //right derivative
-                {
-                    Matrix<ElemType> sliceInput1Grad = Input(1)->GradientFor(fr);
+            {
+                Matrix<ElemType> sliceInput1Grad = Input(1)->GradientFor(fr);
 
-                    //BackpropToRight(Input(0)->Value(), sliceInput1Grad, sliceOutputGrad);
+                //BackpropToRight(Input(0)->Value(), sliceInput1Grad, sliceOutputGrad);
 
-                    // process sequence by sequence
-                    for (size_t k = 0; k < GetNumParallelSequences(); k++)
+                // process sequence by sequence
+                for (size_t k = 0; k < GetNumParallelSequences(); k++)
+                {
+                    size_t r = Input(0)->GetSampleMatrixNumRows();
+                    size_t T1 = Input(0)->GetSampleMatrixNumCols() / GetNumParallelSequences(); // TODO: if T1 == GetNumTimeSteps() then we can simplify code below.
+                    Matrix<ElemType> mTmp1(r, T1, sliceOutputGrad.GetDeviceId());
+                    for (size_t t = 0; t < T1; t++)
                     {
-                        size_t r = Input(0)->GetSampleMatrixNumRows();
-                        size_t T1 = Input(0)->GetSampleMatrixNumCols() / GetNumParallelSequences(); // TODO: if T1 == GetNumTimeSteps() then we can simplify code below.
-                        Matrix<ElemType> mTmp1(r, T1, sliceOutputGrad.GetDeviceId());
-                        for (size_t t = 0; t < T1; t++)
-                        {
                         mTmp1.ColumnSlice(t, 1).SetValue(Input(0)->Value().ColumnSlice(t * GetNumParallelSequences() + k, 1));
-                        }
-                        auto mTmp2 = sliceInput1Grad.ColumnSlice(k, 1);
-                        auto mTmp3 = sliceOutputGrad.ColumnSlice(k, 1);
-
-                        BackpropToRight(mTmp1, mTmp2, mTmp3);
                     }
+                    auto mTmp2 = sliceInput1Grad.ColumnSlice(k, 1);
+                    auto mTmp3 = sliceOutputGrad.ColumnSlice(k, 1);
+
+                    BackpropToRight(mTmp1, mTmp2, mTmp3);
                 }
             }
-            else if (m_strideDim == 0) // row stride
-            {
+        }
+        else if (m_strideDim == 0) // row stride
+        {
             if (inputIndex == 0) //left derivative
+            {
+                Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
+
+                for (size_t k = 0; k < GetNumParallelSequences(); k++)
                 {
-                    Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
+                    size_t d = Input(1)->GetSampleMatrixNumRows();
+                    size_t T1 = Input(0)->GetSampleMatrixNumRows() / GetNumParallelSequences();
+                    Matrix<ElemType> mTmp1(sliceInput1Value.GetDeviceId());
+                    mTmp1.Resize(d, T1);
+                    Matrix<ElemType> mTmp2 = sliceInput1Value.ColumnSlice(k, 1);
+                    Matrix<ElemType> mTmp3 = sliceOutputGrad.ColumnSlice(k, 1);
+                    BackpropToLeft(mTmp2, mTmp1, mTmp3);
 
-                    for (size_t k = 0; k < GetNumParallelSequences(); k++)
+                    Matrix<ElemType> mTmp4(sliceInput1Value.GetDeviceId());
+                    for (size_t t = 0; t < T1; t++)
                     {
-                        size_t d = Input(1)->GetSampleMatrixNumRows();
-                        size_t T1 = Input(0)->GetSampleMatrixNumRows() / GetNumParallelSequences();
-                        Matrix<ElemType> mTmp1(sliceInput1Value.GetDeviceId());
-                        mTmp1.Resize(d, T1);
-                        Matrix<ElemType> mTmp2 = sliceInput1Value.ColumnSlice(k, 1);
-                        Matrix<ElemType> mTmp3 = sliceOutputGrad.ColumnSlice(k, 1);
-                        BackpropToLeft(mTmp2, mTmp1, mTmp3);
-
-                        Matrix<ElemType> mTmp4(sliceInput1Value.GetDeviceId());
-                        for (size_t t = 0; t < T1; t++)
-                        {
-                            mTmp4 = mTmp1.ColumnSlice(t, 1);
-                            mTmp4.Reshape(1, d);
+                        mTmp4 = mTmp1.ColumnSlice(t, 1);
+                        mTmp4.Reshape(1, d);
                         Input(0)->Gradient().AddToRowSliceValuesOf(mTmp4, t * GetNumParallelSequences() + k, 1);
-                        }
                     }
                 }
+            }
             else //right derivative
-                {
-                    Matrix<ElemType> sliceInput1Grad = Input(1)->GradientFor(fr);
-
-                    for (size_t k = 0; k < GetNumParallelSequences(); k++)
-                    {
-                        size_t d = Input(1)->GetSampleMatrixNumRows();
-                        size_t T1 = Input(0)->GetSampleMatrixNumRows() / GetNumParallelSequences();
+            {
+                Matrix<ElemType> sliceInput1Grad = Input(1)->GradientFor(fr);
 
-                        Matrix<ElemType> mTmp0(sliceOutputGrad.GetDeviceId());
-                        mTmp0.Resize(1, d);
+                for (size_t k = 0; k < GetNumParallelSequences(); k++)
+                {
+                    size_t d = Input(1)->GetSampleMatrixNumRows();
+                    size_t T1 = Input(0)->GetSampleMatrixNumRows() / GetNumParallelSequences();
 
-                        Matrix<ElemType> mTmp1(sliceOutputGrad.GetDeviceId());
-                        mTmp1.Resize(T1, d);
-                        for (size_t t = 0; t < T1; t++)
-                        {
-                            mTmp0.SetValue(0);
-                            mTmp0.AddWithRowSliceValuesOf(Input(0)->Value(), t * GetNumParallelSequences() + k, 1);
-                            mTmp1.AssignToRowSliceValuesOf(mTmp0, t, 1);
-                        }
-                        Matrix<ElemType> mTmp2 = sliceInput1Grad.ColumnSlice(k, 1);
-                        Matrix<ElemType> mTmp3 = sliceOutputGrad.ColumnSlice(k, 1);
+                    Matrix<ElemType> mTmp0(sliceOutputGrad.GetDeviceId());
+                    mTmp0.Resize(1, d);
 
-                        BackpropToRight(mTmp1, mTmp2, mTmp3);
+                    Matrix<ElemType> mTmp1(sliceOutputGrad.GetDeviceId());
+                    mTmp1.Resize(T1, d);
+                    for (size_t t = 0; t < T1; t++)
+                    {
+                        mTmp0.SetValue(0);
+                        mTmp0.AddWithRowSliceValuesOf(Input(0)->Value(), t * GetNumParallelSequences() + k, 1);
+                        mTmp1.AssignToRowSliceValuesOf(mTmp0, t, 1);
                     }
+                    Matrix<ElemType> mTmp2 = sliceInput1Grad.ColumnSlice(k, 1);
+                    Matrix<ElemType> mTmp3 = sliceOutputGrad.ColumnSlice(k, 1);
+
+                    BackpropToRight(mTmp1, mTmp2, mTmp3);
                 }
             }
         }
+    }
 
-        // TODO: the following two functions only differ in the order of argument use in the final MultiplyAndAdd()  --is that intended??
+    // TODO: the following two functions only differ in the order of argument use in the final MultiplyAndAdd()  --is that intended??
     static /*TODO: merge with call site*/ void BackpropToLeft1(const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues)
-        {
+    {
 #if DUMPOUTPUT
-            gradientValues.Print("Gradient-in");
-            inputGradientValues.Print("child Gradient-in/out");
-            inputFunctionValues.Print("child Function values");
+        gradientValues.Print("Gradient-in");
+        inputGradientValues.Print("child Gradient-in/out");
+        inputFunctionValues.Print("child Function values");
 #endif
-            //currently we only support one combination when the input is sparse.
-            if (inputFunctionValues.GetMatrixType() == SPARSE && inputGradientValues.GetMatrixType() == DENSE && gradientValues.GetMatrixType() == DENSE)
-                inputGradientValues.SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol, false);
+        //currently we only support one combination when the input is sparse.
+        if (inputFunctionValues.GetMatrixType() == SPARSE && inputGradientValues.GetMatrixType() == DENSE && gradientValues.GetMatrixType() == DENSE)
+            inputGradientValues.SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol, false);
 
-            Matrix<ElemType>::MultiplyAndAdd(gradientValues, false, inputFunctionValues, true, inputGradientValues);
+        Matrix<ElemType>::MultiplyAndAdd(gradientValues, false, inputFunctionValues, true, inputGradientValues);
 #if DUMPOUTPUT
-            inputGradientValues.Print("child Gradient-out");
+        inputGradientValues.Print("child Gradient-out");
 #endif
-        }
+    }
 
     static /*TODO: merge with call site*/ void BackpropToLeft(Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues)
-        {
-#if DUMPOUTPUT   
-            gradientValues.Print("Gradient-in");   
-            inputGradientValues.Print("child Gradient-in/out");   
-            inputFunctionValues.Print("child Function values");   
+    {
+#if DUMPOUTPUT
+        gradientValues.Print("Gradient-in");
+        inputGradientValues.Print("child Gradient-in/out");
+        inputFunctionValues.Print("child Function values");
 #endif
-            //currently we only support one combination when the input is sparse.   
-            if (inputFunctionValues.GetMatrixType() == SPARSE && inputGradientValues.GetMatrixType() == DENSE && gradientValues.GetMatrixType() == DENSE)
-                inputGradientValues.SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol, false);
+        //currently we only support one combination when the input is sparse.
+        if (inputFunctionValues.GetMatrixType() == SPARSE && inputGradientValues.GetMatrixType() == DENSE && gradientValues.GetMatrixType() == DENSE)
+            inputGradientValues.SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol, false);
 
-            Matrix<ElemType>::MultiplyAndAdd(inputFunctionValues, false, gradientValues, true, inputGradientValues);
+        Matrix<ElemType>::MultiplyAndAdd(inputFunctionValues, false, gradientValues, true, inputGradientValues);
 
 #if DUMPOUTPUT
-            inputGradientValues.Print("child Gradient-out");
+        inputGradientValues.Print("child Gradient-out");
 #endif
-        }
+    }
 
     static /*TODO: merge with call site*/ void BackpropToRight(Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues)
-        {
-#if DUMPOUTPUT   
-            gradientValues.Print("Gradient-in");   
-            inputGradientValues.Print("child Gradient-in/out");
-            inputFunctionValues.Print("child Function values");
-#endif   
-            Matrix<ElemType>::MultiplyAndAdd(inputFunctionValues, true, gradientValues, false, inputGradientValues);
+    {
 #if DUMPOUTPUT
-            inputGradientValues.Print("child Gradient-out");
+        gradientValues.Print("Gradient-in");
+        inputGradientValues.Print("child Gradient-in/out");
+        inputFunctionValues.Print("child Function values");
 #endif
-        }
+        Matrix<ElemType>::MultiplyAndAdd(inputFunctionValues, true, gradientValues, false, inputGradientValues);
+#if DUMPOUTPUT
+        inputGradientValues.Print("child Gradient-out");
+#endif
+    }
 
     virtual bool OutputUsedInComputingInputNodesGradients() const override
     {
@@ -1531,24 +1531,24 @@ template <class ElemType>
     }
 
     virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
-        {
-            size_t rows0 = Input(0)->GetSampleMatrixNumRows();
-            Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
-            UpdateStride(sliceInput1Value);
+    {
+        size_t rows0 = Input(0)->GetSampleMatrixNumRows();
+        Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
+        UpdateStride(sliceInput1Value);
 
-            if (m_strideDim == 0)
-                SetDims(TensorShape(rows0 / GetNumParallelSequences()), HasMBLayout());
-            else
-                SetDims(Input(0)->GetSampleLayout(), HasMBLayout());
+        if (m_strideDim == 0)
+            SetDims(TensorShape(rows0 / GetNumParallelSequences()), HasMBLayout());
+        else
+            SetDims(Input(0)->GetSampleLayout(), HasMBLayout());
 
-            Matrix<ElemType> sliceOutputValue = ValueFor(fr);
+        Matrix<ElemType> sliceOutputValue = ValueFor(fr);
 
-            // (TODO: these following assignments are leftovers of refactoring and can be short-circuited)
-            Matrix<ElemType>& functionValues = sliceOutputValue;
-            const Matrix<ElemType>& input0 = Input(0)->Value();
-            const Matrix<ElemType>& input1 = sliceInput1Value;
+        // (TODO: these following assignments are leftovers of refactoring and can be short-circuited)
+        Matrix<ElemType>& functionValues = sliceOutputValue;
+        const Matrix<ElemType>& input0 = Input(0)->Value();
+        const Matrix<ElemType>& input1 = sliceInput1Value;
 
-            /**
+/**
             A in d x [s x T1]
             B in T1 x s
             C = A x B  in d x s, and each element is computed as 
@@ -1567,112 +1567,112 @@ template <class ElemType>
             strideDim : 0 or 1 (meaning to apply to row or column)
             */
 #if DUMPOUTPUT
-            input0.Print("StrideTimesNode - Input0");
+        input0.Print("StrideTimesNode - Input0");
 #endif
-            assert(m_strideDim == 0 || m_strideDim == 1);
-            Matrix<ElemType> mTmp1(input0.GetDeviceId());
-            Matrix<ElemType> mTmp2(input0.GetDeviceId());
-            if (m_strideDim == 1) // 1 = col stride; the example 1 case at column
+        assert(m_strideDim == 0 || m_strideDim == 1);
+        Matrix<ElemType> mTmp1(input0.GetDeviceId());
+        Matrix<ElemType> mTmp2(input0.GetDeviceId());
+        if (m_strideDim == 1) // 1 = col stride; the example 1 case at column
+        {
+            assert(m_stride == input1.GetNumCols());
+            size_t T1 = input0.GetNumCols() / m_stride;
+            assert(T1 == input1.GetNumRows());
+            size_t d = input0.GetNumRows();
+            functionValues.Resize(d, m_stride);
+            for (size_t k = 0; k < m_stride; k++)
             {
-                assert(m_stride == input1.GetNumCols());
-                size_t T1 = input0.GetNumCols() / m_stride;
-                assert(T1 == input1.GetNumRows());
-                size_t d = input0.GetNumRows();
-                functionValues.Resize(d, m_stride);
-                for (size_t k = 0; k < m_stride; k++)
+                mTmp1.Resize(d, T1);
+                for (size_t j = 0; j < T1; j++)
                 {
-                    mTmp1.Resize(d, T1);
-                    for (size_t j = 0; j < T1; j++)
-                    {
-                        mTmp1.ColumnSlice(j, 1).SetValue(input0.ColumnSlice(j * m_stride + k, 1));
-                    }
-
-                    mTmp2 = input1.ColumnSlice(k, 1);
-                    functionValues.ColumnSlice(k, 1).AssignProductOf(mTmp1, false, mTmp2, false);
+                    mTmp1.ColumnSlice(j, 1).SetValue(input0.ColumnSlice(j * m_stride + k, 1));
                 }
+
+                mTmp2 = input1.ColumnSlice(k, 1);
+                functionValues.ColumnSlice(k, 1).AssignProductOf(mTmp1, false, mTmp2, false);
             }
-            else if (m_strideDim == 0) // 0 = row stride; the example 2 case at row
+        }
+        else if (m_strideDim == 0) // 0 = row stride; the example 2 case at row
+        {
+            assert(m_stride == input1.GetNumCols());
+            size_t T1 = input0.GetNumRows() / m_stride;
+            size_t d = input1.GetNumRows();
+            assert(d == input0.GetNumCols());
+            functionValues.Resize(T1, m_stride);
+            mTmp1.Resize(d, T1);
+            for (size_t k = 0; k < m_stride; k++)
             {
-                assert(m_stride == input1.GetNumCols());
-                size_t T1 = input0.GetNumRows() / m_stride;
-                size_t d = input1.GetNumRows();
-                assert(d == input0.GetNumCols());
-                functionValues.Resize(T1, m_stride);
-                mTmp1.Resize(d, T1);
-                for (size_t k = 0; k < m_stride; k++)
+                for (size_t j = 0; j < T1; j++)
                 {
-                    for (size_t j = 0; j < T1; j++)
-                    {
-                        mTmp1.ColumnSlice(j, 1).AssignRowSliceValuesOf(input0, k + j * m_stride, 1);
-                    }
-
-                    mTmp2 = input1.ColumnSlice(k, 1);
-                    functionValues.ColumnSlice(k, 1).AssignProductOf(mTmp1, true, mTmp2, false);
+                    mTmp1.ColumnSlice(j, 1).AssignRowSliceValuesOf(input0, k + j * m_stride, 1);
                 }
+
+                mTmp2 = input1.ColumnSlice(k, 1);
+                functionValues.ColumnSlice(k, 1).AssignProductOf(mTmp1, true, mTmp2, false);
             }
+        }
 #if NANCHECK
-            functionValues.HasNan("StrideTimes");
+        functionValues.HasNan("StrideTimes");
 #endif
 #if DUMPOUTPUT
-            functionValues.Print("StrideTimesNode");
+        functionValues.Print("StrideTimesNode");
 #endif
-        }
+    }
 
-        /**
+    /**
         three inputs
         input0: left matrix
         input1: right matrix
         stridedim: single element no gradient matrix, 0 row stride / 1 column stride
         */
     virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
-        {
-            Base::Validate(isFinalValidationPass);
+    {
+        Base::Validate(isFinalValidationPass);
         LinkToMBLayout(Input(1)->GetMBLayout()); // retains the layout of the right input
 
-            if (Input(2)->Value().GetNumElements() != 1)
-                RuntimeError("%ls %ls operation: Input(2) should be a single element matrix and have the value 0 (row) or 1 (col).", NodeName().c_str(), OperationName().c_str());
-            m_strideDim = (size_t) Input(2)->Value().Get00Element();
-            if (m_strideDim != 0 && m_strideDim != 1)
-                RuntimeError("%ls %ls operation: Input(2) should be a single element matrix and have the value 0 (row) or 1 (col).", NodeName().c_str(), OperationName().c_str());
-            //if (Input(2)->m_needGradient)        // disabled because this is a flag that belongs to Network. Node should simply not propagate anything into it
-            //    RuntimeError("StrideTimes: No gradient update should be on input(2).");
+        if (Input(2)->Value().GetNumElements() != 1)
+            RuntimeError("%ls %ls operation: Input(2) should be a single element matrix and have the value 0 (row) or 1 (col).", NodeName().c_str(), OperationName().c_str());
+        m_strideDim = (size_t) Input(2)->Value().Get00Element();
+        if (m_strideDim != 0 && m_strideDim != 1)
+            RuntimeError("%ls %ls operation: Input(2) should be a single element matrix and have the value 0 (row) or 1 (col).", NodeName().c_str(), OperationName().c_str());
+        //if (Input(2)->m_needGradient)        // disabled because this is a flag that belongs to Network. Node should simply not propagate anything into it
+        //    RuntimeError("StrideTimes: No gradient update should be on input(2).");
 
-            size_t rows0 = Input(0)->GetSampleMatrixNumRows(), cols0 = Input(0)->GetSampleMatrixNumCols();
-            size_t rows1 = Input(1)->GetSampleMatrixNumRows();
+        size_t rows0 = Input(0)->GetSampleMatrixNumRows(), cols0 = Input(0)->GetSampleMatrixNumCols();
+        size_t rows1 = Input(1)->GetSampleMatrixNumRows();
 
-            if (m_strideDim == 0) // by row
-            {
-                if (isFinalValidationPass && rows1 != cols0)
+        if (m_strideDim == 0) // by row
+        {
+            if (isFinalValidationPass && rows1 != cols0)
                 RuntimeError("The Matrix dimension in the StrideTimes operation in dim %d does not match for cols %d in A and rows %d in B.", (int) m_strideDim, (int) cols0, (int) rows1);
-                size_t T1 = rows0 / m_stride;
-                SetDims(TensorShape(T1), HasMBLayout());
-                //after multiplication the structure is lost
-            }
+            size_t T1 = rows0 / m_stride;
+            SetDims(TensorShape(T1), HasMBLayout());
+            //after multiplication the structure is lost
+        }
 
-            else // by col
-            {
-                if (isFinalValidationPass && cols0 != rows1 * m_stride)
+        else // by col
+        {
+            if (isFinalValidationPass && cols0 != rows1 * m_stride)
                 RuntimeError("The Matrix dimension in the StrideTimes operation in dim %d does not match for cols %d in A and row number %d in B.", (int) m_strideDim, (int) cols0, (int) rows1);
-                SetDims(TensorShape(rows0), HasMBLayout());
-                //after multiplication the structure is lost
-            }
+            SetDims(TensorShape(rows0), HasMBLayout());
+            //after multiplication the structure is lost
         }
-    };
+    }
+};
 
-    template class StrideTimesNode<float>;
-    template class StrideTimesNode<double>;
+template class StrideTimesNode<float>;
+template class StrideTimesNode<double>;
 
-    // -----------------------------------------------------------------------
-    // PairNetworkNode (input)
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// PairNetworkNode (input)
+// -----------------------------------------------------------------------
 
-    /**
+/**
     pair this node to a node in another network
     this node provide an interface from this network. The next layer network then can use this interface to know which node to connect to.
     */
 template <class ElemType>
-    class PairNetworkNode : public ComputationNode<ElemType>, public NumInputs<1>
-    {
+class PairNetworkNode : public ComputationNode<ElemType>, public NumInputs<1>
+{
     typedef ComputationNode<ElemType> Base;
     UsingComputationNodeMembersBoilerplate;
     static const std::wstring TypeName()
@@ -1680,38 +1680,38 @@ template <class ElemType>
         return L"PairNetwork";
     }
 
-        void Init(size_t row_size, size_t /*col_size*/)
-        {
-            CreateMatrixIfNull(m_value);
-            SetDims(TensorShape(row_size), HasMBLayout());
-            UpdateFunctionValuesSize();
-        }
+    void Init(size_t row_size, size_t /*col_size*/)
+    {
+        CreateMatrixIfNull(m_value);
+        SetDims(TensorShape(row_size), HasMBLayout());
+        UpdateFunctionValuesSize();
+    }
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(PairNetworkNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(PairNetworkNode);
     PairNetworkNode(DEVICEID_TYPE deviceId, const wstring& name, size_t row_size = 1, size_t col_size = 1)
         : Base(deviceId, name)
-        {
-            Init(row_size, col_size);
-            CreateMatrixIfNull(m_gradient);
-            m_gradient->Resize(row_size, col_size);
-            m_gradient->SetValue(0.0f);
-        }
+    {
+        Init(row_size, col_size);
+        CreateMatrixIfNull(m_gradient);
+        m_gradient->Resize(row_size, col_size);
+        m_gradient->SetValue(0.0f);
+    }
 
-        virtual void Load(File& fstream, size_t modelVersion) override
-        {
-            Init(1, 1); // TODO: this looks wrong; should the dimension not come from the loaded model data?
-            Base::Load(fstream, modelVersion);
-        }
+    virtual void Load(File& fstream, size_t modelVersion) override
+    {
+        Init(1, 1); // TODO: this looks wrong; should the dimension not come from the loaded model data?
+        Base::Load(fstream, modelVersion);
+    }
 
-        /// to-do: need to change to the new way of resetting state
-        void BackpropToMap(const size_t inputIndex)
-        {
-            if (inputIndex > 0)
-                InvalidArgument("PairNetwork operation only takes one input.");
+    /// to-do: need to change to the new way of resetting state
+    void BackpropToMap(const size_t inputIndex)
+    {
+        if (inputIndex > 0)
+            InvalidArgument("PairNetwork operation only takes one input.");
 
-            Matrix<ElemType>::ScaleAndAdd(1.0, Gradient(), Input(inputIndex)->Gradient());
-        }
+        Matrix<ElemType>::ScaleAndAdd(1.0, Gradient(), Input(inputIndex)->Gradient());
+    }
 
     virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
     {
@@ -1719,13 +1719,13 @@ template <class ElemType>
         {
             BackpropToMap(inputIndex);
             return;
-        }                                                // TODO: remove these one by one
+        }                                                            // TODO: remove these one by one
         assert(GetSampleMatrixNumRows() == Gradient().GetNumRows()); // original used m_value->GetNumRows() for loop dimension
-            assert(m_pMBLayout);
+        assert(m_pMBLayout);
 
-            Matrix<ElemType> mTmp = Input(inputIndex)->GradientFor(fr);
-            Matrix<ElemType>::ScaleAndAdd(1.0, GradientFor(fr), mTmp);
-        }
+        Matrix<ElemType> mTmp = Input(inputIndex)->GradientFor(fr);
+        Matrix<ElemType>::ScaleAndAdd(1.0, GradientFor(fr), mTmp);
+    }
 
     virtual bool OutputUsedInComputingInputNodesGradients() const override
     {
@@ -1737,29 +1737,29 @@ template <class ElemType>
     }
 
     virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
-        {
-            Matrix<ElemType> mTmp = ValueFor(fr);
-            mTmp.SetValue(Input(0)->ValueFor(fr));
-        }
+    {
+        Matrix<ElemType> mTmp = ValueFor(fr);
+        mTmp.SetValue(Input(0)->ValueFor(fr));
+    }
 
     virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
-        {
-            Base::Validate(isFinalValidationPass);
-            InferMBLayoutFromInputsForStandardCase();
+    {
+        Base::Validate(isFinalValidationPass);
+        InferMBLayoutFromInputsForStandardCase();
 
-            SetDims(Input(0));
-        }
-    };
+        SetDims(Input(0));
+    }
+};
 
-    template class PairNetworkNode<float>;
-    template class PairNetworkNode<double>;
+template class PairNetworkNode<float>;
+template class PairNetworkNode<double>;
 
-    // -----------------------------------------------------------------------
-    // ParallelNode (input0, input1)
-    // TODO: How is this different from RowStack?
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// ParallelNode (input0, input1)
+// TODO: How is this different from RowStack?
+// -----------------------------------------------------------------------
 
-    /**
+/**
     parallel node to join two streams into one 
     
     join parallel children node, avoids any operations except putting outputs from children to corresponding columns
@@ -1767,185 +1767,186 @@ template <class ElemType>
     input(1) : [nDim1 X T]
     output   : [[nDim0 + nDim1] X T]
     */
-    template<class ElemType>
-    class ParallelNode : public ComputationNodeNonLooping/*ComputationNode*/<ElemType>, public NumInputs<2>
-    {
-        typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
-        static const std::wstring TypeName() { return L"Parallel"; }
-    public:
-        DeclareConstructorFromConfigWithNumInputs(ParallelNode);
-        ParallelNode(DEVICEID_TYPE deviceId, const wstring & name) :
-            Base(deviceId, name)
-        { }
-
-        virtual void BackpropToNonLooping(size_t inputIndex) override
-        {
-            if (inputIndex > 1)
-                InvalidArgument("Parallel operation only takes two input.");
-            ComputationNodePtr child = Input(inputIndex);
-            size_t startidx = (inputIndex == 0) ? 0 : Input(0)->GetSampleMatrixNumRows();
-            size_t nrows = child->GetSampleMatrixNumRows();
-
-            // TODO: why is this needed? If it is, it should be solved more centrally.
-            if (child->Gradient().GetNumRows() != child->GetSampleMatrixNumRows() || child->Gradient().GetNumCols() != GetSampleMatrixNumCols())
-            {
-                child->Gradient().Resize(child->GetSampleMatrixNumRows(), child->GetSampleMatrixNumCols());
-                child->Gradient().SetValue(0);
-            }
-
-            Matrix<ElemType> tmpMat(m_deviceId);
-            tmpMat.AssignRowSliceValuesOf(Gradient(), startidx, nrows);
-
-            BackpropToS(tmpMat, child->Gradient());
-        }
-
-        virtual bool OutputUsedInComputingInputNodesGradients() const override
-        {
-            // The ParallelNode does not require its output value for computing
-            // the gradients of its input nodes
-            return false;
-        }
+template <class ElemType>
+class ParallelNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>, public NumInputs<2>
+{
+    typedef ComputationNodeNonLooping<ElemType> Base;
+    UsingComputationNodeMembersBoilerplate;
+    static const std::wstring TypeName()
+    {
+        return L"Parallel";
+    }
 
-        virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
-        {
-            // The ParallelNode does not require any of it's input's values for computing
-            // the gradients of its input nodes
-            UNREFERENCED_PARAMETER(childIndex);
-            return false;
-        }
+public:
+    DeclareConstructorFromConfigWithNumInputs(ParallelNode);
+    ParallelNode(DEVICEID_TYPE deviceId, const wstring& name)
+        : Base(deviceId, name)
+    {
+    }
 
-        /*TODO: merge with call site*/void BackpropToS(Matrix<ElemType>& gradientValues, Matrix<ElemType>& inputGradientValues)
-        {
-            inputGradientValues += gradientValues;
-        }
+    virtual void BackpropToNonLooping(size_t inputIndex) override
+    {
+        if (inputIndex > 1)
+            InvalidArgument("Parallel operation only takes two input.");
+        ComputationNodePtr child = Input(inputIndex);
+        size_t startidx = (inputIndex == 0) ? 0 : Input(0)->GetSampleMatrixNumRows();
+        size_t nrows = child->GetSampleMatrixNumRows();
 
-        virtual void /*ComputationNodeNonLooping::*/ForwardPropNonLooping() override
+        // TODO: why is this needed? If it is, it should be solved more centrally.
+        if (child->Gradient().GetNumRows() != child->GetSampleMatrixNumRows() || child->Gradient().GetNumCols() != GetSampleMatrixNumCols())
         {
-            ForwardPropS(Value(), Input(0)->Value(), Input(1)->Value());
+            child->Gradient().Resize(child->GetSampleMatrixNumRows(), child->GetSampleMatrixNumCols());
+            child->Gradient().SetValue(0);
         }
 
-        /*TODO: merge with call site*/void ForwardPropS(Matrix<ElemType>& functionValues, Matrix<ElemType>& inputFunctionValues0, Matrix<ElemType>& inputFunctionValues1)
-        {
-            size_t rows0 = inputFunctionValues0.GetNumRows(), cols0 = inputFunctionValues0.GetNumCols();
-            size_t rows1 = inputFunctionValues1.GetNumRows(), cols1 = inputFunctionValues1.GetNumCols();
-
-            if (cols0 != cols1)
-                LogicError("ParallelNode: column dimension mismatched!");
+        Matrix<ElemType> tmpMat(m_deviceId);
+        tmpMat.AssignRowSliceValuesOf(Gradient(), startidx, nrows);
 
-            functionValues.Resize(rows0 + rows1, cols0);
-            functionValues.SetValue(0);
-
-            functionValues.AssignToRowSliceValuesOf(inputFunctionValues0, 0, rows0);
-            functionValues.AssignToRowSliceValuesOf(inputFunctionValues1, rows0, rows1);
-        }
+        BackpropToS(tmpMat, child->Gradient());
+    }
 
-        /// input(0) : [nDim1 X T]
-        /// input(1) : [nDim2 X T]
-        /// output   : [[nDim1 + nDim2] X T]
-        virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
-        {
-            Base::Validate(isFinalValidationPass);
-            InferMBLayoutFromInputsForStandardCase();
+    virtual bool OutputUsedInComputingInputNodesGradients() const override
+    {
+        // The ParallelNode does not require its output value for computing
+        // the gradients of its input nodes
+        return false;
+    }
 
-            size_t rows1 = Input(1)->GetSampleMatrixNumRows();
+    virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
+    {
+        // The ParallelNode does not require any of its inputs' values for computing
+        // the gradients of its input nodes
+        UNREFERENCED_PARAMETER(childIndex);
+        return false;
+    }
 
-            size_t rows0 = Input(0)->GetSampleMatrixNumRows();
+    /*TODO: merge with call site*/ void BackpropToS(Matrix<ElemType>& gradientValues, Matrix<ElemType>& inputGradientValues)
+    {
+        inputGradientValues += gradientValues;
+    }
 
-            size_t rows = rows0 + rows1;
+    virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
+    {
+        ForwardPropS(Value(), Input(0)->Value(), Input(1)->Value());
+    }
 
-            SetDims(TensorShape(rows), HasMBLayout());
-            m_sampleLayout = GetInputSampleLayout(0);
-            // BUGBUG: Inconsistent with 'rows'
-        }
+    /*TODO: merge with call site*/ void ForwardPropS(Matrix<ElemType>& functionValues, Matrix<ElemType>& inputFunctionValues0, Matrix<ElemType>& inputFunctionValues1)
+    {
+        size_t rows0 = inputFunctionValues0.GetNumRows(), cols0 = inputFunctionValues0.GetNumCols();
+        size_t rows1 = inputFunctionValues1.GetNumRows(), cols1 = inputFunctionValues1.GetNumCols();
 
-    public:
-        virtual bool UnitTest() {
-            size_t nT = 3;
-            size_t nInput0 = 3;
-            size_t nInput1 = 3;
+        if (cols0 != cols1)
+            LogicError("ParallelNode: column dimension mismatched!");
 
-            Matrix<ElemType> f0(m_deviceId), func(m_deviceId), f1(m_deviceId);
+        functionValues.Resize(rows0 + rows1, cols0);
+        functionValues.SetValue(0);
 
-            f0 = Input(0)->Value();
-            f1 = Input(1)->Value();
-            func = Value();
+        functionValues.AssignToRowSliceValuesOf(inputFunctionValues0, 0, rows0);
+        functionValues.AssignToRowSliceValuesOf(inputFunctionValues1, rows0, rows1);
+    }
 
-            Input(0)->SetDims1(nInput0, nT);
-            Input(0)->UpdateFunctionValuesSize();
-            Input(0)->Value().SetValue(0);
-            Input(0)->Value()(0, 0) = 1;
-            Input(0)->Value()(0, 1) = 2;
-            Input(0)->Value()(0, 2) = 3;
-
-            Input(1)->SetDims1(nInput1, nT);
-            Input(1)->UpdateFunctionValuesSize();
-            Input(1)->Value().SetValue(0);
-            Input(1)->Value()(0, 0) = 4;
-            Input(1)->Value()(0, 1) = 5;
-            Input(1)->Value()(0, 2) = 6;
-            SetDims1(nInput0 + nInput1, nT);
-            UpdateFunctionValuesSize();
+    /// input(0) : [nDim1 X T]
+    /// input(1) : [nDim2 X T]
+    /// output   : [[nDim1 + nDim2] X T]
+    virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
+    {
+        Base::Validate(isFinalValidationPass);
+        InferMBLayoutFromInputsForStandardCase();
 
-            ForwardProp(FrameRange(m_pMBLayout));
+        size_t rows1 = Input(1)->GetSampleMatrixNumRows();
 
-            /// check with expected values
-            if (!ISCLOSE(Value()(0, 0), 1, EPSILON) ||
-                !ISCLOSE(Value()(0, 1), 2, EPSILON) ||
-                !ISCLOSE(Value()(0, 2), 3, EPSILON) ||
-                !ISCLOSE(Value()(3, 0), 4, EPSILON) ||
-                !ISCLOSE(Value()(3, 1), 5, EPSILON) ||
-                !ISCLOSE(Value()(3, 2), 6, EPSILON))
-                return false;
-            Value().TransferToDeviceIfNotThere(m_deviceId, true);
+        size_t rows0 = Input(0)->GetSampleMatrixNumRows();
 
-            Gradient().Resize(nInput0 + nInput1, nT);
-            Gradient().SetValue(0);
-            Input(0)->Gradient().Resize(nInput0, nT);
-            Input(1)->Gradient().Resize(nInput1, nT);
-            Input(0)->Gradient().SetValue(0);
-            Input(1)->Gradient().SetValue(0);
-            Gradient()(0, 0) = 1;
-            Gradient()(0, 1) = 2;
-            Gradient()(0, 2) = 3;
-            Gradient()(3, 0) = 4;
-            Gradient()(3, 1) = 5;
-            Gradient()(3, 2) = 6;
-
-            BackpropTo(0, FrameRange(m_pMBLayout));
-            BackpropTo(1, FrameRange(m_pMBLayout));
-
-            /// check with expected values
-            if (!ISCLOSE(Input(0)->Gradient()(0, 0), 1, EPSILON)
-                || !ISCLOSE(Input(0)->Gradient()(0, 1), 2, EPSILON)
-                || !ISCLOSE(Input(0)->Gradient()(0, 2), 3, EPSILON)
-                || !ISCLOSE(Input(1)->Gradient()(0, 0), 4, EPSILON)
-                || !ISCLOSE(Input(1)->Gradient()(0, 1), 5, EPSILON)
-                || !ISCLOSE(Input(1)->Gradient()(0, 2), 6, EPSILON))
-                return false;
-
-            Input(0)->Gradient().TransferToDeviceIfNotThere( m_deviceId, true);
-            Input(1)->Gradient().TransferToDeviceIfNotThere( m_deviceId, true);
-
-            return true;
-        }
+        size_t rows = rows0 + rows1;
 
-    };
+        SetDims(TensorShape(rows), HasMBLayout());
+        m_sampleLayout = GetInputSampleLayout(0);
+        // BUGBUG: Inconsistent with 'rows'
+    }
 
-    template class ParallelNode<float>;
-    template class ParallelNode<double>;
+public:
+    virtual bool UnitTest()
+    {
+        size_t nT = 3;
+        size_t nInput0 = 3;
+        size_t nInput1 = 3;
+
+        Matrix<ElemType> f0(m_deviceId), func(m_deviceId), f1(m_deviceId);
+
+        f0 = Input(0)->Value();
+        f1 = Input(1)->Value();
+        func = Value();
+
+        Input(0)->SetDims1(nInput0, nT);
+        Input(0)->UpdateFunctionValuesSize();
+        Input(0)->Value().SetValue(0);
+        Input(0)->Value()(0, 0) = 1;
+        Input(0)->Value()(0, 1) = 2;
+        Input(0)->Value()(0, 2) = 3;
+
+        Input(1)->SetDims1(nInput1, nT);
+        Input(1)->UpdateFunctionValuesSize();
+        Input(1)->Value().SetValue(0);
+        Input(1)->Value()(0, 0) = 4;
+        Input(1)->Value()(0, 1) = 5;
+        Input(1)->Value()(0, 2) = 6;
+        SetDims1(nInput0 + nInput1, nT);
+        UpdateFunctionValuesSize();
+
+        ForwardProp(FrameRange(m_pMBLayout));
+
+        /// check with expected values
+        if (!ISCLOSE(Value()(0, 0), 1, EPSILON) ||
+            !ISCLOSE(Value()(0, 1), 2, EPSILON) ||
+            !ISCLOSE(Value()(0, 2), 3, EPSILON) ||
+            !ISCLOSE(Value()(3, 0), 4, EPSILON) ||
+            !ISCLOSE(Value()(3, 1), 5, EPSILON) ||
+            !ISCLOSE(Value()(3, 2), 6, EPSILON))
+            return false;
+        Value().TransferToDeviceIfNotThere(m_deviceId, true);
+
+        Gradient().Resize(nInput0 + nInput1, nT);
+        Gradient().SetValue(0);
+        Input(0)->Gradient().Resize(nInput0, nT);
+        Input(1)->Gradient().Resize(nInput1, nT);
+        Input(0)->Gradient().SetValue(0);
+        Input(1)->Gradient().SetValue(0);
+        Gradient()(0, 0) = 1;
+        Gradient()(0, 1) = 2;
+        Gradient()(0, 2) = 3;
+        Gradient()(3, 0) = 4;
+        Gradient()(3, 1) = 5;
+        Gradient()(3, 2) = 6;
+
+        BackpropTo(0, FrameRange(m_pMBLayout));
+        BackpropTo(1, FrameRange(m_pMBLayout));
+
+        /// check with expected values
+        if (!ISCLOSE(Input(0)->Gradient()(0, 0), 1, EPSILON) || !ISCLOSE(Input(0)->Gradient()(0, 1), 2, EPSILON) || !ISCLOSE(Input(0)->Gradient()(0, 2), 3, EPSILON) || !ISCLOSE(Input(1)->Gradient()(0, 0), 4, EPSILON) || !ISCLOSE(Input(1)->Gradient()(0, 1), 5, EPSILON) || !ISCLOSE(Input(1)->Gradient()(0, 2), 6, EPSILON))
+            return false;
 
-    // -----------------------------------------------------------------------
-    // LSTMNode (obs, inputGate, forgetGate, outputGate, memoryCellWgt)
-    // deprecated early implementation of LSTM operating on minibatches directly
-    //  - input(0) : child with dimension [inputdim x T]
-    //  - input(1) : input gate [outputdim x [inputdim + outputdim + 2]] bi, Wxi, Whi, Wci
-    //  - input(2) : forget gate [outputdim x [inputdim + outputdim + 2]] for bf, Wxf, Whf, Wcf
-    //  - input(3) : output gate [outputdim x [inputdim + outputdim + 2]] for bo, Wxo, Who, and Wco
-    //  - input(4) : memory cell weight [outputdim x [inputdim + outputdim + 1]] for bc, Wxc, and Whc 
-    //  - output : dimension [outputdim x T]
-    // -----------------------------------------------------------------------
+        Input(0)->Gradient().TransferToDeviceIfNotThere(m_deviceId, true);
+        Input(1)->Gradient().TransferToDeviceIfNotThere(m_deviceId, true);
 
-    /**
+        return true;
+    }
+};
+
+template class ParallelNode<float>;
+template class ParallelNode<double>;
+
+// -----------------------------------------------------------------------
+// LSTMNode (obs, inputGate, forgetGate, outputGate, memoryCellWgt)
+// deprecated early implementation of LSTM operating on minibatches directly
+//  - input(0) : child with dimension [inputdim x T]
+//  - input(1) : input gate [outputdim x [inputdim + outputdim + 2]] for bi, Wxi, Whi, Wci
+//  - input(2) : forget gate [outputdim x [inputdim + outputdim + 2]] for bf, Wxf, Whf, Wcf
+//  - input(3) : output gate [outputdim x [inputdim + outputdim + 2]] for bo, Wxo, Who, and Wco
+//  - input(4) : memory cell weight [outputdim x [inputdim + outputdim + 1]] for bc, Wxc, and Whc
+//  - output : dimension [outputdim x T]
+// -----------------------------------------------------------------------
+
+/**
     LSTM specific node. This node uses matrix operations to have LSTM functionality. 
     It avoids using general recurrent loop operations in the network operations in ComputationNetwork.
 
@@ -1963,23 +1964,23 @@ class LSTMNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>,
         return L"LSTM";
     }
 
-        // BUGBUG: These flags no longer exist outside. I moved this here to make it compile, but this node is no longer functional.
+    // BUGBUG: These flags no longer exist outside. I moved this here to make it compile, but this node is no longer functional.
     enum class MinibatchPackingFlags : char // (note: not using unsigned char because these go into a matrix, and we use Matrix<char>, since we use it as a data holder)
-        {
-            None = 0,
+    {
+        None = 0,
         SequenceStart = 1 << 0, // binary 0001  frame is first of an utterance
         SequenceEnd = 1 << 1,   // binary 0010  frame is last of an utterance
         NoFeature = 1 << 2,     // binary 0100  frame has no feature (e.g. a gap due to BPTT)
         NoLabel = 1 << 3,       // binary 1000  frame has no label
 
         NoInput = NoFeature | NoLabel, // Note: Once we refactorized the reader, NoInput will no longer needed.
-            SequenceStartOrNoFeature = SequenceStart | NoFeature,
-            SequenceEndOrNoFeature = SequenceEnd | NoFeature,
-            SequenceStartOrEndOrNoFeature = SequenceStart | SequenceEnd | NoFeature,
-        };
+        SequenceStartOrNoFeature = SequenceStart | NoFeature,
+        SequenceEndOrNoFeature = SequenceEnd | NoFeature,
+        SequenceStartOrEndOrNoFeature = SequenceStart | SequenceEnd | NoFeature,
+    };
 
-    public:
-        DeclareConstructorFromConfigWithNumInputs(LSTMNode);
+public:
+    DeclareConstructorFromConfigWithNumInputs(LSTMNode);
     LSTMNode(DEVICEID_TYPE deviceId, const wstring& name)
         : Base(deviceId, name),
           m_State(deviceId),
@@ -1998,7 +1999,7 @@ class LSTMNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>,
           m_tempMatrix(deviceId),
           mSlicePrevState(deviceId),
           mSlicePrevOutput(deviceId),
-            grdBeforeInputGate(deviceId),
+          grdBeforeInputGate(deviceId),
           grdBeforeForget(deviceId),
           grdBeforeGo(deviceId),
           grdToCell(deviceId),
@@ -2007,72 +2008,72 @@ class LSTMNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>,
           m_state_error_from_future_minibatch(deviceId),
           mLastState(deviceId),
           mLastOutput(deviceId),
-            m_inputDim(0),
-            m_outputDim(0),
-            m_use_errors_from_future_minibatch(false),
+          m_inputDim(0),
+          m_outputDim(0),
+          m_use_errors_from_future_minibatch(false),
           m_DefaultState((ElemType) DEFAULT_HIDDEN_ACTIVATION)
-        {
-        }
+    {
+    }
 
-        virtual void Save(File& fstream) const override
-        {
-            Base::Save(fstream);
-            fstream << m_inputDim << m_outputDim;
-            fstream << m_DefaultState;
-        }
+    virtual void Save(File& fstream) const override
+    {
+        Base::Save(fstream);
+        fstream << m_inputDim << m_outputDim;
+        fstream << m_DefaultState;
+    }
 
-        virtual void Load(File& fstream, size_t modelVersion) override
-        {
-            Base::Load(fstream, modelVersion);
-            if (modelVersion == 2)
-                fstream >> m_inputDim >> m_outputDim;
-            fstream >> m_DefaultState;
-        }
+    virtual void Load(File& fstream, size_t modelVersion) override
+    {
+        Base::Load(fstream, modelVersion);
+        if (modelVersion == 2)
+            fstream >> m_inputDim >> m_outputDim;
+        fstream >> m_DefaultState;
+    }
 
-        virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
+    virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
+    {
+        Base::CopyTo(nodeP, newName, flags);
+        if (flags & CopyNodeFlags::copyNodeValue)
         {
-            Base::CopyTo(nodeP, newName, flags);
-            if (flags & CopyNodeFlags::copyNodeValue)
-            {
-                auto node = dynamic_pointer_cast<LSTMNode<ElemType>>(nodeP);
-                node->m_inputDim = m_inputDim;
-                node->m_outputDim = m_outputDim;
+            auto node = dynamic_pointer_cast<LSTMNode<ElemType>>(nodeP);
+            node->m_inputDim = m_inputDim;
+            node->m_outputDim = m_outputDim;
 
             node->m_State = m_State;           // hidden state activity
             node->m_PastState = m_PastState;   // state activity in the previous minibatch
-                node->m_PastOutput = m_PastOutput; // output in the previou minibatch 
+            node->m_PastOutput = m_PastOutput; // output in the previous minibatch
 
             node->m_Gi = m_Gi; // input gate activity
             node->m_Gf = m_Gf; // forget gate activity
             node->m_Go = m_Go; // output gate activity
 
-                node->mSlicePrevOutput = mSlicePrevOutput;
-                node->mSlicePrevState = mSlicePrevState;
+            node->mSlicePrevOutput = mSlicePrevOutput;
+            node->mSlicePrevState = mSlicePrevState;
 
-                node->m_use_errors_from_future_minibatch = m_use_errors_from_future_minibatch;
+            node->m_use_errors_from_future_minibatch = m_use_errors_from_future_minibatch;
 
-                node->m_DefaultState = m_DefaultState;
-            }
+            node->m_DefaultState = m_DefaultState;
         }
+    }
 
-        virtual void BackpropToNonLooping(size_t inputIndex) override
-        {
-            if (inputIndex > 4)
-                InvalidArgument("LSTM operation only takes five inputs.");
+    virtual void BackpropToNonLooping(size_t inputIndex) override
+    {
+        if (inputIndex > 4)
+            InvalidArgument("LSTM operation only takes five inputs.");
 
-            size_t nT = Input(0)->GetSampleMatrixNumCols();
-            size_t inputDim = Input(0)->GetSampleMatrixNumRows();
-            size_t outputDim = Input(1)->GetSampleMatrixNumRows();
+        size_t nT = Input(0)->GetSampleMatrixNumCols();
+        size_t inputDim = Input(0)->GetSampleMatrixNumRows();
+        size_t outputDim = Input(1)->GetSampleMatrixNumRows();
 
-            if (m_GradientComputed == false)
+        if (m_GradientComputed == false)
+        {
+            if (GetSampleMatrixNumCols() != Gradient().GetNumCols() ||
+                GetSampleMatrixNumRows() != Gradient().GetNumRows())
             {
-                if (GetSampleMatrixNumCols() != Gradient().GetNumCols() ||
-                    GetSampleMatrixNumRows() != Gradient().GetNumRows())
-                {
-                    RuntimeError("LSTMNode::GradientValue size doesn't match to the function value size");
-                }
+                RuntimeError("LSTMNode::GradientValue size doesn't match to the function value size");
+            }
 
-                // reset gradients
+            // reset gradients
             grdToObs.Resize(inputDim, nT);
             grdToObs.SetValue(0);
             grdToInputGate.Resize(Input(1)->GetSampleMatrixNumRows(), Input(1)->GetSampleMatrixNumCols());
@@ -2084,169 +2085,169 @@ class LSTMNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>,
             grdToCellWgt.Resize(Input(4)->GetSampleMatrixNumRows(), Input(4)->GetSampleMatrixNumCols());
             grdToCellWgt.SetValue(0);
 
-                Matrix<ElemType> slicePrevOutput(m_deviceId), slicePrevState(m_deviceId);
-                Matrix<ElemType> grdToPrevOutput(m_deviceId), grdToPrevState(m_deviceId);
-                Matrix<ElemType> stateError(m_deviceId);
-                slicePrevState.Resize(outputDim, GetNumParallelSequences());
-                slicePrevOutput.Resize(outputDim, GetNumParallelSequences());
-                slicePrevOutput.SetValue(0);
+            Matrix<ElemType> slicePrevOutput(m_deviceId), slicePrevState(m_deviceId);
+            Matrix<ElemType> grdToPrevOutput(m_deviceId), grdToPrevState(m_deviceId);
+            Matrix<ElemType> stateError(m_deviceId);
+            slicePrevState.Resize(outputDim, GetNumParallelSequences());
+            slicePrevOutput.Resize(outputDim, GetNumParallelSequences());
+            slicePrevOutput.SetValue(0);
 
-                stateError.Resize(slicePrevState.GetNumRows(), slicePrevState.GetNumCols());
+            stateError.Resize(slicePrevState.GetNumRows(), slicePrevState.GetNumCols());
 
-                grdToPrevOutput.Resize(slicePrevOutput.GetNumRows(), slicePrevOutput.GetNumCols());
-                grdToPrevState.Resize(slicePrevState.GetNumRows(), slicePrevState.GetNumCols());
-                grdToPrevOutput.SetValue(0);
-                grdToPrevState.SetValue(0);
+            grdToPrevOutput.Resize(slicePrevOutput.GetNumRows(), slicePrevOutput.GetNumCols());
+            grdToPrevState.Resize(slicePrevState.GetNumRows(), slicePrevState.GetNumCols());
+            grdToPrevOutput.SetValue(0);
+            grdToPrevState.SetValue(0);
 
-                for (int timeIdxInSeq = nT - GetNumParallelSequences(); timeIdxInSeq >= 0; timeIdxInSeq -= GetNumParallelSequences())
-                {
-                    FrameRange fr(m_pMBLayout, timeIdxInSeq);
-                    Matrix<ElemType> sliceObs = Input(0)->ValueFor(fr);
-                    Matrix<ElemType> sliceOutput = ValueFor(fr);
-                    Matrix<ElemType> sliceState = DataFor(m_State, fr);
+            for (int timeIdxInSeq = nT - GetNumParallelSequences(); timeIdxInSeq >= 0; timeIdxInSeq -= GetNumParallelSequences())
+            {
+                FrameRange fr(m_pMBLayout, timeIdxInSeq);
+                Matrix<ElemType> sliceObs = Input(0)->ValueFor(fr);
+                Matrix<ElemType> sliceOutput = ValueFor(fr);
+                Matrix<ElemType> sliceState = DataFor(m_State, fr);
 
-                    Matrix<ElemType> sliceGi = DataFor(m_Gi, fr);
-                    Matrix<ElemType> sliceGf = DataFor(m_Gf, fr);
-                    Matrix<ElemType> sliceGo = DataFor(m_Go, fr);
+                Matrix<ElemType> sliceGi = DataFor(m_Gi, fr);
+                Matrix<ElemType> sliceGf = DataFor(m_Gf, fr);
+                Matrix<ElemType> sliceGo = DataFor(m_Go, fr);
 
-                    Matrix<ElemType> sliceTanhState = DataFor(tanhState, fr);
-                    Matrix<ElemType> sliceTanhObs = DataFor(tanhObs, fr);
+                Matrix<ElemType> sliceTanhState = DataFor(tanhState, fr);
+                Matrix<ElemType> sliceTanhObs = DataFor(tanhObs, fr);
 
-                    Matrix<ElemType> error = GradientFor(fr);
+                Matrix<ElemType> error = GradientFor(fr);
 
-                    Matrix<ElemType> grdToObsSlice(this->m_deviceId);
+                Matrix<ElemType> grdToObsSlice(this->m_deviceId);
 
 #ifdef DEBUG_DECODER
-                    fprintf(stderr, "original output error [%ld] norm = %.8e\n", timeIdxInSeq, error.FrobeniusNorm());
+                fprintf(stderr, "original output error [%ld] norm = %.8e\n", timeIdxInSeq, error.FrobeniusNorm());
 #endif
 
-                    PrepareThisErrorsBeforeBackProp(timeIdxInSeq, nT, error, stateError, grdToPrevOutput, grdToPrevState,
+                PrepareThisErrorsBeforeBackProp(timeIdxInSeq, nT, error, stateError, grdToPrevOutput, grdToPrevState,
                                                 m_obs_error_from_future_minibatch, m_state_error_from_future_minibatch, GetNumParallelSequences(), nullptr /*&m_pMBLayout->GetM()*/ /*BUGBUG: no longer functional*/);
 
 #ifdef DEBUG_DECODER
-                    fprintf(stderr, "output error [%ld] norm = %.8e\n", timeIdxInSeq, error.FrobeniusNorm());
-                    fprintf(stderr, "state error [%ld] norm = %.8e\n", timeIdxInSeq, stateError.FrobeniusNorm());
+                fprintf(stderr, "output error [%ld] norm = %.8e\n", timeIdxInSeq, error.FrobeniusNorm());
+                fprintf(stderr, "state error [%ld] norm = %.8e\n", timeIdxInSeq, stateError.FrobeniusNorm());
 #endif
 
-                    grdToPrevOutput.Resize(slicePrevOutput.GetNumRows(), slicePrevOutput.GetNumCols());
-                    grdToPrevState.Resize(slicePrevState.GetNumRows(), slicePrevState.GetNumCols());
-                    grdToPrevOutput.SetValue(0);
-                    grdToPrevState.SetValue(0);
+                grdToPrevOutput.Resize(slicePrevOutput.GetNumRows(), slicePrevOutput.GetNumCols());
+                grdToPrevState.Resize(slicePrevState.GetNumRows(), slicePrevState.GetNumCols());
+                grdToPrevOutput.SetValue(0);
+                grdToPrevState.SetValue(0);
 
                 PrepareHistory(timeIdxInSeq, mSlicePrevOutput, mSlicePrevState, Value(), m_State, m_PastOutput, m_PastState, GetNumParallelSequences(), m_DefaultState, nullptr /*&m_pMBLayout->GetM()*/ /*BUGBUG: no longer functional*/);
 
-                    ComputeInputGradientWrtGates(
-                        error,
-                        sliceObs,
-                        grdToObsSlice,
-                        Input(1)->Value(),
-                        grdToInputGate,
-                        Input(2)->Value(),
-                        grdToForgetGate,
-                        Input(3)->Value(),
-                        grdToOutputGate,
-                        Input(4)->Value(),
-                        grdToCellWgt,
-                        mSlicePrevOutput,
-                        mSlicePrevState,
-                        stateError,
-                        sliceState,
-                        sliceTanhState,
-                        sliceTanhObs,
-                        sliceGi,
-                        sliceGf,
-                        sliceGo,
-                        grdToPrevOutput,
-                        grdToPrevState,
+                ComputeInputGradientWrtGates(
+                    error,
+                    sliceObs,
+                    grdToObsSlice,
+                    Input(1)->Value(),
+                    grdToInputGate,
+                    Input(2)->Value(),
+                    grdToForgetGate,
+                    Input(3)->Value(),
+                    grdToOutputGate,
+                    Input(4)->Value(),
+                    grdToCellWgt,
+                    mSlicePrevOutput,
+                    mSlicePrevState,
+                    stateError,
+                    sliceState,
+                    sliceTanhState,
+                    sliceTanhObs,
+                    sliceGi,
+                    sliceGf,
+                    sliceGo,
+                    grdToPrevOutput,
+                    grdToPrevState,
                     m_tempMatrix);
-                    DataFor(grdToObs, fr).SetValue(grdToObsSlice);
+                DataFor(grdToObs, fr).SetValue(grdToObsSlice);
 
                 PrepareErrors(timeIdxInSeq, grdToPrevOutput, grdToPrevState, GetNumParallelSequences(), nullptr /*&m_pMBLayout->GetM()*/ /*BUGBUG: no longer functional*/);
-                }
+            }
 #ifdef DEBUG_DECODER
-                fprintf(stderr, "after error prop b_c norm = %.8e\n", Input(4)->Value().ColumnSlice(0, 1).FrobeniusNorm());
+            fprintf(stderr, "after error prop b_c norm = %.8e\n", Input(4)->Value().ColumnSlice(0, 1).FrobeniusNorm());
 #endif
-                m_obs_error_from_future_minibatch = grdToPrevOutput;
-                m_state_error_from_future_minibatch = grdToPrevState;
+            m_obs_error_from_future_minibatch = grdToPrevOutput;
+            m_state_error_from_future_minibatch = grdToPrevState;
 
 #ifdef DEBUG_DECODER
-                fprintf(stderr, "pass error to encoder error = %.4e state error = %.4e\n", m_obs_error_from_future_minibatch.FrobeniusNorm(), m_state_error_from_future_minibatch.FrobeniusNorm());
+            fprintf(stderr, "pass error to encoder error = %.4e state error = %.4e\n", m_obs_error_from_future_minibatch.FrobeniusNorm(), m_state_error_from_future_minibatch.FrobeniusNorm());
 #endif
-                m_GradientComputed = true;
-            }
+            m_GradientComputed = true;
+        }
 
         if (inputIndex == 0) //derivative with regard to the observation
-            {
-                if (Input(inputIndex)->Gradient().HasNoElements())
-                    Input(inputIndex)->Gradient().SetValue(grdToObs);
-                else
-                    Input(inputIndex)->Gradient() += grdToObs;
-            }
+        {
+            if (Input(inputIndex)->Gradient().HasNoElements())
+                Input(inputIndex)->Gradient().SetValue(grdToObs);
+            else
+                Input(inputIndex)->Gradient() += grdToObs;
+        }
 
-            if (inputIndex == 1)
-            {
-                if (Input(inputIndex)->Gradient().HasNoElements())
-                    Input(inputIndex)->Gradient().SetValue(grdToInputGate);
-                else
-                    Input(inputIndex)->Gradient() += grdToInputGate;
-            }
+        if (inputIndex == 1)
+        {
+            if (Input(inputIndex)->Gradient().HasNoElements())
+                Input(inputIndex)->Gradient().SetValue(grdToInputGate);
+            else
+                Input(inputIndex)->Gradient() += grdToInputGate;
+        }
 
-            if (inputIndex == 2)
-            {
-                if (Input(inputIndex)->Gradient().HasNoElements())
-                    Input(inputIndex)->Gradient().SetValue(grdToForgetGate);
-                else
-                    Input(inputIndex)->Gradient() += grdToForgetGate;
-            }
+        if (inputIndex == 2)
+        {
+            if (Input(inputIndex)->Gradient().HasNoElements())
+                Input(inputIndex)->Gradient().SetValue(grdToForgetGate);
+            else
+                Input(inputIndex)->Gradient() += grdToForgetGate;
+        }
 
-            if (inputIndex == 3)
-            {
-                if (Input(inputIndex)->Gradient().HasNoElements())
-                    Input(inputIndex)->Gradient().SetValue(grdToOutputGate);
-                else
-                    Input(inputIndex)->Gradient() += grdToOutputGate;
-            }
+        if (inputIndex == 3)
+        {
+            if (Input(inputIndex)->Gradient().HasNoElements())
+                Input(inputIndex)->Gradient().SetValue(grdToOutputGate);
+            else
+                Input(inputIndex)->Gradient() += grdToOutputGate;
+        }
 
-            if (inputIndex == 4)
-            {
-                if (Input(inputIndex)->Gradient().HasNoElements())
-                    Input(inputIndex)->Gradient().SetValue(grdToCellWgt);
-                else
-                    Input(inputIndex)->Gradient() += grdToCellWgt;
-            }
+        if (inputIndex == 4)
+        {
+            if (Input(inputIndex)->Gradient().HasNoElements())
+                Input(inputIndex)->Gradient().SetValue(grdToCellWgt);
+            else
+                Input(inputIndex)->Gradient() += grdToCellWgt;
+        }
 #ifdef DEBUG_DECODER
-            fprintf(stderr, "LSTM gradient[%d] norm = %.8e\n", inputIndex, Input(inputIndex)->Gradient().FrobeniusNorm());
+        fprintf(stderr, "LSTM gradient[%d] norm = %.8e\n", inputIndex, Input(inputIndex)->Gradient().FrobeniusNorm());
 #endif
-        }
+    }
 
-        static void WINAPI GradientOfTanh(const Matrix<ElemType>& functionValues,
-            const Matrix<ElemType>& gradientOut,
-            Matrix<ElemType>& inputGradientValues,
-            Matrix<ElemType>& extTmp)
-        {
-            Matrix<ElemType> mTmp(inputGradientValues.GetDeviceId());
-            extTmp.AssignElementProductOf(functionValues, functionValues); // v .* v
+    static void WINAPI GradientOfTanh(const Matrix<ElemType>& functionValues,
+                                      const Matrix<ElemType>& gradientOut,
+                                      Matrix<ElemType>& inputGradientValues,
+                                      Matrix<ElemType>& extTmp)
+    {
+        Matrix<ElemType> mTmp(inputGradientValues.GetDeviceId());
+        extTmp.AssignElementProductOf(functionValues, functionValues); // v .* v
         mTmp.AssignDifferenceOf(1, extTmp);                            // 1-v^2
-            if (inputGradientValues.GetNumRows() != functionValues.GetNumRows() ||
-                inputGradientValues.GetNumCols() != functionValues.GetNumCols())
-                LogicError("LSTMNode::GradientOfTanh : inputGradientValues need to be pre-allocated!");
-            inputGradientValues.AddElementProductOf(gradientOut, mTmp); //  d .* ((1-v) .* v))
-        }
+        if (inputGradientValues.GetNumRows() != functionValues.GetNumRows() ||
+            inputGradientValues.GetNumCols() != functionValues.GetNumCols())
+            LogicError("LSTMNode::GradientOfTanh : inputGradientValues need to be pre-allocated!");
+        inputGradientValues.AddElementProductOf(gradientOut, mTmp); // += d .* (1 - v .* v)
+    }
 
-        static void WINAPI ComputeInputGradientWrtGates(
+    static void WINAPI ComputeInputGradientWrtGates(
         const Matrix<ElemType>& outGrd, // the error to h_t from upper layer
         const Matrix<ElemType>& obs,
         Matrix<ElemType>& grdToObs,
-            const Matrix<ElemType>& mInputGate,
+        const Matrix<ElemType>& mInputGate,
         Matrix<ElemType>& grdToInputGate,
         const Matrix<ElemType>& mForgetGate,
         Matrix<ElemType>& grdToForgetGate,
         const Matrix<ElemType>& mOutputGate,
-            Matrix<ElemType>& grdToOutputGate,
+        Matrix<ElemType>& grdToOutputGate,
         const Matrix<ElemType>& mCellWgt,
         Matrix<ElemType>& grdToCellWgt,
-            const Matrix<ElemType>& prevOutput,
-            const Matrix<ElemType>& prevState,
+        const Matrix<ElemType>& prevOutput,
+        const Matrix<ElemType>& prevState,
         const Matrix<ElemType>& stateError, // the error propagated to cell from t+1
         const Matrix<ElemType>& state,
         const Matrix<ElemType>& tanhState,
@@ -2257,277 +2258,277 @@ class LSTMNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>,
         Matrix<ElemType>& grdToPrevOutput,
         Matrix<ElemType>& grdToPrevState,
         Matrix<ElemType>& tmpMat)
-        {
-            int inputDim = obs.GetNumRows();
-            int outputDim = mOutputGate.GetNumRows();
-
-            assert(grdToPrevOutput.FrobeniusNorm() == 0);
-            assert(grdToPrevState.FrobeniusNorm() == 0);
-            assert(state.FrobeniusNorm() > 0);
-            Matrix<ElemType> Who = mOutputGate.ColumnSlice(1 + inputDim, outputDim);
-            Matrix<ElemType> Wco = mOutputGate.ColumnSlice(1 + inputDim + outputDim, 1);
-            Matrix<ElemType> Wxo = mOutputGate.ColumnSlice(1, inputDim);
-            Matrix<ElemType> grdToWho = grdToOutputGate.ColumnSlice(1 + inputDim, outputDim);
-            Matrix<ElemType> grdToWco = grdToOutputGate.ColumnSlice(1 + inputDim + outputDim, 1);
-            Matrix<ElemType> grdToWxo = grdToOutputGate.ColumnSlice(1, inputDim);
-            Matrix<ElemType> grdTobo = grdToOutputGate.ColumnSlice(0, 1);
-
-            Matrix<ElemType> Whf = mForgetGate.ColumnSlice(1 + inputDim, outputDim);
-            Matrix<ElemType> Wcf = mForgetGate.ColumnSlice(1 + inputDim + outputDim, 1);
-            Matrix<ElemType> Wxf = mForgetGate.ColumnSlice(1, inputDim);
-            Matrix<ElemType> grdToWhf = grdToForgetGate.ColumnSlice(1 + inputDim, outputDim);
-            Matrix<ElemType> grdToWcf = grdToForgetGate.ColumnSlice(1 + inputDim + outputDim, 1);
-            Matrix<ElemType> grdToWxf = grdToForgetGate.ColumnSlice(1, inputDim);
-            Matrix<ElemType> grdTobf = grdToForgetGate.ColumnSlice(0, 1);
-
-            Matrix<ElemType> Wxc = mCellWgt.ColumnSlice(1, inputDim);
-            Matrix<ElemType> Whc = mCellWgt.ColumnSlice(1 + inputDim, outputDim);
-            Matrix<ElemType> grdToWxc = grdToCellWgt.ColumnSlice(1, inputDim);
-            Matrix<ElemType> grdToWhc = grdToCellWgt.ColumnSlice(1 + inputDim, outputDim);
-            Matrix<ElemType> grdTobc = grdToCellWgt.ColumnSlice(0, 1);
-
-            Matrix<ElemType> Whi = mInputGate.ColumnSlice(1 + inputDim, outputDim);
-            Matrix<ElemType> Wci = mInputGate.ColumnSlice(1 + inputDim + outputDim, 1);
-            Matrix<ElemType> Wxi = mInputGate.ColumnSlice(1, inputDim);
-            Matrix<ElemType> grdToWhi = grdToInputGate.ColumnSlice(1 + inputDim, outputDim);
-            Matrix<ElemType> grdToWci = grdToInputGate.ColumnSlice(1 + inputDim + outputDim, 1);
-            Matrix<ElemType> grdToWxi = grdToInputGate.ColumnSlice(1, inputDim);
-            Matrix<ElemType> grdTobi = grdToInputGate.ColumnSlice(0, 1);
-
-            // error backpropagate to output gate
-            Matrix<ElemType> grdToGo(tmpMat.GetDeviceId()), gradientOfSigmoid(tmpMat.GetDeviceId());
-            Matrix<ElemType> grdBeforeGo(tmpMat.GetDeviceId()), grdBeforeInputGate(tmpMat.GetDeviceId());
-            Matrix<ElemType> grdToCell(tmpMat.GetDeviceId());
+    {
+        int inputDim = obs.GetNumRows();
+        int outputDim = mOutputGate.GetNumRows();
+
+        assert(grdToPrevOutput.FrobeniusNorm() == 0);
+        assert(grdToPrevState.FrobeniusNorm() == 0);
+        assert(state.FrobeniusNorm() > 0);
+        Matrix<ElemType> Who = mOutputGate.ColumnSlice(1 + inputDim, outputDim);
+        Matrix<ElemType> Wco = mOutputGate.ColumnSlice(1 + inputDim + outputDim, 1);
+        Matrix<ElemType> Wxo = mOutputGate.ColumnSlice(1, inputDim);
+        Matrix<ElemType> grdToWho = grdToOutputGate.ColumnSlice(1 + inputDim, outputDim);
+        Matrix<ElemType> grdToWco = grdToOutputGate.ColumnSlice(1 + inputDim + outputDim, 1);
+        Matrix<ElemType> grdToWxo = grdToOutputGate.ColumnSlice(1, inputDim);
+        Matrix<ElemType> grdTobo = grdToOutputGate.ColumnSlice(0, 1);
+
+        Matrix<ElemType> Whf = mForgetGate.ColumnSlice(1 + inputDim, outputDim);
+        Matrix<ElemType> Wcf = mForgetGate.ColumnSlice(1 + inputDim + outputDim, 1);
+        Matrix<ElemType> Wxf = mForgetGate.ColumnSlice(1, inputDim);
+        Matrix<ElemType> grdToWhf = grdToForgetGate.ColumnSlice(1 + inputDim, outputDim);
+        Matrix<ElemType> grdToWcf = grdToForgetGate.ColumnSlice(1 + inputDim + outputDim, 1);
+        Matrix<ElemType> grdToWxf = grdToForgetGate.ColumnSlice(1, inputDim);
+        Matrix<ElemType> grdTobf = grdToForgetGate.ColumnSlice(0, 1);
+
+        Matrix<ElemType> Wxc = mCellWgt.ColumnSlice(1, inputDim);
+        Matrix<ElemType> Whc = mCellWgt.ColumnSlice(1 + inputDim, outputDim);
+        Matrix<ElemType> grdToWxc = grdToCellWgt.ColumnSlice(1, inputDim);
+        Matrix<ElemType> grdToWhc = grdToCellWgt.ColumnSlice(1 + inputDim, outputDim);
+        Matrix<ElemType> grdTobc = grdToCellWgt.ColumnSlice(0, 1);
+
+        Matrix<ElemType> Whi = mInputGate.ColumnSlice(1 + inputDim, outputDim);
+        Matrix<ElemType> Wci = mInputGate.ColumnSlice(1 + inputDim + outputDim, 1);
+        Matrix<ElemType> Wxi = mInputGate.ColumnSlice(1, inputDim);
+        Matrix<ElemType> grdToWhi = grdToInputGate.ColumnSlice(1 + inputDim, outputDim);
+        Matrix<ElemType> grdToWci = grdToInputGate.ColumnSlice(1 + inputDim + outputDim, 1);
+        Matrix<ElemType> grdToWxi = grdToInputGate.ColumnSlice(1, inputDim);
+        Matrix<ElemType> grdTobi = grdToInputGate.ColumnSlice(0, 1);
+
+        // error backpropagate to output gate
+        Matrix<ElemType> grdToGo(tmpMat.GetDeviceId()), gradientOfSigmoid(tmpMat.GetDeviceId());
+        Matrix<ElemType> grdBeforeGo(tmpMat.GetDeviceId()), grdBeforeInputGate(tmpMat.GetDeviceId());
+        Matrix<ElemType> grdToCell(tmpMat.GetDeviceId());
 
         tmpMat.AssignElementProductOf(outGrd, tanhState); // error to o_t
-            gradientOfSigmoid.AssignSigmoidDerivativeOf(go);
+        gradientOfSigmoid.AssignSigmoidDerivativeOf(go);
         grdBeforeGo.AssignElementProductOf(tmpMat, gradientOfSigmoid); // error before softmax
 #ifdef DEBUG_DECODER
-            fprintf(stderr, "output gate error = %.4e\n", grdBeforeGo(0, 0));
+        fprintf(stderr, "output gate error = %.4e\n", grdBeforeGo(0, 0));
 #endif
         Matrix<ElemType>::MultiplyAndAdd(Who, true, grdBeforeGo, false, grdToPrevOutput); // error to previous output
         Matrix<ElemType>::MultiplyAndAdd(Wxo, true, grdBeforeGo, false, grdToObs);        // error to observation
-            tmpMat = grdBeforeGo;
-            tmpMat.ColumnElementMultiplyWith(Wco);
+        tmpMat = grdBeforeGo;
+        tmpMat.ColumnElementMultiplyWith(Wco);
         grdToCell = tmpMat; // error to memory cell
 
-            Matrix<ElemType>::MultiplyAndAdd(grdBeforeGo, false, prevOutput, true, grdToWho); // gradient to Who
+        Matrix<ElemType>::MultiplyAndAdd(grdBeforeGo, false, prevOutput, true, grdToWho); // gradient to Who
         Matrix<ElemType>::MultiplyAndAdd(grdBeforeGo, false, obs, true, grdToWxo);        // gradient to Wxo
-            tmpMat.AssignInnerProductOf(grdBeforeGo, state, false);
+        tmpMat.AssignInnerProductOf(grdBeforeGo, state, false);
         grdToWco += tmpMat; // to Wco
-            for (size_t i = 0; i < grdBeforeGo.GetNumCols(); i++)
-            {
+        for (size_t i = 0; i < grdBeforeGo.GetNumCols(); i++)
+        {
             grdTobo += grdBeforeGo.ColumnSlice(i, 1); // gradient to bo
-            }
+        }
 
         grdToGo.AssignElementProductOf(outGrd, go);            // error to tanh
-            GradientOfTanh(tanhState, grdToGo, grdToCell, tmpMat); // error to memory cell
+        GradientOfTanh(tanhState, grdToGo, grdToCell, tmpMat); // error to memory cell
         grdToCell += stateError;                               // add error to memory cell from t+1
 #ifdef DEBUG_DECODER
-            fprintf(stderr, "previous state[0] = %.4e norm = %.4e\n", prevState(0, 0), prevState.FrobeniusNorm());
-            fprintf(stderr, "state error = %.4e\n", grdToCell(0, 0));
-            fprintf(stderr, "state error norm = %.4e\n", grdToCell.FrobeniusNorm());
+        fprintf(stderr, "previous state[0] = %.4e norm = %.4e\n", prevState(0, 0), prevState.FrobeniusNorm());
+        fprintf(stderr, "state error = %.4e\n", grdToCell(0, 0));
+        fprintf(stderr, "state error norm = %.4e\n", grdToCell.FrobeniusNorm());
 #endif
-            // error backpropagate to memory cells
+        // error backpropagate to memory cells
         grdToPrevState.AssignElementProductOf(gf, grdToCell); // error to previous memory cell
-            // be careful, need to double check if errors are missing
+        // be careful, need to double check if errors are missing
 
-            Matrix<ElemType> grdBeforeForget(tmpMat.GetDeviceId());
+        Matrix<ElemType> grdBeforeForget(tmpMat.GetDeviceId());
         tmpMat.AssignElementProductOf(prevState, grdToCell); // error to f_t
-            gradientOfSigmoid.AssignSigmoidDerivativeOf(gf);
-            grdBeforeForget.AssignElementProductOf(gradientOfSigmoid, tmpMat); // error before forget gate
+        gradientOfSigmoid.AssignSigmoidDerivativeOf(gf);
+        grdBeforeForget.AssignElementProductOf(gradientOfSigmoid, tmpMat); // error before forget gate
 #ifdef DEBUG_DECODER
-            fprintf(stderr, "forget gate error = %.4e\n", grdBeforeForget(0, 0));
+        fprintf(stderr, "forget gate error = %.4e\n", grdBeforeForget(0, 0));
 #endif
 
         Matrix<ElemType>::MultiplyAndAdd(Whf, true, grdBeforeForget, false, grdToPrevOutput); // error to previous output
-            tmpMat = grdBeforeForget;
-            tmpMat.ColumnElementMultiplyWith(Wcf);
+        tmpMat = grdBeforeForget;
+        tmpMat.ColumnElementMultiplyWith(Wcf);
         grdToPrevState += tmpMat; // error to previous state
 
         Matrix<ElemType>::MultiplyAndAdd(Wxf, true, grdBeforeForget, false, grdToObs); // error to observation
 
-            Matrix<ElemType>::MultiplyAndAdd(grdBeforeForget, false, prevOutput, true, grdToWhf); // gradient to Whf
-            tmpMat.AssignInnerProductOf(grdBeforeForget, prevState, false);
+        Matrix<ElemType>::MultiplyAndAdd(grdBeforeForget, false, prevOutput, true, grdToWhf); // gradient to Whf
+        tmpMat.AssignInnerProductOf(grdBeforeForget, prevState, false);
         grdToWcf += tmpMat; // gradient to Wcf
 
-            Matrix<ElemType>::MultiplyAndAdd(grdBeforeForget, false, obs, true, grdToWxf); // gradient to Wxf
-            for (size_t i = 0; i < grdBeforeForget.GetNumCols(); i++)
+        Matrix<ElemType>::MultiplyAndAdd(grdBeforeForget, false, obs, true, grdToWxf); // gradient to Wxf
+        for (size_t i = 0; i < grdBeforeForget.GetNumCols(); i++)
             grdTobf += grdBeforeForget.ColumnSlice(i, 1); // gradient to bf
 
-            // error backpropagate to input gate
-            tmpMat.AssignElementProductOf(tanhBeforeApplyingInputGating, grdToCell);
-            gradientOfSigmoid.AssignSigmoidDerivativeOf(gi);
-            grdBeforeInputGate.AssignElementProductOf(gradientOfSigmoid, tmpMat); // error before input gate
+        // error backpropagate to input gate
+        tmpMat.AssignElementProductOf(tanhBeforeApplyingInputGating, grdToCell);
+        gradientOfSigmoid.AssignSigmoidDerivativeOf(gi);
+        grdBeforeInputGate.AssignElementProductOf(gradientOfSigmoid, tmpMat); // error before input gate
 #ifdef DEBUG_DECODER
-            fprintf(stderr, "input gate error = %.4e\n", grdBeforeInputGate(0, 0));
+        fprintf(stderr, "input gate error = %.4e\n", grdBeforeInputGate(0, 0));
 #endif
 
         Matrix<ElemType>::MultiplyAndAdd(Whi, true, grdBeforeInputGate, false, grdToPrevOutput); // error to previous output
-            tmpMat = grdBeforeInputGate;
-            tmpMat.ColumnElementMultiplyWith(Wci);
+        tmpMat = grdBeforeInputGate;
+        tmpMat.ColumnElementMultiplyWith(Wci);
         grdToPrevState += tmpMat; // error to previous state
 
 #ifdef DEBUG_DECODER
-            fprintf(stderr, "to previous state error = %.4e\n", grdToPrevState(0, 0));
-            fprintf(stderr, "to previous state error norm = %.4e\n", grdToPrevState.FrobeniusNorm());
+        fprintf(stderr, "to previous state error = %.4e\n", grdToPrevState(0, 0));
+        fprintf(stderr, "to previous state error norm = %.4e\n", grdToPrevState.FrobeniusNorm());
 #endif
         Matrix<ElemType>::MultiplyAndAdd(Wxi, true, grdBeforeInputGate, false, grdToObs); // error to observation
 
-            Matrix<ElemType>::MultiplyAndAdd(grdBeforeInputGate, false, prevOutput, true, grdToWhi); // gradient to Whi
-            tmpMat.AssignInnerProductOf(grdBeforeInputGate, prevState, false);
+        Matrix<ElemType>::MultiplyAndAdd(grdBeforeInputGate, false, prevOutput, true, grdToWhi); // gradient to Whi
+        tmpMat.AssignInnerProductOf(grdBeforeInputGate, prevState, false);
         grdToWci += tmpMat;                                                               // gradient to Wci
-            Matrix<ElemType>::MultiplyAndAdd(grdBeforeInputGate, false, obs, true, grdToWxi); // gradient to Wxi
-            for (size_t i = 0; i < grdBeforeInputGate.GetNumCols(); i++)
+        Matrix<ElemType>::MultiplyAndAdd(grdBeforeInputGate, false, obs, true, grdToWxi); // gradient to Wxi
+        for (size_t i = 0; i < grdBeforeInputGate.GetNumCols(); i++)
             grdTobi += grdBeforeInputGate.ColumnSlice(i, 1); // gradient to bi
 
-            // error backpropagate to inputs
-            Matrix<ElemType> grdTmp2(tmpMat.GetDeviceId());
-            Matrix<ElemType> grdBeforeTanhInputGate(tmpMat.GetDeviceId());
-            grdTmp2.AssignElementProductOf(gi, grdToCell);
-            grdBeforeTanhInputGate.Resize(tanhBeforeApplyingInputGating.GetNumRows(), tanhBeforeApplyingInputGating.GetNumCols());
-            GradientOfTanh(tanhBeforeApplyingInputGating, grdTmp2, grdBeforeTanhInputGate, tmpMat); // error to memory cell
+        // error backpropagate to inputs
+        Matrix<ElemType> grdTmp2(tmpMat.GetDeviceId());
+        Matrix<ElemType> grdBeforeTanhInputGate(tmpMat.GetDeviceId());
+        grdTmp2.AssignElementProductOf(gi, grdToCell);
+        grdBeforeTanhInputGate.Resize(tanhBeforeApplyingInputGating.GetNumRows(), tanhBeforeApplyingInputGating.GetNumCols());
+        GradientOfTanh(tanhBeforeApplyingInputGating, grdTmp2, grdBeforeTanhInputGate, tmpMat); // error to memory cell
         Matrix<ElemType>::MultiplyAndAdd(Wxc, true, grdBeforeTanhInputGate, false, grdToObs);   // error to observation
 #ifdef DEBUG_DECODER
-            fprintf(stderr, "to observation error = %.4e\n", grdToObs(0, 0));
+        fprintf(stderr, "to observation error = %.4e\n", grdToObs(0, 0));
 #endif
 
         Matrix<ElemType>::MultiplyAndAdd(Whc, true, grdBeforeTanhInputGate, false, grdToPrevOutput); // error to previous output
         Matrix<ElemType>::MultiplyAndAdd(grdBeforeTanhInputGate, false, obs, true, grdToWxc);        // gradient to Wxc
 
-            Matrix<ElemType>::MultiplyAndAdd(grdBeforeTanhInputGate, false, prevOutput, true, grdToWhc); // gradient to Whc
-            for (size_t i = 0; i < grdBeforeTanhInputGate.GetNumCols(); i++)
+        Matrix<ElemType>::MultiplyAndAdd(grdBeforeTanhInputGate, false, prevOutput, true, grdToWhc); // gradient to Whc
+        for (size_t i = 0; i < grdBeforeTanhInputGate.GetNumCols(); i++)
             grdTobc += grdBeforeTanhInputGate.ColumnSlice(i, 1); // gradient to bc
-        }
+    }
 
-        /**
+    /**
         get the segmentation information, SENTENECE_BEGIN, ((int) MinibatchPackingFlags::None), ((int) MinibatchPackingFlags::NoInput) 
         for time at t and stream of streamid
         */
-        int GetSegInfo(size_t t, size_t streamid)
-        {
-            if (streamid >= GetNumParallelSequences())
+    int GetSegInfo(size_t t, size_t streamid)
+    {
+        if (streamid >= GetNumParallelSequences())
             LogicError("GetSegInfo: stream id %d is larger than the number of streams %d", (int) streamid, (int) GetNumParallelSequences());
 
-            Matrix<float> thisCol; // BUGBUG: These flags no longer exist. This code is no longer functional.
-            //size_t nT = Input(0)->GetSampleMatrixNumCols();
-            //if (t >= nT)
-            //    LogicError("GetSegInfo: time %d times is larger than the total number of observations %d", (int)t, (int)nT);
-            //int utt_t = (int)t / GetNumParallelSequences();
-            //auto thisCol = m_pMBLayout->GetFrame(utt_t).first;
-            thisCol.Reshape(1, GetNumParallelSequences());
-            return (int) thisCol.ColumnSlice(streamid, 1).Get00Element();
-        }
+        Matrix<float> thisCol; // BUGBUG: These flags no longer exist. This code is no longer functional.
+        //size_t nT = Input(0)->GetSampleMatrixNumCols();
+        //if (t >= nT)
+        //    LogicError("GetSegInfo: time %d times is larger than the total number of observations %d", (int)t, (int)nT);
+        //int utt_t = (int)t / GetNumParallelSequences();
+        //auto thisCol = m_pMBLayout->GetFrame(utt_t).first;
+        thisCol.Reshape(1, GetNumParallelSequences());
+        return (int) thisCol.ColumnSlice(streamid, 1).Get00Element();
+    }
 
-        /**
+    /**
         save the last hidden layer activity and output
         */
-        void SaveLastStateActity()
-        {
-            size_t nT = Input(0)->GetSampleMatrixNumCols();
-            size_t outputDim = Input(1)->GetSampleMatrixNumRows();
-            
-            // save the hidden activities and output for the next minibatch
-            mLastOutput.Resize(outputDim, GetNumParallelSequences());
-            mLastState.Resize(outputDim, GetNumParallelSequences());
+    void SaveLastStateActity()
+    {
+        size_t nT = Input(0)->GetSampleMatrixNumCols();
+        size_t outputDim = Input(1)->GetSampleMatrixNumRows();
+
+        // save the hidden activities and output for the next minibatch
+        mLastOutput.Resize(outputDim, GetNumParallelSequences());
+        mLastState.Resize(outputDim, GetNumParallelSequences());
 
-            for (size_t i = 0; i < GetNumParallelSequences(); i++)
+        for (size_t i = 0; i < GetNumParallelSequences(); i++)
+        {
+            for (int t = nT - GetNumParallelSequences() + i; t >= 0; t -= GetNumParallelSequences())
             {
-                for (int t = nT - GetNumParallelSequences() + i; t >= 0; t -= GetNumParallelSequences())
+                if (GetSegInfo(t, i) == ((int) MinibatchPackingFlags::None))
                 {
-                    if (GetSegInfo(t, i) == ((int) MinibatchPackingFlags::None))
-                    {
-                        mLastOutput.ColumnSlice(i, 1).SetValue(Value().ColumnSlice(t, 1));
-                        mLastState.ColumnSlice(i, 1).SetValue(m_State.ColumnSlice(t, 1));
-                        break;
-                    }
+                    mLastOutput.ColumnSlice(i, 1).SetValue(Value().ColumnSlice(t, 1));
+                    mLastState.ColumnSlice(i, 1).SetValue(m_State.ColumnSlice(t, 1));
+                    break;
                 }
             }
         }
+    }
 
     virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
-        {
-            size_t nT = Input(0)->GetSampleMatrixNumCols();
-            size_t outputDim = Input(1)->GetSampleMatrixNumRows();
+    {
+        size_t nT = Input(0)->GetSampleMatrixNumCols();
+        size_t outputDim = Input(1)->GetSampleMatrixNumRows();
 
-            {
-                SetDims1(outputDim, nT);
+        {
+            SetDims1(outputDim, nT);
             Value().SetValue(NAN); // set to this extrem value so, if anything wrong in later procedure, problems can be easily spotted.
-                m_State.Resize(outputDim, nT);
+            m_State.Resize(outputDim, nT);
             m_State.SetValue(NAN); // set to this extrem value so, if anything wrong in later procedure, problems can be easily spotted.
-                m_Gi.Resize(outputDim, nT);
+            m_Gi.Resize(outputDim, nT);
             m_Gi.SetValue(NAN); // set to this extrem value so, if anything wrong in later procedure, problems can be easily spotted.
-                m_Gf.Resize(outputDim, nT);
+            m_Gf.Resize(outputDim, nT);
             m_Gf.SetValue(NAN); // set to this extrem value so, if anything wrong in later procedure, problems can be easily spotted.
-                m_Go.Resize(outputDim, nT);
+            m_Go.Resize(outputDim, nT);
             m_Go.SetValue(NAN); // set to this extrem value so, if anything wrong in later procedure, problems can be easily spotted.
-                tanhState.Resize(outputDim, nT);
+            tanhState.Resize(outputDim, nT);
             tanhState.SetValue(NAN); // set to this extrem value so, if anything wrong in later procedure, problems can be easily spotted.
-                tanhObs.Resize(outputDim, nT);
+            tanhObs.Resize(outputDim, nT);
             tanhObs.SetValue(NAN); // set to this extrem value so, if anything wrong in later procedure, problems can be easily spotted.
 
-                if (m_PastState.IsEmpty() || m_PastState.GetNumCols() != GetNumParallelSequences())
-                {
-                    m_PastState.Resize(outputDim, GetNumParallelSequences());
-                    m_PastState.SetValue(m_DefaultState);
-                }
-                if (m_PastOutput.IsEmpty() || m_PastOutput.GetNumCols() != GetNumParallelSequences())
-                {
-                    m_PastOutput.Resize(outputDim, GetNumParallelSequences());
-                }
+            if (m_PastState.IsEmpty() || m_PastState.GetNumCols() != GetNumParallelSequences())
+            {
+                m_PastState.Resize(outputDim, GetNumParallelSequences());
+                m_PastState.SetValue(m_DefaultState);
+            }
+            if (m_PastOutput.IsEmpty() || m_PastOutput.GetNumCols() != GetNumParallelSequences())
+            {
+                m_PastOutput.Resize(outputDim, GetNumParallelSequences());
+            }
 
 #ifdef DEBUG_DECODER
-                if (m_PastOutput.IsEmpty() == false)
-                    fprintf(stderr, "LSTM node %ls past output norm = %.8e\n", this->NodeName().c_str(), m_PastOutput.FrobeniusNorm());
-                if (m_PastState.IsEmpty() == false)
-                    fprintf(stderr, "LSTM node %ls past state norm = %.8e\n", this->NodeName().c_str(), m_PastState.FrobeniusNorm());
+            if (m_PastOutput.IsEmpty() == false)
+                fprintf(stderr, "LSTM node %ls past output norm = %.8e\n", this->NodeName().c_str(), m_PastOutput.FrobeniusNorm());
+            if (m_PastState.IsEmpty() == false)
+                fprintf(stderr, "LSTM node %ls past state norm = %.8e\n", this->NodeName().c_str(), m_PastState.FrobeniusNorm());
 #endif
 
-                for (size_t timeIdxInSeq = 0; timeIdxInSeq < nT; timeIdxInSeq += GetNumParallelSequences())
-                {
-                    FrameRange fr(m_pMBLayout, timeIdxInSeq);
-                    Matrix<ElemType> sliceObs = Input(0)->ValueFor(fr);
-                    Matrix<ElemType> sliceOutput = ValueFor(fr);
-                    Matrix<ElemType> sliceState = DataFor(m_State, fr);
+            for (size_t timeIdxInSeq = 0; timeIdxInSeq < nT; timeIdxInSeq += GetNumParallelSequences())
+            {
+                FrameRange fr(m_pMBLayout, timeIdxInSeq);
+                Matrix<ElemType> sliceObs = Input(0)->ValueFor(fr);
+                Matrix<ElemType> sliceOutput = ValueFor(fr);
+                Matrix<ElemType> sliceState = DataFor(m_State, fr);
 
-                    Matrix<ElemType> sliceGi = DataFor(m_Gi, fr);
-                    Matrix<ElemType> sliceGf = DataFor(m_Gf, fr);
-                    Matrix<ElemType> sliceGo = DataFor(m_Go, fr);
+                Matrix<ElemType> sliceGi = DataFor(m_Gi, fr);
+                Matrix<ElemType> sliceGf = DataFor(m_Gf, fr);
+                Matrix<ElemType> sliceGo = DataFor(m_Go, fr);
 
-                    Matrix<ElemType> sliceTanhState = DataFor(tanhState, fr);
-                    Matrix<ElemType> sliceTanhInput = DataFor(tanhObs, fr);
+                Matrix<ElemType> sliceTanhState = DataFor(tanhState, fr);
+                Matrix<ElemType> sliceTanhInput = DataFor(tanhObs, fr);
 
                 PrepareHistory(timeIdxInSeq, mSlicePrevOutput, mSlicePrevState, Value(), m_State, m_PastOutput, m_PastState, GetNumParallelSequences(), m_DefaultState, nullptr /*&m_pMBLayout->GetM()*/ /*BUGBUG: no longer functional*/);
 
-                    ForwardPropS(Input(1)->Value(), Input(2)->Value(), Input(3)->Value(), Input(4)->Value(),
-                            sliceObs, mSlicePrevOutput, mSlicePrevState, sliceOutput, sliceState, sliceGi, sliceGf, sliceGo, sliceTanhState, sliceTanhInput, m_tempMatrix);
-                }
+                ForwardPropS(Input(1)->Value(), Input(2)->Value(), Input(3)->Value(), Input(4)->Value(),
+                             sliceObs, mSlicePrevOutput, mSlicePrevState, sliceOutput, sliceState, sliceGi, sliceGf, sliceGo, sliceTanhState, sliceTanhInput, m_tempMatrix);
+            }
 
-                // save the hidden activities and output for the next minibatch
-                SaveLastStateActity();
+            // save the hidden activities and output for the next minibatch
+            SaveLastStateActity();
 
 #ifdef DEBUG_DECODER
-                if (mLastOutput.IsEmpty() == false)
-                    fprintf(stderr, "LSTM node %ls last output norm = %.8e\n", this->NodeName().c_str(), mLastOutput.FrobeniusNorm());
-                if (mLastState.IsEmpty() == false)
-                    fprintf(stderr, "LSTM node %ls last state norm = %.8e\n", this->NodeName().c_str(), mLastState.FrobeniusNorm());
+            if (mLastOutput.IsEmpty() == false)
+                fprintf(stderr, "LSTM node %ls last output norm = %.8e\n", this->NodeName().c_str(), mLastOutput.FrobeniusNorm());
+            if (mLastState.IsEmpty() == false)
+                fprintf(stderr, "LSTM node %ls last state norm = %.8e\n", this->NodeName().c_str(), mLastState.FrobeniusNorm());
 #endif
 
 #ifdef DEBUG_DECODER
-                ElemType tmpnorm = Value().FrobeniusNorm();
-                if (ISCLOSE(tmpnorm, 0.834251, 0.002))
-                    fprintf(stderr, "check!");
-                fprintf(stderr, "LSTM function norm = %.8e\n", tmpnorm);
-                for (size_t i = 0; i < 5; i++)
-                    fprintf(stderr, "LSTM input[%d] norm = %.8e ", i, Input(i)->Value().FrobeniusNorm());
-                fprintf(stderr, "\n");
+            ElemType tmpnorm = Value().FrobeniusNorm();
+            if (ISCLOSE(tmpnorm, 0.834251, 0.002))
+                fprintf(stderr, "check!");
+            fprintf(stderr, "LSTM function norm = %.8e\n", tmpnorm);
+            for (size_t i = 0; i < 5; i++)
+                fprintf(stderr, "LSTM input[%d] norm = %.8e ", i, Input(i)->Value().FrobeniusNorm());
+            fprintf(stderr, "\n");
 #endif
 
-                m_GradientComputed = false;
-            }
+            m_GradientComputed = false;
         }
+    }
 
-        /**
+    /**
         Prepare history for LSTMnode
 
         This function returns state and output from the previous time instance. For recurrent network, the initial state needs to be set in the case of sentence begining, which is carried over from sentenceBegin. In case of sentence begining, the state activity is set to an initial value. The sentenceBegin has element of ((int) MinibatchPackingFlags::SequenceStart), ((int) MinibatchPackingFlags::None) and ((int) MinibatchPackingFlags::NoInput), which are 0, 1, and -1, respectively. 
@@ -2538,9 +2539,9 @@ class LSTMNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>,
         prevOutput = sentenceBegin * pastOutput
 
         */
-        // prepare prevstate and prevoutput
-        static void WINAPI PrepareHistory(
-            size_t timeIdxInSeq,
+    // prepare prevstate and prevoutput
+    static void WINAPI PrepareHistory(
+        size_t timeIdxInSeq,
         Matrix<ElemType>& slicePrevOutput,
         Matrix<ElemType>& slicePrevState,
         const Matrix<ElemType>& output,
@@ -2548,165 +2549,165 @@ class LSTMNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>,
         const Matrix<ElemType>& pastOutput,
         const Matrix<ElemType>& pastState,
         size_t nsamples, const ElemType& initStateValue, const Matrix<float>* sentenceBegin)
-        {
-            size_t nRow = pastOutput.GetNumRows();
-            size_t nStream = sentenceBegin->GetNumRows();
+    {
+        size_t nRow = pastOutput.GetNumRows();
+        size_t nStream = sentenceBegin->GetNumRows();
 
-            assert(nStream == nsamples);
+        assert(nStream == nsamples);
 
         int utt_t = (int) floor(timeIdxInSeq / nsamples);
-            if (slicePrevOutput.IsEmpty() || slicePrevOutput.GetNumRows() != nRow || slicePrevOutput.GetNumCols() != nsamples)
-                slicePrevOutput.Resize(nRow, nsamples);
-            if (slicePrevState.IsEmpty() || slicePrevState.GetNumRows() != nRow || slicePrevState.GetNumCols() != nsamples)
-                slicePrevState.Resize(nRow, nsamples);
-
-            if (sentenceBegin->GetNumRows() != nsamples)
-                LogicError("Number of rows should be the same as the number of data streams");
-
-            Matrix<float> colBegin(sentenceBegin->GetDeviceId());
-            colBegin.SetValue(sentenceBegin->ColumnSlice(utt_t, 1));
-            Matrix<ElemType> colSeg(colBegin.GetDeviceId());
-            colSeg.Resize(nStream, nStream);
-            // will reset to 0 if sentence begining at a position is 0
-            // will keep the output if it is not the sentence begining
-            colBegin.InplaceTruncateBottom(((int) MinibatchPackingFlags::SequenceStart));
-            colBegin.InplaceTruncateTop(((int) MinibatchPackingFlags::None));
+        if (slicePrevOutput.IsEmpty() || slicePrevOutput.GetNumRows() != nRow || slicePrevOutput.GetNumCols() != nsamples)
+            slicePrevOutput.Resize(nRow, nsamples);
+        if (slicePrevState.IsEmpty() || slicePrevState.GetNumRows() != nRow || slicePrevState.GetNumCols() != nsamples)
+            slicePrevState.Resize(nRow, nsamples);
+
+        if (sentenceBegin->GetNumRows() != nsamples)
+            LogicError("Number of rows should be the same as the number of data streams");
+
+        Matrix<float> colBegin(sentenceBegin->GetDeviceId());
+        colBegin.SetValue(sentenceBegin->ColumnSlice(utt_t, 1));
+        Matrix<ElemType> colSeg(colBegin.GetDeviceId());
+        colSeg.Resize(nStream, nStream);
+        // will reset to 0 if sentence beginning at a position is 0
+        // will keep the output if it is not the sentence beginning
+        colBegin.InplaceTruncateBottom(((int) MinibatchPackingFlags::SequenceStart));
+        colBegin.InplaceTruncateTop(((int) MinibatchPackingFlags::None));
 #if 1
         initStateValue;
         pastState;
         pastOutput;
         state;
         output;
-            LogicError("PrepareHistory: finish this");
+        LogicError("PrepareHistory: finish this");
 #else
-            // BUGBUG: we need to upcast float to double here
-            colSeg.SetDiagonalValue(colBegin);
+        // BUGBUG: we need to upcast float to double here
+        colSeg.SetDiagonalValue(colBegin);
 
-            Matrix<ElemType> newPrevOutput(colBegin.GetDeviceId());
-            Matrix<ElemType> newPrevState(colBegin.GetDeviceId());
-            if (utt_t == 0)
-            {
-                // this is the begining of this minibatch
-                Matrix<ElemType>::Multiply(pastOutput.ColumnSlice(0, nsamples), false, colSeg, false, newPrevOutput);
-                Matrix<ElemType>::Multiply(pastState.ColumnSlice(0, nsamples), false, colSeg, false, newPrevState);
-            }
-            else
-            {
-                // this is in the minibatch
-                FrameRange fr(timeIdxInSeq, nsamples);
+        Matrix<ElemType> newPrevOutput(colBegin.GetDeviceId());
+        Matrix<ElemType> newPrevState(colBegin.GetDeviceId());
+        if (utt_t == 0)
+        {
+            // this is the beginning of this minibatch
+            Matrix<ElemType>::Multiply(pastOutput.ColumnSlice(0, nsamples), false, colSeg, false, newPrevOutput);
+            Matrix<ElemType>::Multiply(pastState.ColumnSlice(0, nsamples), false, colSeg, false, newPrevState);
+        }
+        else
+        {
+            // this is in the minibatch
+            FrameRange fr(timeIdxInSeq, nsamples);
             Matrix<ElemType>::Multiply(DataFor(output, fr /*TODO: delete the next two parameters*/, fr.t() - nsamples, nsamples), false, colSeg, false, newPrevOutput);
             Matrix<ElemType>::Multiply(DataFor(state, fr /*TODO: delete the next two parameters*/, fr.t() - nsamples, nsamples), false, colSeg, false, newPrevState);
-            }
+        }
 
-            Base::SetToInitStateValueForResetSeg(sentenceBegin->ColumnSlice(utt_t, 1), nStream, initStateValue, newPrevState);
+        Base::SetToInitStateValueForResetSeg(sentenceBegin->ColumnSlice(utt_t, 1), nStream, initStateValue, newPrevState);
 
-            slicePrevOutput.ColumnSlice(0, nsamples).SetValue(newPrevOutput);
-            slicePrevState.ColumnSlice(0, nsamples).SetValue(newPrevState);
+        slicePrevOutput.ColumnSlice(0, nsamples).SetValue(newPrevOutput);
+        slicePrevState.ColumnSlice(0, nsamples).SetValue(newPrevState);
 #endif
-        }
+    }
 
-        // prepare prevstate and prevoutput
-        void PrepareThisErrorsBeforeBackProp(
-            size_t timeIdxInSeq,
-            size_t nT, // number of columns
+    // prepare this step's error and stateError from grdToPrevOutput/grdToPrevState before back-propagation
+    void PrepareThisErrorsBeforeBackProp(
+        size_t timeIdxInSeq,
+        size_t nT, // number of columns
         Matrix<ElemType>& error,
         Matrix<ElemType>& stateError,
-            const Matrix<ElemType>& grdToPrevOutput,
-            const Matrix<ElemType>& grdToPrevState,
-            const Matrix<ElemType>& obs_error_from_future_minibatch,
-            const Matrix<ElemType>& state_error_from_future_minibatch,
-            size_t nsamples, const Matrix<float>* sentenceBegin)
-        {
+        const Matrix<ElemType>& grdToPrevOutput,
+        const Matrix<ElemType>& grdToPrevState,
+        const Matrix<ElemType>& obs_error_from_future_minibatch,
+        const Matrix<ElemType>& state_error_from_future_minibatch,
+        size_t nsamples, const Matrix<float>* sentenceBegin)
+    {
         int utt_t = (int) floor(timeIdxInSeq / nsamples);
         int total_utt_t = (int) floor(nT / nsamples);
 
-            error += grdToPrevOutput;
-            stateError = grdToPrevState;
+        error += grdToPrevOutput;
+        stateError = grdToPrevState;
 
-            if (m_use_errors_from_future_minibatch)
+        if (m_use_errors_from_future_minibatch)
+        {
+            for (size_t utt_id = 0; utt_id < nsamples; utt_id++)
             {
-                for (size_t utt_id = 0; utt_id < nsamples; utt_id++)
-                {
-                    // if uses errors from future minibatch
+                // if uses errors from future minibatch
                 if ((GetSegInfo(timeIdxInSeq, utt_id) == ((int) MinibatchPackingFlags::None) && utt_t == total_utt_t - 1)                                                                                            // last time
-                        || (utt_t < total_utt_t - 1 && GetSegInfo(timeIdxInSeq, utt_id) == ((int) MinibatchPackingFlags::None) && GetSegInfo(timeIdxInSeq + nsamples, utt_id) == ((int) MinibatchPackingFlags::NoInput)) // future observation is no observation
-                        )
-                    {
-                        error.ColumnSlice(utt_id, 1) += obs_error_from_future_minibatch.ColumnSlice(utt_id, 1);
-                        stateError.ColumnSlice(utt_id, 1) += state_error_from_future_minibatch.ColumnSlice(utt_id, 1);
-                    }
+                    || (utt_t < total_utt_t - 1 && GetSegInfo(timeIdxInSeq, utt_id) == ((int) MinibatchPackingFlags::None) && GetSegInfo(timeIdxInSeq + nsamples, utt_id) == ((int) MinibatchPackingFlags::NoInput)) // future observation is no observation
+                    )
+                {
+                    error.ColumnSlice(utt_id, 1) += obs_error_from_future_minibatch.ColumnSlice(utt_id, 1);
+                    stateError.ColumnSlice(utt_id, 1) += state_error_from_future_minibatch.ColumnSlice(utt_id, 1);
                 }
             }
+        }
 
 #if 1
-            sentenceBegin;
-            LogicError("PrepareThisErrorsBeforeBackProp: finish this");
+        sentenceBegin;
+        LogicError("PrepareThisErrorsBeforeBackProp: finish this");
 #else
-            Matrix<ElemType> colBegin(sentenceBegin->GetDeviceId());
-            colBegin.SetValue(sentenceBegin->ColumnSlice(utt_t, 1));
-            colBegin.InplaceTruncateBottom(((int) MinibatchPackingFlags::NoInput));
-            colBegin.InplaceTruncateTop(((int) MinibatchPackingFlags::SequenceStart));
-            colBegin += fabs((ElemType)((int) MinibatchPackingFlags::NoInput)); // raise this so that -1 -> 0 and therefore 
-            Matrix<ElemType> colSeg(colBegin.GetDeviceId());
-            colSeg.Resize(nsamples, nsamples);
-            colSeg.SetDiagonalValue(colBegin);
-
-            // times the errors with the mask
-            Matrix<ElemType> newOutputError(colBegin.GetDeviceId());
-            Matrix<ElemType> newStateError(colBegin.GetDeviceId());
-
-            Matrix<ElemType>::Multiply(error, false, colSeg, false, newOutputError);
-            Matrix<ElemType>::Multiply(stateError, false, colSeg, false, newStateError);
-            
-            error.ColumnSlice(0, nsamples).SetValue(newOutputError);
-            stateError.ColumnSlice(0, nsamples).SetValue(newStateError);
+        Matrix<ElemType> colBegin(sentenceBegin->GetDeviceId());
+        colBegin.SetValue(sentenceBegin->ColumnSlice(utt_t, 1));
+        colBegin.InplaceTruncateBottom(((int) MinibatchPackingFlags::NoInput));
+        colBegin.InplaceTruncateTop(((int) MinibatchPackingFlags::SequenceStart));
+        colBegin += fabs((ElemType)((int) MinibatchPackingFlags::NoInput)); // raise this so that -1 -> 0 and therefore
+        Matrix<ElemType> colSeg(colBegin.GetDeviceId());
+        colSeg.Resize(nsamples, nsamples);
+        colSeg.SetDiagonalValue(colBegin);
+
+        // times the errors with the mask
+        Matrix<ElemType> newOutputError(colBegin.GetDeviceId());
+        Matrix<ElemType> newStateError(colBegin.GetDeviceId());
+
+        Matrix<ElemType>::Multiply(error, false, colSeg, false, newOutputError);
+        Matrix<ElemType>::Multiply(stateError, false, colSeg, false, newStateError);
+
+        error.ColumnSlice(0, nsamples).SetValue(newOutputError);
+        stateError.ColumnSlice(0, nsamples).SetValue(newStateError);
 #endif
-        }
+    }
 
-        // prepare prevstate and prevoutput
-        static void WINAPI PrepareErrors(
-            size_t timeIdxInSeq,
+    // prepare errors and stateError (intended to mask them by sentenceBegin; currently not implemented)
+    static void WINAPI PrepareErrors(
+        size_t timeIdxInSeq,
         Matrix<ElemType>& errors,
         Matrix<ElemType>& stateError,
-            size_t nsamples, const Matrix<float>* sentenceBegin)
-        {
+        size_t nsamples, const Matrix<float>* sentenceBegin)
+    {
         int utt_t = (int) floor(timeIdxInSeq / nsamples);
-            Matrix<ElemType> colBegin(sentenceBegin->GetDeviceId());
+        Matrix<ElemType> colBegin(sentenceBegin->GetDeviceId());
 #if 1
         errors;
         stateError;
         utt_t;
-            LogicError("PrepareErrors: finish this");
+        LogicError("PrepareErrors: finish this");
 #else
-            colBegin.SetValue(sentenceBegin->ColumnSlice(utt_t, 1));
-            // will reset to 0 if sentence begining at a posiiton is 0
-            // will keep the output if it is not the sentence begining
-            colBegin.InplaceTruncateBottom(((int) MinibatchPackingFlags::SequenceStart));
-            colBegin.InplaceTruncateTop(((int) MinibatchPackingFlags::None));
+        colBegin.SetValue(sentenceBegin->ColumnSlice(utt_t, 1));
+        // will reset to 0 if sentence beginning at a position is 0
+        // will keep the output if it is not the sentence beginning
+        colBegin.InplaceTruncateBottom(((int) MinibatchPackingFlags::SequenceStart));
+        colBegin.InplaceTruncateTop(((int) MinibatchPackingFlags::None));
 
-            Matrix<ElemType> colSeg(colBegin.GetDeviceId());
-            colSeg.Resize(nsamples, nsamples);
-            colSeg.SetDiagonalValue(colBegin);
+        Matrix<ElemType> colSeg(colBegin.GetDeviceId());
+        colSeg.Resize(nsamples, nsamples);
+        colSeg.SetDiagonalValue(colBegin);
 
-            // times the errors with the mask
-            Matrix<ElemType> newOutputError(colBegin.GetDeviceId());
-            Matrix<ElemType> newStateError(colBegin.GetDeviceId());
+        // times the errors with the mask
+        Matrix<ElemType> newOutputError(colBegin.GetDeviceId());
+        Matrix<ElemType> newStateError(colBegin.GetDeviceId());
 
-            Matrix<ElemType>::Multiply(errors, false, colSeg, false, newOutputError);
-            Matrix<ElemType>::Multiply(stateError, false, colSeg, false, newStateError);
+        Matrix<ElemType>::Multiply(errors, false, colSeg, false, newOutputError);
+        Matrix<ElemType>::Multiply(stateError, false, colSeg, false, newStateError);
 
-            errors.ColumnSlice(0, nsamples).SetValue(newOutputError);
-            stateError.ColumnSlice(0, nsamples).SetValue(newStateError);
+        errors.ColumnSlice(0, nsamples).SetValue(newOutputError);
+        stateError.ColumnSlice(0, nsamples).SetValue(newStateError);
 #endif
-        }
+    }
 
     /*TODO: merge with call site*/ void ForwardPropS(
-            const Matrix<ElemType>& mInputGate,
+        const Matrix<ElemType>& mInputGate,
         const Matrix<ElemType>& mForgetGate, const Matrix<ElemType>& mOutputGate,
         const Matrix<ElemType>& mCellWgt,
         const Matrix<ElemType>& obs,
-            const Matrix<ElemType>& prevOutput,
-            const Matrix<ElemType>& prevState,
+        const Matrix<ElemType>& prevOutput,
+        const Matrix<ElemType>& prevState,
         Matrix<ElemType>& output,
         Matrix<ElemType>& state,
         Matrix<ElemType>& gi,
@@ -2715,68 +2716,68 @@ class LSTMNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>,
         Matrix<ElemType>& tanhState,
         Matrix<ElemType>& tanhObs,
         Matrix<ElemType>& tmp)
-        {
-            int inputDim = obs.GetNumRows();
-            int outputDim = mOutputGate.GetNumRows();
-
-            // for input gate
-            Matrix<ElemType>::Multiply(mInputGate.ColumnSlice(1, inputDim), false, obs, false, gi);
-            Matrix<ElemType>::MultiplyAndAdd(mInputGate.ColumnSlice(1 + inputDim, outputDim), false, prevOutput, false, gi);
-            gi += mInputGate.ColumnSlice(0, 1);
-            tmp = prevState;
-            tmp.ColumnElementMultiplyWith(mInputGate.ColumnSlice(1 + inputDim + outputDim, 1));
-            gi += tmp;
-            gi.AssignSigmoidOf(gi);
-
-            // for forget gate
-            Matrix<ElemType>::Multiply(mForgetGate.ColumnSlice(1, inputDim), false, obs, false, gf);
-            Matrix<ElemType>::MultiplyAndAdd(mForgetGate.ColumnSlice(1 + inputDim, outputDim), false, prevOutput, false, gf);
-            gf += mForgetGate.ColumnSlice(0, 1);
-            tmp = prevState;
-            tmp.ColumnElementMultiplyWith(mForgetGate.ColumnSlice(1 + inputDim + outputDim, 1));
-            gf += tmp;
-            gf.AssignSigmoidOf(gf);
-
-            // for cell state
-            Matrix<ElemType>::Multiply(mCellWgt.ColumnSlice(1, inputDim), false, obs, false, state);
-            Matrix<ElemType>::MultiplyAndAdd(mCellWgt.ColumnSlice(1 + inputDim, outputDim), false, prevOutput, false, state);
-            state += mCellWgt.ColumnSlice(0, 1);
+    {
+        int inputDim = obs.GetNumRows();
+        int outputDim = mOutputGate.GetNumRows();
+
+        // for input gate
+        Matrix<ElemType>::Multiply(mInputGate.ColumnSlice(1, inputDim), false, obs, false, gi);
+        Matrix<ElemType>::MultiplyAndAdd(mInputGate.ColumnSlice(1 + inputDim, outputDim), false, prevOutput, false, gi);
+        gi += mInputGate.ColumnSlice(0, 1);
+        tmp = prevState;
+        tmp.ColumnElementMultiplyWith(mInputGate.ColumnSlice(1 + inputDim + outputDim, 1));
+        gi += tmp;
+        gi.AssignSigmoidOf(gi);
+
+        // for forget gate
+        Matrix<ElemType>::Multiply(mForgetGate.ColumnSlice(1, inputDim), false, obs, false, gf);
+        Matrix<ElemType>::MultiplyAndAdd(mForgetGate.ColumnSlice(1 + inputDim, outputDim), false, prevOutput, false, gf);
+        gf += mForgetGate.ColumnSlice(0, 1);
+        tmp = prevState;
+        tmp.ColumnElementMultiplyWith(mForgetGate.ColumnSlice(1 + inputDim + outputDim, 1));
+        gf += tmp;
+        gf.AssignSigmoidOf(gf);
+
+        // for cell state
+        Matrix<ElemType>::Multiply(mCellWgt.ColumnSlice(1, inputDim), false, obs, false, state);
+        Matrix<ElemType>::MultiplyAndAdd(mCellWgt.ColumnSlice(1 + inputDim, outputDim), false, prevOutput, false, state);
+        state += mCellWgt.ColumnSlice(0, 1);
 #ifdef DEBUG_DECODER
 //            fprintf(stderr, "W_xc norm = %.8e\n", mCellWgt.ColumnSlice(1, inputDim).FrobeniusNorm());
 //            fprintf(stderr, "W_hc norm = %.8e\n", mCellWgt.ColumnSlice(1 + inputDim, outputDim).FrobeniusNorm());
 //            fprintf(stderr, "b_c norm = %.8e\n", mCellWgt.ColumnSlice(0, 1).FrobeniusNorm());
 #endif
-            tanhObs.AssignTanhOf(state);
-            state.AssignElementProductOf(gi, tanhObs);
-            state.AddElementProductOf(gf, prevState);
-
-            // for output gate
-            Matrix<ElemType>::Multiply(mOutputGate.ColumnSlice(1, inputDim), false, obs, false, go);
-            Matrix<ElemType>::MultiplyAndAdd(mOutputGate.ColumnSlice(1 + inputDim, outputDim), false, prevOutput, false, go);
-            go += mOutputGate.ColumnSlice(0, 1);
-            tmp = state;
-            tmp.ColumnElementMultiplyWith(mOutputGate.ColumnSlice(1 + inputDim + outputDim, 1));
-            go += tmp;
-            go.AssignSigmoidOf(go);
-
-            // to return output
-            tanhState.AssignTanhOf(state);
-            output.AssignElementProductOf(go, tanhState);
-        }
+        tanhObs.AssignTanhOf(state);
+        state.AssignElementProductOf(gi, tanhObs);
+        state.AddElementProductOf(gf, prevState);
+
+        // for output gate
+        Matrix<ElemType>::Multiply(mOutputGate.ColumnSlice(1, inputDim), false, obs, false, go);
+        Matrix<ElemType>::MultiplyAndAdd(mOutputGate.ColumnSlice(1 + inputDim, outputDim), false, prevOutput, false, go);
+        go += mOutputGate.ColumnSlice(0, 1);
+        tmp = state;
+        tmp.ColumnElementMultiplyWith(mOutputGate.ColumnSlice(1 + inputDim + outputDim, 1));
+        go += tmp;
+        go.AssignSigmoidOf(go);
+
+        // to return output
+        tanhState.AssignTanhOf(state);
+        output.AssignElementProductOf(go, tanhState);
+    }
 
-        // input(0) : child with dimension [inputdim x T]
-        // input(1) : input gate [outputdim x [inputdim + outputdim + 2]] bi, Wxi, Whi, Wci
-        // input(2) : forget gate [outputdim x [inputdim + outputdim + 2]] for bf, Wxf, Whf, Wcf
-        // input(3) : output gate [outputdim x [inputdim + outputdim + 2]] for bo, Wxo, Who, and Wco
-        // input(4) : memory cell weight [outputdim x [inputdim + outputdim + 1]] for bc, Wxc, and Whc 
-        // output : dimension [outputdim x T]
+    // input(0) : child with dimension [inputdim x T]
+    // input(1) : input gate [outputdim x [inputdim + outputdim + 2]] bi, Wxi, Whi, Wci
+    // input(2) : forget gate [outputdim x [inputdim + outputdim + 2]] for bf, Wxf, Whf, Wcf
+    // input(3) : output gate [outputdim x [inputdim + outputdim + 2]] for bo, Wxo, Who, and Wco
+    // input(4) : memory cell weight [outputdim x [inputdim + outputdim + 1]] for bc, Wxc, and Whc
+    // output : dimension [outputdim x T]
     virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
-        {
-            Base::Validate(isFinalValidationPass);
-            InferMBLayoutFromInputsForStandardCase();
+    {
+        Base::Validate(isFinalValidationPass);
+        InferMBLayoutFromInputsForStandardCase();
 
-            if (Input(0)->Value().GetMatrixType() == SPARSE)
-                LogicError("LSTMNode: input to LSTM has to be dense matrix. Consider adding a project layer using lookuptable before LSTM node. ");
+        if (Input(0)->Value().GetMatrixType() == SPARSE)
+            LogicError("LSTMNode: input to LSTM has to be dense matrix. Consider adding a project layer using lookuptable before LSTM node. ");
 
 #if 0
             // TODO: use dynamic_pointer_cast instead
@@ -2787,270 +2788,270 @@ class LSTMNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>,
                 LogicError("LSTM validation: need to have learnable parameters ");
 #endif
 
-            //if (Input(0)->GetSampleMatrixNumRows() == 0)
-            //    LogicError("LSTM validation: input size is zero!");
+        //if (Input(0)->GetSampleMatrixNumRows() == 0)
+        //    LogicError("LSTM validation: input size is zero!");
 
-            //if (Input(1)->GetSampleMatrixNumRows() == 0 ||
-            //    Input(2)->GetSampleMatrixNumRows() == 0 ||
-            //    Input(3)->GetSampleMatrixNumRows() == 0 ||
-            //    Input(4)->GetSampleMatrixNumRows() == 0)
-            //    LogicError("LSTM validation : parameter size is zero!");
+        //if (Input(1)->GetSampleMatrixNumRows() == 0 ||
+        //    Input(2)->GetSampleMatrixNumRows() == 0 ||
+        //    Input(3)->GetSampleMatrixNumRows() == 0 ||
+        //    Input(4)->GetSampleMatrixNumRows() == 0)
+        //    LogicError("LSTM validation : parameter size is zero!");
 
-            size_t nindim = Input(0)->GetSampleMatrixNumRows();
-            size_t noutdim = Input(1)->GetSampleMatrixNumRows();
-            //size_t nT = Input(0)->GetSampleMatrixNumCols();
-            size_t nCol = nindim + noutdim + 2;
-            if (isFinalValidationPass)
+        size_t nindim = Input(0)->GetSampleMatrixNumRows();
+        size_t noutdim = Input(1)->GetSampleMatrixNumRows();
+        //size_t nT = Input(0)->GetSampleMatrixNumCols();
+        size_t nCol = nindim + noutdim + 2;
+        if (isFinalValidationPass)
+        {
+            if (Input(1)->GetSampleMatrixNumCols() != nCol)
             {
-                if (Input(1)->GetSampleMatrixNumCols() != nCol)
-                {
-                    LogicError("LSTM validation : dimension mismatched between child and inputGate");
-                }
-                if (Input(2)->GetSampleMatrixNumCols() != nCol)
-                {
-                    LogicError("LSTM validation : dimension mismatched between child and forgetGate");
-                }
-                if (Input(3)->GetSampleMatrixNumCols() != nCol)
-                {
-                    LogicError("LSTM validation : dimension mismatched between child and outputGate");
-                }
-
-                if (noutdim != Input(2)->GetSampleMatrixNumRows() ||
-                    noutdim != Input(3)->GetSampleMatrixNumRows() ||
-                    noutdim != Input(4)->GetSampleMatrixNumRows())
-                {
-                    LogicError("LSTM validation: output dimension mismatched!");
-                }
+                LogicError("LSTM validation : dimension mismatched between child and inputGate");
+            }
+            if (Input(2)->GetSampleMatrixNumCols() != nCol)
+            {
+                LogicError("LSTM validation : dimension mismatched between child and forgetGate");
+            }
+            if (Input(3)->GetSampleMatrixNumCols() != nCol)
+            {
+                LogicError("LSTM validation : dimension mismatched between child and outputGate");
             }
 
-            SetDims(TensorShape(noutdim), true);
-            Value().SetValue(NAN); // set to this extrem value so, if anything wrong in later procedure, problems can be easily spotted.
+            if (noutdim != Input(2)->GetSampleMatrixNumRows() ||
+                noutdim != Input(3)->GetSampleMatrixNumRows() ||
+                noutdim != Input(4)->GetSampleMatrixNumRows())
+            {
+                LogicError("LSTM validation: output dimension mismatched!");
+            }
         }
 
-        bool UnitTest()
+        SetDims(TensorShape(noutdim), true);
+        Value().SetValue(NAN); // set to this extreme value so that, if anything goes wrong in a later procedure, problems can be easily spotted.
+    }
+
+    bool UnitTest()
+    {
         {
-            {
-                size_t nT = 3;
-                size_t nInput = 2;
-                size_t nHidden = 3;
-                size_t nOutput = 3;
-
-                // backup 
-                Matrix<ElemType> f0(m_deviceId), f1(m_deviceId), f2(m_deviceId), f3(m_deviceId), f4(m_deviceId), func(m_deviceId), f5(m_deviceId);
-                Matrix<ElemType> target(m_deviceId);
-                Matrix<ElemType> giWeight, ghWeight, goWeight;
-                ElemType initStateValue = m_DefaultState;
-                auto pMBLayout = make_shared<MBLayout>();
-                pMBLayout->Init(1, nT);
-                //Matrix<float> & boundary = pMBLayout->m_sentenceBoundaryFlags;
-                //vector<MinibatchPackingFlags> & minibatchPackingFlags = pMBLayout->m_minibatchPackingFlags;
-                //boundary.ColumnSlice(0, 1).SetValue(((int) MinibatchPackingFlags::SequenceStart));
-                //minibatchPackingFlags[1] = MinibatchPackingFlags::SequenceStart;
-                pMBLayout->AddSequence(NEW_SEQUENCE_ID, 0, 0, nT);
-                Base::LinkToMBLayout(pMBLayout);
-
-                f0 = Input(0)->Value();
-                f1 = Input(1)->Value();
-                f2 = Input(2)->Value();
-                f3 = Input(3)->Value();
-                f4 = Input(4)->Value();
-                func = Value();
-
-                target.Resize(nOutput, nT);
-                for (size_t i = 0; i < nT; i++)
-                    target(0, i) = 1;
-
-                Input(0)->SetDims1(nInput, nT);
-                Input(0)->Value().SetValue(ConstOnes(nInput, nT, m_deviceId));
+            size_t nT = 3;
+            size_t nInput = 2;
+            size_t nHidden = 3;
+            size_t nOutput = 3;
+
+            // backup
+            Matrix<ElemType> f0(m_deviceId), f1(m_deviceId), f2(m_deviceId), f3(m_deviceId), f4(m_deviceId), func(m_deviceId), f5(m_deviceId);
+            Matrix<ElemType> target(m_deviceId);
+            Matrix<ElemType> giWeight, ghWeight, goWeight;
+            ElemType initStateValue = m_DefaultState;
+            auto pMBLayout = make_shared<MBLayout>();
+            pMBLayout->Init(1, nT);
+            //Matrix<float> & boundary = pMBLayout->m_sentenceBoundaryFlags;
+            //vector<MinibatchPackingFlags> & minibatchPackingFlags = pMBLayout->m_minibatchPackingFlags;
+            //boundary.ColumnSlice(0, 1).SetValue(((int) MinibatchPackingFlags::SequenceStart));
+            //minibatchPackingFlags[1] = MinibatchPackingFlags::SequenceStart;
+            pMBLayout->AddSequence(NEW_SEQUENCE_ID, 0, 0, nT);
+            Base::LinkToMBLayout(pMBLayout);
+
+            f0 = Input(0)->Value();
+            f1 = Input(1)->Value();
+            f2 = Input(2)->Value();
+            f3 = Input(3)->Value();
+            f4 = Input(4)->Value();
+            func = Value();
+
+            target.Resize(nOutput, nT);
+            for (size_t i = 0; i < nT; i++)
+                target(0, i) = 1;
+
+            Input(0)->SetDims1(nInput, nT);
+            Input(0)->Value().SetValue(ConstOnes(nInput, nT, m_deviceId));
             Input(0)->Value().SetValue((ElemType) 0.1);
-                Input(1)->SetDims1(nHidden, nInput + nOutput + 2);
+            Input(1)->SetDims1(nHidden, nInput + nOutput + 2);
             Input(1)->Value().SetValue((ElemType) 0.1);
-                Input(2)->SetDims1(nHidden, nInput + nHidden + 2);
+            Input(2)->SetDims1(nHidden, nInput + nHidden + 2);
             Input(2)->Value().SetValue((ElemType) 0.1);
-                Input(3)->SetDims1(nOutput, nInput + nHidden + 2);
+            Input(3)->SetDims1(nOutput, nInput + nHidden + 2);
             Input(3)->Value().SetValue((ElemType) 0.1);
-                Input(4)->SetDims1(nOutput, nHidden + nInput + 1);
+            Input(4)->SetDims1(nOutput, nHidden + nInput + 1);
             Input(4)->Value().SetValue((ElemType) 0.1);
-                SetDims1(nOutput, nT);
+            SetDims1(nOutput, nT);
 
-                m_DefaultState = 0.0;
-                ForwardProp(FrameRange(m_pMBLayout));
+            m_DefaultState = 0.0;
+            ForwardProp(FrameRange(m_pMBLayout));
 
-                // check with expected values
-                if (!ISCLOSE(Value()(0, 0), 0.0335975, EPSILON) ||
-                    !ISCLOSE(Value()(0, 1), 0.05485132, EPSILON) ||
-                    !ISCLOSE(Value()(0, 2), 0.06838435, EPSILON) ||
-                    !(Value()(0, 0) == Value()(1, 0)))
-                    throw("LSTMNode forward computation error");
+            // check with expected values
+            if (!ISCLOSE(Value()(0, 0), 0.0335975, EPSILON) ||
+                !ISCLOSE(Value()(0, 1), 0.05485132, EPSILON) ||
+                !ISCLOSE(Value()(0, 2), 0.06838435, EPSILON) ||
+                !(Value()(0, 0) == Value()(1, 0)))
+                throw("LSTMNode forward computation error");
 
             Value().TransferToDeviceIfNotThere(m_deviceId, true);
 
-                Gradient().Resize(nOutput, nT);
-                Gradient().SetValue(1.0);
-                for (size_t i = 0; i < 5; i++)
-                {
-                    Input(i)->Gradient().Resize(Input(i)->GetSampleMatrixNumRows(), Input(i)->GetSampleMatrixNumCols());
-                    Input(i)->Gradient().SetValue(0);
-                }
-                for (size_t i = 0; i < 5; i++)
-                    BackpropTo(i, FrameRange(m_pMBLayout));
+            Gradient().Resize(nOutput, nT);
+            Gradient().SetValue(1.0);
+            for (size_t i = 0; i < 5; i++)
+            {
+                Input(i)->Gradient().Resize(Input(i)->GetSampleMatrixNumRows(), Input(i)->GetSampleMatrixNumCols());
+                Input(i)->Gradient().SetValue(0);
+            }
+            for (size_t i = 0; i < 5; i++)
+                BackpropTo(i, FrameRange(m_pMBLayout));
 
-                // check with expected values
+            // check with expected values
             if (!ISCLOSE(Input(1)->Gradient()(0, 0), 0.07843818, EPSILON)    // bi
                 || !ISCLOSE(Input(1)->Gradient()(0, 1), 0.00784382, EPSILON) // Wxi
                 || !ISCLOSE(Input(1)->Gradient()(0, 3), 0.00192997, EPSILON) // Whi
                 || !ISCLOSE(Input(1)->Gradient()(0, 6), 0.00362767, EPSILON) // Wci
-                    )
-                    throw("LSTMNode gradient error on input gates");
+                )
+                throw("LSTMNode gradient error on input gates");
             if (!ISCLOSE(Input(2)->Gradient()(0, 0), 0.02738655, EPSILON)    // bf
                 || !ISCLOSE(Input(2)->Gradient()(0, 1), 0.00273866, EPSILON) // Wxf
                 || !ISCLOSE(Input(2)->Gradient()(0, 3), 0.00120922, EPSILON) // Whf
                 || !ISCLOSE(Input(2)->Gradient()(0, 6), 0.00227184, EPSILON) // Wcf
-                    )
-                    throw("LSTMNode gradient error on forget gates");
+                )
+                throw("LSTMNode gradient error on forget gates");
             if (!ISCLOSE(Input(3)->Gradient()(0, 0), 0.07801557, EPSILON)    // bo
                 || !ISCLOSE(Input(3)->Gradient()(0, 1), 0.00780156, EPSILON) // Wxo
                 || !ISCLOSE(Input(3)->Gradient()(0, 3), 0.00268089, EPSILON) // Who
                 || !ISCLOSE(Input(3)->Gradient()(0, 6), 0.00809852, EPSILON) // Wco
-                    )
-                    throw("LSTMNode gradient error on output gates");
+                )
+                throw("LSTMNode gradient error on output gates");
             if (!ISCLOSE(Input(4)->Gradient()(0, 0), 1.3075038, EPSILON)     // bc
                 || !ISCLOSE(Input(4)->Gradient()(0, 1), 0.13075038, EPSILON) // Wxc
                 || !ISCLOSE(Input(4)->Gradient()(0, 3), 0.03080355, EPSILON) // Whc
-                    )
-                    throw("LSTMNode gradient error on memory cells");
+                )
+                throw("LSTMNode gradient error on memory cells");
+
+            for (size_t i = 0; i < 5; i++)
+            {
 
-                for (size_t i = 0; i < 5; i++)
-                {
-                    
                 Input(i)->Gradient().TransferToDeviceIfNotThere(m_deviceId, true);
-                }
-                m_DefaultState = initStateValue;
             }
-
-            fprintf(stderr, "LSTMNode unit test passed!\n");
-            return true;
+            m_DefaultState = initStateValue;
         }
 
-        virtual void DumpNodeInfo(const bool printValues, File& fstream) const override
-        {
-            Base::DumpNodeInfo(printValues, fstream);
+        fprintf(stderr, "LSTMNode unit test passed!\n");
+        return true;
+    }
+
+    virtual void DumpNodeInfo(const bool printValues, File& fstream) const override
+    {
+        Base::DumpNodeInfo(printValues, fstream);
         fstream << L"Input[Width:" << m_inputDim << L"]  \n";
-            fstream << L"Hidden[Width:" << m_outputDim << L"]    Output[Width:" << m_outputDim << L"]  \n";
-        }
+        fstream << L"Hidden[Width:" << m_outputDim << L"]    Output[Width:" << m_outputDim << L"]  \n";
+    }
 
-    public:
-        bool GetHistory(Matrix<ElemType>& hist, bool bLastTime)
-        {
-            size_t tRow = m_PastOutput.GetNumRows();
-            size_t tCol = m_PastOutput.GetNumCols();
-            size_t rCol = m_PastState.GetNumCols();
+public:
+    bool GetHistory(Matrix<ElemType>& hist, bool bLastTime)
+    {
+        size_t tRow = m_PastOutput.GetNumRows();
+        size_t tCol = m_PastOutput.GetNumCols();
+        size_t rCol = m_PastState.GetNumCols();
 
-            DEVICEID_TYPE device = hist.GetDeviceId();
-            hist.TransferFromDeviceToDevice(device, m_deviceId, true);
-            hist.Resize(tRow, tCol + rCol);
+        DEVICEID_TYPE device = hist.GetDeviceId();
+        hist.TransferFromDeviceToDevice(device, m_deviceId, true);
+        hist.Resize(tRow, tCol + rCol);
 
-            if (bLastTime)
-            {
-                hist.ColumnSlice(0, tCol).SetValue(mLastOutput);
-                hist.ColumnSlice(tCol, rCol).SetValue(mLastState);
-            }
+        if (bLastTime)
+        {
+            hist.ColumnSlice(0, tCol).SetValue(mLastOutput);
+            hist.ColumnSlice(tCol, rCol).SetValue(mLastState);
+        }
         else
         {
-                hist.ColumnSlice(0, tCol).SetValue(m_PastOutput);
-                hist.ColumnSlice(tCol, rCol).SetValue(m_PastState);
-            }
-
-            hist.TransferFromDeviceToDevice(m_deviceId, device, true);
-            return true;
+            hist.ColumnSlice(0, tCol).SetValue(m_PastOutput);
+            hist.ColumnSlice(tCol, rCol).SetValue(m_PastState);
         }
 
-        void SetHistory(const Matrix<ElemType>& hist)
-        {
-            size_t tRow = hist.GetNumRows();
-            size_t tCol = hist.GetNumCols();
-            size_t eCols = tCol / 2;
+        hist.TransferFromDeviceToDevice(m_deviceId, device, true);
+        return true;
+    }
+
+    void SetHistory(const Matrix<ElemType>& hist)
+    {
+        size_t tRow = hist.GetNumRows();
+        size_t tCol = hist.GetNumCols();
+        size_t eCols = tCol / 2;
 
-            DEVICEID_TYPE device = hist.GetDeviceId();
-            hist.TransferFromDeviceToDevice(device, m_deviceId, true);
+        DEVICEID_TYPE device = hist.GetDeviceId();
+        hist.TransferFromDeviceToDevice(device, m_deviceId, true);
 
-            m_PastOutput.Resize(tRow, eCols);
-            m_PastState.Resize(tRow, eCols);
-            m_PastOutput.SetValue(hist.ColumnSlice(0, eCols));
-            m_PastState.SetValue(hist.ColumnSlice(eCols, eCols));
+        m_PastOutput.Resize(tRow, eCols);
+        m_PastState.Resize(tRow, eCols);
+        m_PastOutput.SetValue(hist.ColumnSlice(0, eCols));
+        m_PastState.SetValue(hist.ColumnSlice(eCols, eCols));
 
-            hist.TransferFromDeviceToDevice(m_deviceId, device, true);
-        }
+        hist.TransferFromDeviceToDevice(m_deviceId, device, true);
+    }
 
-        virtual void GetErrorsToPreviousMinibatch(Matrix<ElemType>& hist)
-        {
-            size_t tRow = m_obs_error_from_future_minibatch.GetNumRows();
-            size_t tCol = m_obs_error_from_future_minibatch.GetNumCols();
-            size_t rCol = m_state_error_from_future_minibatch.GetNumCols();
+    virtual void GetErrorsToPreviousMinibatch(Matrix<ElemType>& hist)
+    {
+        size_t tRow = m_obs_error_from_future_minibatch.GetNumRows();
+        size_t tCol = m_obs_error_from_future_minibatch.GetNumCols();
+        size_t rCol = m_state_error_from_future_minibatch.GetNumCols();
 
-            DEVICEID_TYPE device = hist.GetDeviceId();
+        DEVICEID_TYPE device = hist.GetDeviceId();
 
-            hist.TransferFromDeviceToDevice(device, m_deviceId, true);
-            hist.Resize(tRow, tCol + rCol);
+        hist.TransferFromDeviceToDevice(device, m_deviceId, true);
+        hist.Resize(tRow, tCol + rCol);
 
-            hist.ColumnSlice(0, tCol).SetValue(m_obs_error_from_future_minibatch);
-            hist.ColumnSlice(tCol, rCol).SetValue(m_state_error_from_future_minibatch);
+        hist.ColumnSlice(0, tCol).SetValue(m_obs_error_from_future_minibatch);
+        hist.ColumnSlice(tCol, rCol).SetValue(m_state_error_from_future_minibatch);
 
-            hist.TransferFromDeviceToDevice(m_deviceId, device, true);
-        }
+        hist.TransferFromDeviceToDevice(m_deviceId, device, true);
+    }
 
-        virtual void SetErrorsFromFutureMinibatch(Matrix<ElemType>& hist)
-        {
-            size_t tCol = hist.GetNumCols();
-            size_t rCol = tCol / 2;
+    virtual void SetErrorsFromFutureMinibatch(Matrix<ElemType>& hist)
+    {
+        size_t tCol = hist.GetNumCols();
+        size_t rCol = tCol / 2;
 
-            DEVICEID_TYPE device = hist.GetDeviceId();
+        DEVICEID_TYPE device = hist.GetDeviceId();
 
-            hist.TransferFromDeviceToDevice(device, m_deviceId, true);
+        hist.TransferFromDeviceToDevice(device, m_deviceId, true);
 
-            m_obs_error_from_future_minibatch.SetValue(hist.ColumnSlice(0, rCol));
-            m_state_error_from_future_minibatch.SetValue(hist.ColumnSlice(rCol, rCol));
+        m_obs_error_from_future_minibatch.SetValue(hist.ColumnSlice(0, rCol));
+        m_state_error_from_future_minibatch.SetValue(hist.ColumnSlice(rCol, rCol));
 
-            m_use_errors_from_future_minibatch = true;
+        m_use_errors_from_future_minibatch = true;
 
-            hist.TransferFromDeviceToDevice(m_deviceId, device, true);
-        }
+        hist.TransferFromDeviceToDevice(m_deviceId, device, true);
+    }
 
-    protected:
-        size_t m_inputDim;
-        size_t m_outputDim;
+protected:
+    size_t m_inputDim;
+    size_t m_outputDim;
 
     Matrix<ElemType> m_State;      // hidden state activity
     Matrix<ElemType> m_PastState;  // state activity in the previous minibatch
-        Matrix<ElemType> m_PastOutput; // output in the previou minibatch 
+    Matrix<ElemType> m_PastOutput; // output in the previous minibatch
 
     Matrix<ElemType> mLastState;  // last state activity
-        Matrix<ElemType> mLastOutput; // last output 
+    Matrix<ElemType> mLastOutput; // last output
 
     Matrix<ElemType> m_Gi; // input gate activity
     Matrix<ElemType> m_Gf; // forget gate activity
     Matrix<ElemType> m_Go; // output gate activity
 
-        Matrix<ElemType> grdToObs, grdToInputGate, grdToForgetGate, grdToOutputGate, grdToCellWgt;
-        Matrix<ElemType> tanhState, tanhObs;
+    Matrix<ElemType> grdToObs, grdToInputGate, grdToForgetGate, grdToOutputGate, grdToCellWgt;
+    Matrix<ElemType> tanhState, tanhObs;
 
-        Matrix<ElemType> m_tempMatrix; // temp matrix for speed-up
+    Matrix<ElemType> m_tempMatrix; // temp matrix for speed-up
 
     bool m_GradientComputed; // true if LSTM node has computed gradients, set to false if forward computation is just finished
 
-        Matrix<ElemType> mSlicePrevOutput, mSlicePrevState;
+    Matrix<ElemType> mSlicePrevOutput, mSlicePrevState;
 
-        Matrix<ElemType> grdBeforeInputGate, grdBeforeForget, grdBeforeGo, grdToCell, grdBeforeTanhInputGate;
+    Matrix<ElemType> grdBeforeInputGate, grdBeforeForget, grdBeforeGo, grdToCell, grdBeforeTanhInputGate;
 
-    public:
-        // errors from future minibatch
-        Matrix<ElemType> m_obs_error_from_future_minibatch;
-        Matrix<ElemType> m_state_error_from_future_minibatch;
-        bool m_use_errors_from_future_minibatch;
+public:
+    // errors from future minibatch
+    Matrix<ElemType> m_obs_error_from_future_minibatch;
+    Matrix<ElemType> m_state_error_from_future_minibatch;
+    bool m_use_errors_from_future_minibatch;
 
-        ElemType m_DefaultState;
-    };
+    ElemType m_DefaultState;
+};
 
-    template class LSTMNode<float>;
-    template class LSTMNode<double>;
+template class LSTMNode<float>;
+template class LSTMNode<double>;
 } } }
diff --git a/Source/ComputationNetworkLib/EvaluationCriterionNodes.h b/Source/ComputationNetworkLib/EvaluationCriterionNodes.h
index 6f91e125293c..cb920b2dbd85 100644
--- a/Source/ComputationNetworkLib/EvaluationCriterionNodes.h
+++ b/Source/ComputationNetworkLib/EvaluationCriterionNodes.h
@@ -130,5 +130,4 @@ class ErrorPredictionNode : public ComputationNodeNonLooping /*ComputationNode*/
 
 template class ErrorPredictionNode<float>;
 template class ErrorPredictionNode<double>;
-
 } } }
diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h
index c2c4f97b11f5..825ba1ee5168 100644
--- a/Source/ComputationNetworkLib/InputAndParamNodes.h
+++ b/Source/ComputationNetworkLib/InputAndParamNodes.h
@@ -113,12 +113,12 @@ class LearnableParameter : public ComputationNode<ElemType>, public NumInputs<0>
         else
         {
             sampleLayout.Load(fstream, /*acceptLegacyFormat=*/true);
-            if (cols > 1)   // in some legacy format, last tensor dimension was split off as an explicit column dimension
+            if (cols > 1) // in some legacy format, last tensor dimension was split off as an explicit column dimension
                 sampleLayout.AppendInPlace(sampleLayout.GetRank(), cols);
         }
         LoadValue(fstream);
-        SetDims(sampleLayout, false);   // note: call this after LoadValue() since LoadValue() overwrites m_sampleLayout
-        VerifyDataSize(Value());        // sanity check
+        SetDims(sampleLayout, false); // note: call this after LoadValue() since LoadValue() overwrites m_sampleLayout
+        VerifyDataSize(Value());      // sanity check
     }
 
     // initialize with random numbers
@@ -132,10 +132,10 @@ class LearnableParameter : public ComputationNode<ElemType>, public NumInputs<0>
         // the random seed offset is set via the "randomSeedOffset" parameter in config
         if (initOnCPUOnly)
             Value().TransferToDeviceIfNotThereAndNotAutoPlace(CPUDEVICE, true);
-#if 1   // this more complex version is needed to repro test cases generated with an older version
-        auto & value = GetSampleLayout().GetRank() > 2 ? Value() : ValueAsMatrix();
+#if 1 // this more complex version is needed to repro test cases generated with an older version
+        auto& value = GetSampleLayout().GetRank() > 2 ? Value() : ValueAsMatrix();
 #else
-        auto & value = Value();
+        auto& value = Value();
 #endif
         if (uniformInit)
         {
@@ -232,8 +232,8 @@ class InputValueBase : public ComputationNode<ElemType>, public NumInputs<0>
         if (isSparse)
             ConvertToSparseMatrix();
 
-        SetDims(sampleLayout, HasMBLayout());   // also called when reloading a file. Then we have an MBLayout, otherwise not yet
-        UpdateFunctionValuesSize(); // we must allocate the matrix so that the readers get objects with valid row dimensions (some readers expect that)
+        SetDims(sampleLayout, HasMBLayout()); // also called when reloading a file. Then we have an MBLayout, otherwise not yet
+        UpdateFunctionValuesSize();           // we must allocate the matrix so that the readers get objects with valid row dimensions (some readers expect that)
         m_parameterUpdateRequired = false;
         this->m_valueSharable = false;
     }
@@ -267,7 +267,7 @@ class InputValueBase : public ComputationNode<ElemType>, public NumInputs<0>
     virtual void Save(File& fstream) const override
     {
         Base::Save(fstream);
-        size_t rowsDummy = 0;               // compat with old file format
+        size_t rowsDummy = 0; // compat with old file format
         size_t colsDummy = 0;
         fstream << rowsDummy << colsDummy;
         m_sampleLayout.Save(fstream);
@@ -282,7 +282,7 @@ class InputValueBase : public ComputationNode<ElemType>, public NumInputs<0>
         TensorShape sampleLayout;
         sampleLayout.Load(fstream, /*acceptLegacyFormat=*/true);
         // some older files may have inconsistent tensor information
-        if (rows != 0/*old file*/ && rows != sampleLayout.GetNumElements()/*even older file*/)
+        if (rows != 0 /*old file*/ && rows != sampleLayout.GetNumElements() /*even older file*/)
         {
             fprintf(stderr, "WARNING: %ls InputValue has inconsistent serialized sample layout %s vs. number of rows %d. Resetting sample layout to vector.\n",
                     NodeName().c_str(), string(sampleLayout).c_str(), (int) rows);
@@ -501,7 +501,7 @@ class LookupTableNode : public ComputationNode<ElemType>, public NumInputs<2>
         if (isFinalValidationPass && Input(1)->GetSampleMatrixNumRows() % Input(0)->GetAsMatrixNumCols() != 0)
             InvalidArgument("Mismatched dimension. Rows in input1 must be multiples of cols in input0.");
 
-        size_t wordsInEachSample = Input(1)->GetSampleMatrixNumRows() / Input(0)->GetAsMatrixNumCols()/*note: can never be 0*/;
+        size_t wordsInEachSample = Input(1)->GetSampleMatrixNumRows() / Input(0)->GetAsMatrixNumCols() /*note: can never be 0*/;
 
         // TODO: Should this add a tensor dimension?
         SetDims(TensorShape(Input(0)->GetAsMatrixNumRows() * wordsInEachSample), true);
diff --git a/Source/ComputationNetworkLib/LinearAlgebraNodes.h b/Source/ComputationNetworkLib/LinearAlgebraNodes.h
index 285a42974655..421bde27317d 100644
--- a/Source/ComputationNetworkLib/LinearAlgebraNodes.h
+++ b/Source/ComputationNetworkLib/LinearAlgebraNodes.h
@@ -204,24 +204,24 @@ class TimesNodeBase : public ComputationNode<ElemType>, public NumInputs<2>
 
     virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
     {
-        if (inputIndex == 0)        // left derivative
+        if (inputIndex == 0) // left derivative
         {
             // this potentially computes inner products over time, so we use the Masked- variants
             auto sliceOutputGrad = MaskedGradientFor(fr);
             auto sliceInput1Value = Input(1)->MaskedValueFor(fr);
-            auto & input0Grad = Input(0)->GradientAsMatrix();
+            auto& input0Grad = Input(0)->GradientAsMatrix();
 
             // currently we only support one combination when the input is sparse.
             if (sliceInput1Value.GetMatrixType() == SPARSE && Input(0)->Gradient().GetMatrixType() == DENSE && sliceOutputGrad.GetMatrixType() == DENSE)
                 Input(0)->Gradient().SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol, false);
 
-            bool transpose = m_transpose;   // (assigning to a non-const variable avoids a compiler warning C4127: conditional expression is constant)
+            bool transpose = m_transpose; // (assigning to a non-const variable avoids a compiler warning C4127: conditional expression is constant)
             if (!transpose)
                 Matrix<ElemType>::MultiplyAndAdd(sliceOutputGrad, false, sliceInput1Value, true, input0Grad);
             else
                 Matrix<ElemType>::MultiplyAndAdd(sliceInput1Value, false, sliceOutputGrad, true, input0Grad);
         }
-        else                        // right derivative
+        else // right derivative
         {
             auto sliceInput1Grad = Input(1)->GradientFor(fr);
             auto sliceOutputGrad = GradientFor(fr);
@@ -264,7 +264,7 @@ class TimesNodeBase : public ComputationNode<ElemType>, public NumInputs<2>
 
         // support automatic dimension inference for learnable parameters
         size_t rows0 = Input(0)->GetAsMatrixNumRows(), cols0 = Input(0)->GetAsMatrixNumCols();
-        bool transpose = m_transpose;   // (assigning to a non-const variable avoids a compiler warning C4127: conditional expression is constant)
+        bool transpose = m_transpose; // (assigning to a non-const variable avoids a compiler warning C4127: conditional expression is constant)
         if (transpose)
             std::swap(rows0, cols0);
         size_t rows1 = Input(1)->HasMBLayout() ? Input(1)->GetSampleMatrixNumRows() : Input(1)->GetAsMatrixNumRows();
@@ -281,7 +281,7 @@ class TimesNodeBase : public ComputationNode<ElemType>, public NumInputs<2>
             Input(1)->ValidateInferInputDimsFrom(TensorShape(cols0));
             SetDims(TensorShape(rows0), true);
         }
-        else    // multiplying two straight matrices
+        else // multiplying two straight matrices
         {
             size_t cols1 = Input(1)->GetAsMatrixNumCols();
             // infer rows1 as cols0
@@ -293,7 +293,7 @@ class TimesNodeBase : public ComputationNode<ElemType>, public NumInputs<2>
         cols0 = m_transpose ? Input(0)->GetAsMatrixNumRows() : Input(0)->GetAsMatrixNumCols();
         rows1 = Input(1)->HasMBLayout() ? Input(1)->GetSampleMatrixNumRows() : Input(1)->GetAsMatrixNumRows();
         if (isFinalValidationPass && cols0 != rows1)
-            InvalidArgument("The inner matrix dimension in the %ls Times operation does not match (%d vs. %d).", NodeName().c_str(), (int)rows1, (int)cols0);
+            InvalidArgument("The inner matrix dimension in the %ls Times operation does not match (%d vs. %d).", NodeName().c_str(), (int) rows1, (int) cols0);
     }
 
     virtual void AllocateGradientMatricesForInputs(MatrixPool& matrixPool) override
@@ -325,6 +325,7 @@ class TimesNode : public TimesNodeBase<ElemType, false>
     {
         return L"Times";
     }
+
 public:
     DeclareConstructorFromConfigWithNumInputs(TimesNode);
     TimesNode(DEVICEID_TYPE deviceId, const wstring& name)
@@ -351,6 +352,7 @@ class TransposeTimesNode : public TimesNodeBase<ElemType, true>
     {
         return L"TransposeTimes";
     }
+
 public:
     DeclareConstructorFromConfigWithNumInputs(TransposeTimesNode);
     TransposeTimesNode(DEVICEID_TYPE deviceId, const wstring& name)
@@ -497,7 +499,7 @@ class DiagTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
             Input(1)->ValidateInferInputDimsFrom(TensorShape(rows0));
             SetDims(TensorShape(rows0), true);
         }
-        else    // multiplying two straight matrices
+        else // multiplying two straight matrices
         {
             size_t cols1 = Input(1)->GetAsMatrixNumCols();
             // infer rows1 as rows0
@@ -509,7 +511,7 @@ class DiagTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
         rows0 = Input(0)->GetAsMatrixNumRows();
         rows1 = Input(1)->HasMBLayout() ? Input(1)->GetSampleMatrixNumRows() : Input(1)->GetAsMatrixNumRows();
         if (isFinalValidationPass && rows0 != rows1)
-            InvalidArgument("The inner matrix dimension in the %ls %ls operation does not match (%d vs. %d).", NodeName().c_str(), OperationName().c_str(), (int)rows1, (int)rows0);
+            InvalidArgument("The inner matrix dimension in the %ls %ls operation does not match (%d vs. %d).", NodeName().c_str(), OperationName().c_str(), (int) rows1, (int) rows0);
         size_t cols0 = Input(0)->GetAsMatrixNumCols();
         if (isFinalValidationPass && cols0 != 1)
             InvalidArgument("The first matrix should be a column vector representing the diagonal of a square matrix in the DiagTimes operation.");
@@ -659,7 +661,7 @@ class SumColumnElementsNode : public ComputationNode<ElemType>, public NumInputs
     virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
     {
         auto sliceInputValue = Input(0)->ValueFor(fr);
-        auto sliceOutputValue = ValueFor(fr);   // row vector
+        auto sliceOutputValue = ValueFor(fr); // row vector
 
         Matrix<ElemType>::VectorSum(sliceInputValue, sliceOutputValue, true);
     }
@@ -700,8 +702,8 @@ class TransposeNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemT
 
     virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t /*inputIndex*/) override
     {
-        auto & inputGradientValues = Input(0)->GradientAsMatrix();
-        auto & gradientValues = GradientAsMatrix();
+        auto& inputGradientValues = Input(0)->GradientAsMatrix();
+        auto& gradientValues = GradientAsMatrix();
 #if DUMPOUTPUT
         gradientValues.Print("Gradient-in");
         inputGradientValues.Print("child Gradient-in/out");
@@ -805,7 +807,7 @@ class DiagonalNode : public ComputationNodeNonLooping<ElemType>, public NumInput
 
     virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
     {
-        Input(0)->ValueAsMatrix().AssignDiagonalValuesTo(ValueAsMatrix());  // TODO: use tensor lib; this is a stride operation
+        Input(0)->ValueAsMatrix().AssignDiagonalValuesTo(ValueAsMatrix()); // TODO: use tensor lib; this is a stride operation
 #if NANCHECK
         Value().HasNan("Diagonal");
 #endif
@@ -813,8 +815,8 @@ class DiagonalNode : public ComputationNodeNonLooping<ElemType>, public NumInput
 
     virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t /*inputIndex*/) override
     {
-        auto & inputGradientValues = Input(0)->GradientAsMatrix();
-        auto & gradientValues = GradientAsMatrix();
+        auto& inputGradientValues = Input(0)->GradientAsMatrix();
+        auto& gradientValues = GradientAsMatrix();
 
         // BUGBUG: This should use the memshare mechanism.
         // TODO: use tensor lib, then this will be easy, no memsharing needed
@@ -873,9 +875,9 @@ class CosDistanceNode : public ComputationNode<ElemType>, public NumInputs<2>
     {
         // functionValues, invNorm0, invNorm1 - output from the EvaluateNode() method
         // temp, rightTerm, leftTerm - temporary matrices
-        if (inputIndex == 0)    // left derivative
+        if (inputIndex == 0) // left derivative
             m_temp->AssignElementProductOf(*m_invNorm0, *m_invNorm0);
-        else                    // right derivative
+        else // right derivative
             m_temp->AssignElementProductOf(*m_invNorm1, *m_invNorm1);
 
         m_temp->ElementMultiplyWith(ValueFor(fr));
@@ -1226,8 +1228,7 @@ class CosDistanceWithNegativeSamplesNode : public ComputationNode<ElemType>, pub
         ValidateInferBinaryInputDims();
 
         if (isFinalValidationPass &&
-            (Input(0)->GetSampleMatrixNumRows() != Input(1)->GetSampleMatrixNumRows()
-             || Input(0)->GetMBLayout() != Input(1)->GetMBLayout()))
+            (Input(0)->GetSampleMatrixNumRows() != Input(1)->GetSampleMatrixNumRows() || Input(0)->GetMBLayout() != Input(1)->GetMBLayout()))
         {
             LogicError("The tensor dimension in the %ls %ls operation does not match.", NodeName().c_str(), OperationName().c_str());
         }
diff --git a/Source/ComputationNetworkLib/NonlinearityNodes.h b/Source/ComputationNetworkLib/NonlinearityNodes.h
index e464b613a17a..8e6bc55aae28 100644
--- a/Source/ComputationNetworkLib/NonlinearityNodes.h
+++ b/Source/ComputationNetworkLib/NonlinearityNodes.h
@@ -407,7 +407,7 @@ class GMMLogLikelihoodNode : public ComputationNode<ElemType>, public NumInputs<
             else
             {
                 Matrix<ElemType> sliceUnnormedPriorGradient = Input(0)->GradientFor(fr);
-                Matrix<ElemType> slicePrior = DataFor(*m_prior, fr);    // TODO: use the right MBLayout, then we won't need the special case
+                Matrix<ElemType> slicePrior = DataFor(*m_prior, fr); // TODO: use the right MBLayout, then we won't need the special case
                 BackpropToUnnormedPrior(sliceUnnormedPriorGradient, sliceGradientValue, slicePrior, slicePosterior, *m_temp);
             }
         }
@@ -546,7 +546,7 @@ class GMMLogLikelihoodNode : public ComputationNode<ElemType>, public NumInputs<
 
         size_t numCols = Input(3)->GetSampleMatrixNumCols();
         size_t numComponents = Input(0)->GetSampleMatrixNumRows();
-        size_t colsPrior = Input(0)->GetSampleMatrixNumCols();  // may be 1
+        size_t colsPrior = Input(0)->GetSampleMatrixNumCols(); // may be 1
         size_t featureSize = Input(3)->GetSampleMatrixNumRows();
 
         m_prior->Resize(numComponents, colsPrior);
diff --git a/Source/ComputationNetworkLib/RecurrentNodes.h b/Source/ComputationNetworkLib/RecurrentNodes.h
index 548e34d159e9..39b2402949f0 100644
--- a/Source/ComputationNetworkLib/RecurrentNodes.h
+++ b/Source/ComputationNetworkLib/RecurrentNodes.h
@@ -383,14 +383,14 @@ class ShiftNode : public ComputationNode<ElemType>, public IRecurrentNode, publi
         if (isTimeIteration)
         {
             ForAllBoundaryIntersectingSequences(fr, outSliceLogical, T, [&](const MBLayout::SequenceInfo& toSeqInfo)
-            {
-                // determine FrameRanges for from and to
-                FrameRange frFrom, frTo;
-                DetermineBoundaryFrameRanges(fr, toSeqInfo, fromNode, frFrom, T, frTo);
-
-                // copy/backprop
-                Propagate(fromNode, fromShape, frFrom, outShape, frTo, isForward, +1);
-            });
+                                                {
+                                                    // determine FrameRanges for from and to
+                                                    FrameRange frFrom, frTo;
+                                                    DetermineBoundaryFrameRanges(fr, toSeqInfo, fromNode, frFrom, T, frTo);
+
+                                                    // copy/backprop
+                                                    Propagate(fromNode, fromShape, frFrom, outShape, frTo, isForward, +1);
+                                                });
         }
         // iterating over fixed sample-shape dimensions
         else if (!isTimeIteration && (inSliceLogical.first[m_shiftDim] < 0 || inSliceLogical.second[m_shiftDim] >= T))
@@ -491,8 +491,8 @@ class ShiftNode : public ComputationNode<ElemType>, public IRecurrentNode, publi
             if (inSliceMain.second[m_shiftDim] > inSliceMain.first[m_shiftDim])
             {
                 Input(0)->MaskMissingGradientColumnsToZero(fr); // zero out gaps, which will leak (note: we really only need to zero out gaps close enough to boundaries)
-                auto from = DataTensorFor(Input(0)->Gradient(),  inShape,  inSliceMain);
-                auto to =   DataTensorFor(          Gradient(), outShape, outSliceMain);
+                auto from = DataTensorFor(Input(0)->Gradient(), inShape, inSliceMain);
+                auto to = DataTensorFor(Gradient(), outShape, outSliceMain);
                 from.AddCopyOf(to);
 
                 // We have now propagated anything from within the logical bounds.
@@ -513,16 +513,16 @@ class ShiftNode : public ComputationNode<ElemType>, public IRecurrentNode, publi
                 if (isTimeIteration)
                 {
                     ForAllBoundaryIntersectingSequences(fr, outSliceMain /*already clipped*/, T, [&](const MBLayout::SequenceInfo& toSeqInfo)
-                    {
-                        // determine FrameRanges for from and to
-                        FrameRange frTo;
-                        DetermineBoundaryToFrameRange(fr, toSeqInfo, T, frTo);
-                        FrameRange frFrom = frTo.WithTimeOffset(m_fromOffset);
-                        assert((int) frFrom.timeIdxInSeq + frFrom.m_timeOffset >= 0 && (int) frFrom.timeIdxInSeq + frFrom.m_timeOffset + (int) frFrom.m_timeRange <= (int) T);
-
-                        // copy/backprop
-                        Propagate(shared_from_this(), inShape, frFrom, outShape, frTo, /*isForward=*/false, -1 /*subtract*/);
-                    });
+                                                        {
+                                                            // determine FrameRanges for from and to
+                                                            FrameRange frTo;
+                                                            DetermineBoundaryToFrameRange(fr, toSeqInfo, T, frTo);
+                                                            FrameRange frFrom = frTo.WithTimeOffset(m_fromOffset);
+                                                            assert((int) frFrom.timeIdxInSeq + frFrom.m_timeOffset >= 0 && (int) frFrom.timeIdxInSeq + frFrom.m_timeOffset + (int) frFrom.m_timeRange <= (int) T);
+
+                                                            // copy/backprop
+                                                            Propagate(shared_from_this(), inShape, frFrom, outShape, frTo, /*isForward=*/false, -1 /*subtract*/);
+                                                        });
                 }
             }
         }
@@ -733,7 +733,7 @@ class DelayedValueNodeBase : public ComputationNode<ElemType>, public IRecurrent
         m_initialActivationValue = initialActivationValue;
         m_timeStep = 1;
         CreateMatrixIfNull(m_value);
-        SetDims(sampleLayout, HasMBLayout()/*false at this point*/);
+        SetDims(sampleLayout, HasMBLayout() /*false at this point*/);
         m_value->SetValue(m_initialActivationValue); // is this needed?
     }
 
@@ -778,7 +778,7 @@ class DelayedValueNodeBase : public ComputationNode<ElemType>, public IRecurrent
 
         fstream << m_timeStep;
         size_t colsDummy = 0;
-        fstream << GetSampleMatrixNumRows() << colsDummy;   // #rows saved for legacy file format
+        fstream << GetSampleMatrixNumRows() << colsDummy; // #rows saved for legacy file format
 
         fstream << m_initialActivationValue;
     }
@@ -790,11 +790,11 @@ class DelayedValueNodeBase : public ComputationNode<ElemType>, public IRecurrent
 
         fstream >> m_timeStep;
 
-            size_t rows, colsDummy;
-            fstream >> rows >> colsDummy;
+        size_t rows, colsDummy;
+        fstream >> rows >> colsDummy;
 
-        SetDims(TensorShape(rows), HasMBLayout()/*may be true on reload (roll-back)*/);  // tensor shape will be overwritten in Validate()  --TODO: We should serialize it here.
-        m_delayedValue.Resize(rows, 0); // Note: If we try to access history in first minibatch, we shall crash. It would be a consequence of a missing sentence-begin flag
+        SetDims(TensorShape(rows), HasMBLayout() /*may be true on reload (roll-back)*/); // tensor shape will be overwritten in Validate()  --TODO: We should serialize it here.
+        m_delayedValue.Resize(rows, 0);                                                  // Note: If we try to access history in first minibatch, we shall crash. It would be a consequence of a missing sentence-begin flag
 
         if (modelVersion >= CNTK_MODEL_VERSION_2)
             fstream >> m_initialActivationValue;
diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h
index e527df655bca..ee39978673c7 100644
--- a/Source/ComputationNetworkLib/ReshapingNodes.h
+++ b/Source/ComputationNetworkLib/ReshapingNodes.h
@@ -252,7 +252,7 @@ class DeprecatedReshapeNode : public ReinterpretNodeBase<ElemType>
             {
                 if ((m_numTargetRows > rows && m_numTargetRows % rows != 0) || // grouping columns
                     (m_numTargetRows < rows && rows % m_numTargetRows != 0))   // splitting columns
-                    InvalidArgument("%ls %ls operation: output row dimension %d is not an integer multiple or divisor of input dimension %d", NodeName().c_str(), OperationName().c_str(), (int)m_numTargetRows, (int)rows);
+                    InvalidArgument("%ls %ls operation: output row dimension %d is not an integer multiple or divisor of input dimension %d", NodeName().c_str(), OperationName().c_str(), (int) m_numTargetRows, (int) rows);
                 if (rows * cols != m_numTargetRows * newCols)
                     LogicError("%ls %ls operation: unexpected dimension mismatch", NodeName().c_str(), OperationName().c_str());
             }
@@ -784,8 +784,7 @@ class RowSliceNode : public ComputationNode<ElemType>, public NumInputs<1>
 
         // RowSlice cannot slice tensors.
         // TODO: Create a TensorSlice operation, or just Slice.
-        if (isFinalValidationPass && Input(0)->HasSampleLayout()
-            && !Input(0)->GetSampleLayout().IsVectorStoredAsImage() // legacy
+        if (isFinalValidationPass && Input(0)->HasSampleLayout() && !Input(0)->GetSampleLayout().IsVectorStoredAsImage() // legacy
             )
             RuntimeError("%ls %ls operation: Input must be a vector, tensor shape [%s] not allowed.", NodeName().c_str(), OperationName().c_str(), string(Input(0)->GetSampleLayout()).c_str());
         SetDims(TensorShape(m_sliceHeight), HasMBLayout());
diff --git a/Source/ComputationNetworkLib/TrainingCriterionNodes.h b/Source/ComputationNetworkLib/TrainingCriterionNodes.h
index f1279ab23349..80f43bec74d7 100644
--- a/Source/ComputationNetworkLib/TrainingCriterionNodes.h
+++ b/Source/ComputationNetworkLib/TrainingCriterionNodes.h
@@ -822,7 +822,6 @@ class ClassBasedCrossEntropyWithSoftmaxNode : public ComputationNodeNonLooping /
     }
 
 private:
-
     void ComputeCEPartialToSoftmaxInputs(Matrix<ElemType>& inputGradientValues, Matrix<ElemType>& gradientValues, size_t y_t)
     {
         Matrix<ElemType>::MinusOneAt(inputGradientValues, y_t);
@@ -869,7 +868,6 @@ class ClassBasedCrossEntropyWithSoftmaxNode : public ComputationNodeNonLooping /
     }
 
 public:
-
     virtual void UpdateFunctionMBSize() override
     {
         // TODO: Resize temp matrices here (not doing so does not really fail since for full matrices, class Matrix will resize by itself)
@@ -882,7 +880,7 @@ class ClassBasedCrossEntropyWithSoftmaxNode : public ComputationNodeNonLooping /
             LogicError("ClassBasedCrossEntropyWithSoftmax (ForwardPropNonLooping()): The label matrix is not using CPU device. This will make computation slow, even though the label data is probably saved on GPU. Because of the external loop over time with explicit class id retrieved from the label matrix, the computation will be very slow if the label matrix is saved on GPU. However, this is only a constraint for label matrix and other matrices such as data are suggested to reside on GPU. ");
         // TODO: Get the label matrix into location=Both state.
 
-        auto & functionValues = Value();
+        auto& functionValues = Value();
 
         const size_t hdSize = Input(INPUTDATA)->GetSampleMatrixNumRows(); // hdSize
         assert(m_nbrCls == Input(CLASSPROBINDATA)->GetSampleMatrixNumRows());
@@ -943,7 +941,7 @@ class ClassBasedCrossEntropyWithSoftmaxNode : public ComputationNodeNonLooping /
                 Matrix<ElemType> weightForClass = Input(EMBEDDINGMATRIX)->ValueAsMatrix().ColumnSlice(lft_bnd, nbr_wrd); // [hdSize x nbr_wrd]
 
                 // buffer to hold the class-conditional distribution
-                Matrix<ElemType> softMax_t = m_softMax.ColumnSlice(sz, nbr_wrd);        // TODO: declare these outside of the loop to avoid the malloc
+                Matrix<ElemType> softMax_t = m_softMax.ColumnSlice(sz, nbr_wrd); // TODO: declare these outside of the loop to avoid the malloc
                 Matrix<ElemType> logSoftMax_t = m_logSoftmax.ColumnSlice(sz, nbr_wrd);
 
                 Matrix<ElemType> obs = Input(INPUTDATA)->ValueFor(fr); // hidden activation vector for current word token
@@ -1120,7 +1118,7 @@ class CRFNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>,
             for (size_t i = 0; i < nS; i++) // process all sequences one by one
             {
                 FrameRange sequenceRange = fr.Sequence(i); // FrameRange to select one sequence
-                auto & gradient = Input(2)->GradientAsMatrix();
+                auto& gradient = Input(2)->GradientAsMatrix();
                 TransGrdCompute(Input(0)->ValueFor(sequenceRange),
                                 DataWithMBLayoutFor(mAlpha, sequenceRange, Input(0)->GetMBLayout()),
                                 DataWithMBLayoutFor(mBeta, sequenceRange, Input(0)->GetMBLayout()),
@@ -1702,7 +1700,7 @@ class LogisticNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemTy
 
             if (isFinalValidationPass &&
                 !(Input(0)->GetSampleMatrixNumRows() == Input(2)->GetSampleMatrixNumRows() &&
-                 (Input(0)->GetMBLayout() == Input(2)->GetMBLayout() || !Input(0)->HasMBLayout() || !Input(0)->HasMBLayout())))
+                  (Input(0)->GetMBLayout() == Input(2)->GetMBLayout() || !Input(0)->HasMBLayout() || !Input(0)->HasMBLayout())))
             {
                 LogicError("The Matrix dimensions of the second argument weights the %ls %ls operation do not match.", NodeName().c_str(), OperationName().c_str());
             }
diff --git a/Source/EvalDll/CNTKEval.h b/Source/EvalDll/CNTKEval.h
index c3bec9554f67..a0771e83cf5b 100644
--- a/Source/EvalDll/CNTKEval.h
+++ b/Source/EvalDll/CNTKEval.h
@@ -58,6 +58,4 @@ class CNTKEval : public IEvaluateModel<ElemType>
     virtual void Destroy();
     virtual void ResetState();
 };
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp
index 8d708510aa3e..92e2362580f7 100644
--- a/Source/Math/CPUMatrix.cpp
+++ b/Source/Math/CPUMatrix.cpp
@@ -3639,11 +3639,11 @@ void CPUMatrix<ElemType>::Print(const char* matrixName, size_t rowFirst, size_t
             fprintf(stderr, "...\t");
         for (size_t j = colFirst; j <= colLast; j++)
             fprintf(stderr, "%.10f\t", us(i, j));
-        if (colLast < GetNumCols()-1)
+        if (colLast < GetNumCols() - 1)
             fprintf(stderr, "...\t");
         fprintf(stderr, "\n");
     }
-    if (rowLast < GetNumRows()-1)
+    if (rowLast < GetNumRows() - 1)
         fprintf(stderr, "...\n");
 }
 
diff --git a/Source/Math/CommonMatrix.h b/Source/Math/CommonMatrix.h
index b3c99adbe630..aea8716282cc 100644
--- a/Source/Math/CommonMatrix.h
+++ b/Source/Math/CommonMatrix.h
@@ -49,18 +49,17 @@ class MATH_API TracingGPUMemoryAllocator
     static void SetTraceLevel(int traceLevel);
     static bool IsTraceEnabled();
 
-    template<typename AllocatedElemType>
+    template <typename AllocatedElemType>
     static AllocatedElemType* Allocate(int deviceId, size_t numRows, size_t numCols);
 
-    template<typename AllocatedElemType>
+    template <typename AllocatedElemType>
     static AllocatedElemType* Allocate(int deviceId, size_t numElements);
 
-    template<typename AllocatedElemType>
+    template <typename AllocatedElemType>
     static void Free(int deviceId, AllocatedElemType* bufferPtr, bool ignoreCUDARetCode = false);
 
 private:
-
-    template<typename AllocatedElemType>
+    template <typename AllocatedElemType>
     static AllocatedElemType* AllocateNoTrace(int deviceId, size_t numElements);
 
     static std::pair<size_t, size_t> GetFreeAndTotalMemoryInMBs(int deviceId);
@@ -161,7 +160,7 @@ enum ElementWiseOperator
     Macro(ElementwiseProductWithTanhDerivativeFromOutput);            \
     Macro(ElementwiseProductWithLinearRectifierDerivativeFromOutput); \
     Macro(ElementwiseProductWithLogDerivativeFromOutput);             \
-    Macro(ElementwiseProductWithCosDerivative);                       \
+    Macro(ElementwiseProductWithCosDerivative); \
 //Macro(Index);
 
 #define ForAllTernaryOps(Macro) \
diff --git a/Source/Math/GPUMatrix.cu b/Source/Math/GPUMatrix.cu
index 1dd56c5b311a..9c030a427a30 100644
--- a/Source/Math/GPUMatrix.cu
+++ b/Source/Math/GPUMatrix.cu
@@ -108,7 +108,7 @@ const char* CudaErrString<curandStatus>(curandStatus)
 
 namespace Microsoft { namespace MSR { namespace CNTK {
 
-template<typename AllocatedElemType>
+template <typename AllocatedElemType>
 AllocatedElemType* TracingGPUMemoryAllocator::Allocate(int deviceId, size_t numRows, size_t numCols)
 {
     AllocatedElemType* deviceBufferPtr = AllocateNoTrace<AllocatedElemType>(deviceId, numRows * numCols);
@@ -116,14 +116,14 @@ AllocatedElemType* TracingGPUMemoryAllocator::Allocate(int deviceId, size_t numR
     if (IsTraceEnabled())
     {
         auto freeAndTotalMemory = GetFreeAndTotalMemoryInMBs(deviceId);
-        fprintf(stderr, "Allocated Matrix<%s> (Rows = %d, Cols = %d) buffer on DeviceId = %d, DeviceBufferPointer = %p; GPU Memory Free = %d MB of %d MB\n", typeid(AllocatedElemType).name(), (int)numRows, (int)numCols, (int)deviceId, (void*)deviceBufferPtr, (int)freeAndTotalMemory.first, (int)freeAndTotalMemory.second);
+        fprintf(stderr, "Allocated Matrix<%s> (Rows = %d, Cols = %d) buffer on DeviceId = %d, DeviceBufferPointer = %p; GPU Memory Free = %d MB of %d MB\n", typeid(AllocatedElemType).name(), (int) numRows, (int) numCols, (int) deviceId, (void*) deviceBufferPtr, (int) freeAndTotalMemory.first, (int) freeAndTotalMemory.second);
         Microsoft::MSR::CNTK::DebugUtil::PrintCallStack();
     }
 
     return deviceBufferPtr;
 }
 
-template<typename AllocatedElemType>
+template <typename AllocatedElemType>
 AllocatedElemType* TracingGPUMemoryAllocator::Allocate(int deviceId, size_t numElements)
 {
     AllocatedElemType* deviceBufferPtr = AllocateNoTrace<AllocatedElemType>(deviceId, numElements);
@@ -131,37 +131,37 @@ AllocatedElemType* TracingGPUMemoryAllocator::Allocate(int deviceId, size_t numE
     if (IsTraceEnabled())
     {
         auto freeAndTotalMemory = GetFreeAndTotalMemoryInMBs(deviceId);
-        fprintf(stderr, "Allocated array<%s> (NumElements = %d) on DeviceId = %d, DeviceBufferPointer = %p; GPU Memory Free = %d MB of %d MB\n", typeid(AllocatedElemType).name(), (int)numElements, (int)deviceId, (void*)deviceBufferPtr, (int)freeAndTotalMemory.first, (int)freeAndTotalMemory.second);
+        fprintf(stderr, "Allocated array<%s> (NumElements = %d) on DeviceId = %d, DeviceBufferPointer = %p; GPU Memory Free = %d MB of %d MB\n", typeid(AllocatedElemType).name(), (int) numElements, (int) deviceId, (void*) deviceBufferPtr, (int) freeAndTotalMemory.first, (int) freeAndTotalMemory.second);
         Microsoft::MSR::CNTK::DebugUtil::PrintCallStack();
     }
 
     return deviceBufferPtr;
 }
 
-template<typename AllocatedElemType>
+template <typename AllocatedElemType>
 void TracingGPUMemoryAllocator::Free(int deviceId, AllocatedElemType* bufferPtr, bool ignoreCUDARetCode /*= false*/)
 {
     PrepareDevice(deviceId);
     if (ignoreCUDARetCode)
-        cudaFree((void*)bufferPtr);
+        cudaFree((void*) bufferPtr);
     else
-        CUDA_CALL(cudaFree((void*)bufferPtr));
+        CUDA_CALL(cudaFree((void*) bufferPtr));
 
     if (IsTraceEnabled())
     {
         auto freeAndTotalMemory = GetFreeAndTotalMemoryInMBs(deviceId);
-        fprintf(stderr, "Freed buffer<%s> DeviceBufferPointer = %p on DeviceId = %d; GPU Memory Free = %d MB of %d MB\n", typeid(AllocatedElemType).name(), (void*)bufferPtr, (int)deviceId, (int)freeAndTotalMemory.first, (int)freeAndTotalMemory.second);
+        fprintf(stderr, "Freed buffer<%s> DeviceBufferPointer = %p on DeviceId = %d; GPU Memory Free = %d MB of %d MB\n", typeid(AllocatedElemType).name(), (void*) bufferPtr, (int) deviceId, (int) freeAndTotalMemory.first, (int) freeAndTotalMemory.second);
         Microsoft::MSR::CNTK::DebugUtil::PrintCallStack();
     }
 }
 
-template<typename AllocatedElemType>
+template <typename AllocatedElemType>
 AllocatedElemType* TracingGPUMemoryAllocator::AllocateNoTrace(int deviceId, size_t numElements)
 {
     AllocatedElemType* deviceBufferPtr;
 
     PrepareDevice(deviceId);
-    CUDA_CALL(cudaMalloc((void**)&deviceBufferPtr, sizeof(AllocatedElemType) * numElements));
+    CUDA_CALL(cudaMalloc((void**) &deviceBufferPtr, sizeof(AllocatedElemType) * numElements));
 
     return deviceBufferPtr;
 }
@@ -173,11 +173,11 @@ std::pair<size_t, size_t> TracingGPUMemoryAllocator::GetFreeAndTotalMemoryInMBs(
     size_t free, total;
     auto result = cudaMemGetInfo(&free, &total);
     if (result != cudaSuccess)
-        return { size_t(0), size_t(0) };
+        return {size_t(0), size_t(0)};
     else
     {
         size_t numBytesPerMB = 1 << 20;
-        return { free / numBytesPerMB, total / numBytesPerMB};
+        return {free / numBytesPerMB, total / numBytesPerMB};
     }
 }
 
@@ -607,8 +607,8 @@ void GPUMatrix<ElemType>::Clear()
     {
         if (m_computeDevice >= 0)
         {
-            // BUG: We do not check the CUDA return code for cudaFree here since this may get called 
-            // during processExit when cudaFree will fail. The destruction of CUDA objects during 
+            // BUG: We do not check the CUDA return code for cudaFree here since this may get called
+            // during processExit when cudaFree will fail. The destruction of CUDA objects during
             // process exit must be avoided
             TracingGPUMemoryAllocator::Free<ElemType>(m_computeDevice, m_pArray, true /*ignoreCUDARetCode*/);
             m_pArray = NULL;
@@ -5004,7 +5004,6 @@ template void TracingGPUMemoryAllocator::Free<size_t>(int, size_t*, bool);
 template void TracingGPUMemoryAllocator::Free<char>(int, char*, bool);
 template void TracingGPUMemoryAllocator::Free<float>(int, float*, bool);
 template void TracingGPUMemoryAllocator::Free<double>(int, double*, bool);
-
 }
 }
 }
diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h
index 7346f2fe2a1a..7688de10921d 100644
--- a/Source/Math/GPUMatrix.h
+++ b/Source/Math/GPUMatrix.h
@@ -116,11 +116,11 @@ class MATH_API GPUMatrix : public BaseMatrix<ElemType>
     static cublasHandle_t s_cuHandle[MaxGpus];
     static void* s_curandGenerator;
 
-    // Have to use disable the warning to avoid issues with __declspec(dllexport) on Windows (C4251).
-    // Also, NVCC FE corresponding warning has to be disabled, see MathCUDA.vcxproj.
-    // The only workaround is to use naked pointer.
+// Have to disable the warning to avoid issues with __declspec(dllexport) on Windows (C4251).
+// Also, NVCC FE corresponding warning has to be disabled, see MathCUDA.vcxproj.
+// The only workaround is to use naked pointer.
 #pragma warning(push)
-#pragma warning(disable: 4251)
+#pragma warning(disable : 4251)
     mutable std::unique_ptr<conc_stack<std::unique_ptr<GPUMatrix<ElemType>>>> m_workspace;
 #pragma warning(pop)
 
diff --git a/Source/Math/MatrixQuantizerCPU.h b/Source/Math/MatrixQuantizerCPU.h
index 190cc3fa71e2..53902bb2d718 100644
--- a/Source/Math/MatrixQuantizerCPU.h
+++ b/Source/Math/MatrixQuantizerCPU.h
@@ -34,6 +34,4 @@ class MatrixQuantizerCPU final : public MatrixQuantizerImpl<ElemType>
     void UnquantizeAsync(QuantizedMatrix<ElemType>& inQMatrix, Matrix<ElemType>& outMatrix, bool add = false) override;
     void WaitUnquantizeAsyncDone() override;
 };
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Source/Math/NoGPU.cpp b/Source/Math/NoGPU.cpp
index 0b721c1b8710..852559f6e841 100644
--- a/Source/Math/NoGPU.cpp
+++ b/Source/Math/NoGPU.cpp
@@ -2182,7 +2182,6 @@ bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE)
 
 template class CuDnnConvolutionEngineFactory<float>;
 template class CuDnnConvolutionEngineFactory<double>;
-
 }
 }
 }
diff --git a/Source/Math/QuantizedMatrix.h b/Source/Math/QuantizedMatrix.h
index 8f06f8028064..ff41be05d4f1 100644
--- a/Source/Math/QuantizedMatrix.h
+++ b/Source/Math/QuantizedMatrix.h
@@ -119,6 +119,4 @@ class MATH_API QuantizedMatrix
     template <typename T>
     friend class MatrixQuantizer;
 };
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Source/Readers/BinaryReader/BinaryFile.cpp b/Source/Readers/BinaryReader/BinaryFile.cpp
index 5edc3b3adbda..ea444d93a5f3 100644
--- a/Source/Readers/BinaryReader/BinaryFile.cpp
+++ b/Source/Readers/BinaryReader/BinaryFile.cpp
@@ -1403,6 +1403,4 @@ bool SectionStats::AccumulateData(ElemType* dataSource, size_t numRecords, size_
     // done with data
     return false;
 }
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Source/Readers/BinaryReader/BinaryReader.h b/Source/Readers/BinaryReader/BinaryReader.h
index 98b9c2c4c366..f23942d1a962 100644
--- a/Source/Readers/BinaryReader/BinaryReader.h
+++ b/Source/Readers/BinaryReader/BinaryReader.h
@@ -686,6 +686,4 @@ size_t RoundUp(size_t value, size_t size);
 // HIGH and LOW DWORD functions
 DWORD HIDWORD(size_t size);
 DWORD LODWORD(size_t size);
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Source/Readers/BinaryReader/BinaryWriter.cpp b/Source/Readers/BinaryReader/BinaryWriter.cpp
index 67ba842dc547..855aa5e1ea13 100644
--- a/Source/Readers/BinaryReader/BinaryWriter.cpp
+++ b/Source/Readers/BinaryReader/BinaryWriter.cpp
@@ -439,6 +439,4 @@ void BinaryWriter<ElemType>::SaveMapping(std::wstring saveId, const std::map<typ
 // instantiate all the combinations we expect to be used
 template class BinaryWriter<double>;
 template class BinaryWriter<float>;
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Source/Readers/BinaryReader/Exports.cpp b/Source/Readers/BinaryReader/Exports.cpp
index b66e3371efa2..d5c680a34582 100644
--- a/Source/Readers/BinaryReader/Exports.cpp
+++ b/Source/Readers/BinaryReader/Exports.cpp
@@ -43,6 +43,4 @@ extern "C" DATAWRITER_API void GetWriterD(IDataWriter<double>** pwriter)
 {
     GetWriter(pwriter);
 }
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Source/Readers/HTKMLFReader/DataReader.cpp b/Source/Readers/HTKMLFReader/DataReader.cpp
index 7e0132466b85..b226a08efb05 100644
--- a/Source/Readers/HTKMLFReader/DataReader.cpp
+++ b/Source/Readers/HTKMLFReader/DataReader.cpp
@@ -58,6 +58,4 @@ void Trim(std::string& str)
         str.erase(found + 1);
 }
 #endif
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Source/Readers/ImageReader/Exports.cpp b/Source/Readers/ImageReader/Exports.cpp
index 596ecbbac849..f1e821ca7e6c 100644
--- a/Source/Readers/ImageReader/Exports.cpp
+++ b/Source/Readers/ImageReader/Exports.cpp
@@ -26,6 +26,4 @@ extern "C" DATAREADER_API void GetReaderD(IDataReader<double>** preader)
 {
     GetReader(preader);
 }
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Source/Readers/Kaldi2Reader/htkfeatio_utils.h b/Source/Readers/Kaldi2Reader/htkfeatio_utils.h
index 2af50090ec43..d4680a9d46d6 100644
--- a/Source/Readers/Kaldi2Reader/htkfeatio_utils.h
+++ b/Source/Readers/Kaldi2Reader/htkfeatio_utils.h
@@ -37,6 +37,4 @@ inline std::string trimmed(std::string str)
         str.erase(found + 1);
 
     return str;
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Source/Readers/LMSequenceReader/Exports.cpp b/Source/Readers/LMSequenceReader/Exports.cpp
index 9a7746eb9450..e0474481fe13 100644
--- a/Source/Readers/LMSequenceReader/Exports.cpp
+++ b/Source/Readers/LMSequenceReader/Exports.cpp
@@ -26,6 +26,4 @@ extern "C" DATAREADER_API void GetReaderD(IDataReader<double>** preader)
 {
     GetReader(preader);
 }
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Source/Readers/LUSequenceReader/Exports.cpp b/Source/Readers/LUSequenceReader/Exports.cpp
index 5f6a968ec6ba..9c9c31c3ab21 100644
--- a/Source/Readers/LUSequenceReader/Exports.cpp
+++ b/Source/Readers/LUSequenceReader/Exports.cpp
@@ -28,6 +28,4 @@ extern "C" DATAREADER_API void GetReaderD(IDataReader<double>** preader)
 {
     GetReader(preader);
 }
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Source/Readers/LUSequenceReader/LUSequenceWriter.cpp b/Source/Readers/LUSequenceReader/LUSequenceWriter.cpp
index c9f87ab6cabc..e6ef3083d1f9 100644
--- a/Source/Readers/LUSequenceReader/LUSequenceWriter.cpp
+++ b/Source/Readers/LUSequenceReader/LUSequenceWriter.cpp
@@ -177,6 +177,4 @@ void LUSequenceWriter<ElemType>::Save(std::wstring& outputFile, const Matrix<Ele
 
 template class LUSequenceWriter<float>;
 template class LUSequenceWriter<double>;
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Source/Readers/LUSequenceReader/LUSequenceWriter.h b/Source/Readers/LUSequenceReader/LUSequenceWriter.h
index 80d0810d6453..3e0d340c78e0 100644
--- a/Source/Readers/LUSequenceReader/LUSequenceWriter.h
+++ b/Source/Readers/LUSequenceReader/LUSequenceWriter.h
@@ -60,6 +60,4 @@ class LUSequenceWriter : public IDataWriter<ElemType>
     virtual void Destroy();
     virtual bool SaveData(size_t recordStart, const std::map<std::wstring, void*, nocase_compare>& matrices, size_t numRecords, size_t datasetSize, size_t byteVariableSized);
 };
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Source/Readers/UCIFastReader/Exports.cpp b/Source/Readers/UCIFastReader/Exports.cpp
index 73aa0118df34..3a29033f0909 100644
--- a/Source/Readers/UCIFastReader/Exports.cpp
+++ b/Source/Readers/UCIFastReader/Exports.cpp
@@ -26,6 +26,4 @@ extern "C" DATAREADER_API void GetReaderD(IDataReader<double>** preader)
 {
     GetReader(preader);
 }
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Source/SGDLib/DataReaderHelpers.h b/Source/SGDLib/DataReaderHelpers.h
index 610578400b8b..e2d1a47923ec 100644
--- a/Source/SGDLib/DataReaderHelpers.h
+++ b/Source/SGDLib/DataReaderHelpers.h
@@ -13,203 +13,203 @@
 
 namespace Microsoft { namespace MSR { namespace CNTK {
 
-    /*static*/ struct DataReaderHelpers
-    {
+/*static*/ struct DataReaderHelpers
+{
+
+    // -------------------------------------------------------------------
+    // GetMinibatchIntoNetwork() -- get one minibatch from Reader (this->trainSetDataReader) into Network (this->net)
+    // Returns false if no data is read. In that case, no other return value can be expected to contain meaningful values (e.g. actualMBSize will be unchanged).
+    // Sets actualMBSize to the number of matrix columns. Note that 0 is a valid value to be returned for actualMBSize, caller must handle that correctly.
+    // -------------------------------------------------------------------
 
-        // -------------------------------------------------------------------
-        // GetMinibatchIntoNetwork() -- get one minibatch from Reader (this->trainSetDataReader) into Network (this->net)
-        // Returns false if no data is read. In that case, no other return value can be expected to contain meaningful values (e.g. actualMBSize will be unchanged).
-        // Sets actualMBSize to the number of matrix columns. Note that 0 is a valid value to be returned for actualMBSize, caller must handle that correctly.
-        // -------------------------------------------------------------------
-
-        // Note: This will go away with the redesigned reader interface.
-        // TODO: callers of this often do ComputationNetwork::BumpEvalTimeStamp(featureNodes) and also for labels; we should eliminate the need for this.
-        template <class ElemType>
-        static bool GetMinibatchIntoNetwork(IDataReader<ElemType>& trainSetDataReader,
-            ComputationNetworkPtr net,
-            ComputationNodeBasePtr criterionNode,
-            bool useDistributedMBReading,
-            bool useParallelTrain,
+    // Note: This will go away with the redesigned reader interface.
+    // TODO: callers of this often do ComputationNetwork::BumpEvalTimeStamp(featureNodes) and also for labels; we should eliminate the need for this.
+    template <class ElemType>
+    static bool GetMinibatchIntoNetwork(IDataReader<ElemType>& trainSetDataReader,
+                                        ComputationNetworkPtr net,
+                                        ComputationNodeBasePtr criterionNode,
+                                        bool useDistributedMBReading,
+                                        bool useParallelTrain,
                                         std::map<std::wstring, Matrix<ElemType>*>& inputMatrices,
                                         size_t& actualMBSize)
+    {
+        auto pMBLayout = net->GetMBLayoutPtr();
+        // Reading consists of a sequence of Reader API calls:
+        //  - GetMinibatch() --fills the inputMatrices
+        //  - SetActualMiniBatchSizeFromFeatures()  --tells Network to resize the nodes' buffers
+        //  - CopyMBLayoutTo()   --copies the MBLayout from Reader to Network
+        //  - VerifyActualNumParallelSequences()  --(refactoring left-over) verify that MBLayout is consistent with #parallel sequences
+        // with the special twist that in presence of parallelization, there is some decimation involved.
+
+        bool wasDataRead = trainSetDataReader.GetMinibatch(inputMatrices); // fill in the minibatch data into the Input nodes' buffers directly
+        // If this returns false, the matrices may contain garbage or may not be sized to 0 columns.
+        // On the other hand, if it returns a 0-column matrix, that would be a perfectly cromulent minibatch (in case of data parallelism with distributed reading).
+
+        // if no data read then we are done
+        if (!wasDataRead)
+            return false;
+
+        // get some additional information when doing sequence training
+        // TODO: This should not need to be called in case of wasDataRead == false, since in that case, returned values are invalid.
+        if ((criterionNode != nullptr) && (criterionNode->OperationName() == L"SequenceWithSoftmax"))
         {
-            auto pMBLayout = net->GetMBLayoutPtr();
-            // Reading consists of a sequence of Reader API calls:
-            //  - GetMinibatch() --fills the inputMatrices
-            //  - SetActualMiniBatchSizeFromFeatures()  --tells Network to resize the nodes' buffers
-            //  - CopyMBLayoutTo()   --copies the MBLayout from Reader to Network
-            //  - VerifyActualNumParallelSequences()  --(refactoring left-over) verify that MBLayout is consistent with #parallel sequences
-            // with the special twist that in presence of parallelization, there is some decimation involved.
-
-            bool wasDataRead = trainSetDataReader.GetMinibatch(inputMatrices); // fill in the minibatch data into the Input nodes' buffers directly
-            // If this returns false, the matrices may contain garbage or not sized to 0 columns.
-            // On the other hand, if it returns a 0-column matrix, that would be a perfectly cromulent minibatch (in case of data parallelism with distributed reading).
-
-            // if no data read then we are done
-            if (!wasDataRead)
-                return false;
-
-            // get some additional information when doing sequence training
-            // TODO: This should not need to be called in case of wasDataRead == false, since in that case, returned values are invalid.
-            if ((criterionNode != nullptr) && (criterionNode->OperationName() == L"SequenceWithSoftmax"))
-            {
-                auto node = dynamic_pointer_cast<SequenceWithSoftmaxNode<ElemType>>(criterionNode);
-                auto latticeinput = node->getLatticePtr();
-                auto uids = node->getuidprt();
-                auto boundaries = node->getboundaryprt();
-                auto extrauttmap = node->getextrauttmap();
+            auto node = dynamic_pointer_cast<SequenceWithSoftmaxNode<ElemType>>(criterionNode);
+            auto latticeinput = node->getLatticePtr();
+            auto uids = node->getuidprt();
+            auto boundaries = node->getboundaryprt();
+            auto extrauttmap = node->getextrauttmap();
 
-                trainSetDataReader.GetMinibatch4SE(*latticeinput, *uids, *boundaries, *extrauttmap);
-            }
-
-            // get layout meta-data
-            trainSetDataReader.CopyMBLayoutTo(pMBLayout);
-
-            // decimate if needed. Decimation happens in-place.
-            if (!useDistributedMBReading && useParallelTrain)
-                DecimateMinibatch(inputMatrices, g_mpi->NumNodesInUse(), g_mpi->CurrentNodeRank(), net->GetMBLayoutPtr());
-
-            // reader will have resized input node's m_value directly. Nodes must be notified to do necessary internal state updates from that.
-            // TODO: This is a stopgap. SGD will at some point change from sets of matrices to sets of nodes. Then this will become much simpler.
-            std::set<Matrix<ElemType>*> matrices;
-            for (const auto & iter : inputMatrices)
-                matrices.insert(iter.second);
-            for (auto & node : net->FeatureNodes())
-                if (matrices.find(&node->As<ComputationNode<ElemType>>()->Value()) != matrices.end())
-                    node->NotifyFunctionValuesMBSizeModified();
-            for (auto & node : net->LabelNodes())
-                if (matrices.find(&node->As<ComputationNode<ElemType>>()->Value()) != matrices.end())
-                    node->NotifyFunctionValuesMBSizeModified();
-
-            // get MB size and tell Network to update its nodes' buffers based on what's in the input matrices
-            // Note: Decimation may have reduced this to 0 frames. We still must return 'true'.
-            // BUGBUG: This has a definitional problem once we support multiple feature streams with different lenghts.
-            actualMBSize = net->DetermineActualMBSizeFromFeatures();
-
-            return true;
+            trainSetDataReader.GetMinibatch4SE(*latticeinput, *uids, *boundaries, *extrauttmap);
         }
 
-        // -------------------------------------------------------------------
-        // DecimateMinibatch - decimate minibatch for parallelization
-        // -------------------------------------------------------------------
-        // non-inplace decimation , to be used in subminibatch implementation 
-        // returns a subset of parallel sequences
-        template <class ElemType>
-        static pair<size_t, size_t> DecimateMinibatch(const std::map<std::wstring, Matrix<ElemType>*> MB,     // input matrices
-                                                      std::map<std::wstring, Matrix<ElemType>*>& decimatedMB, // output decimated matrices.
-                                                      MBLayoutPtr pMBLayout,                                  // input MBLayout
-                                                      MBLayoutPtr& pDecimateMBLayout,                         // output decimated MBLayout (note: cannot work in-place)
-                                                      int numWorker, int rank)
+        // get layout meta-data
+        trainSetDataReader.CopyMBLayoutTo(pMBLayout);
+
+        // decimate if needed. Decimation happens in-place.
+        if (!useDistributedMBReading && useParallelTrain)
+            DecimateMinibatch(inputMatrices, g_mpi->NumNodesInUse(), g_mpi->CurrentNodeRank(), net->GetMBLayoutPtr());
+
+        // reader will have resized input node's m_value directly. Nodes must be notified to do necessary internal state updates from that.
+        // TODO: This is a stopgap. SGD will at some point change from sets of matrices to sets of nodes. Then this will become much simpler.
+        std::set<Matrix<ElemType>*> matrices;
+        for (const auto& iter : inputMatrices)
+            matrices.insert(iter.second);
+        for (auto& node : net->FeatureNodes())
+            if (matrices.find(&node->As<ComputationNode<ElemType>>()->Value()) != matrices.end())
+                node->NotifyFunctionValuesMBSizeModified();
+        for (auto& node : net->LabelNodes())
+            if (matrices.find(&node->As<ComputationNode<ElemType>>()->Value()) != matrices.end())
+                node->NotifyFunctionValuesMBSizeModified();
+
+        // get MB size and tell Network to update its nodes' buffers based on what's in the input matrices
+        // Note: Decimation may have reduced this to 0 frames. We still must return 'true'.
+        // BUGBUG: This has a definitional problem once we support multiple feature streams with different lengths.
+        actualMBSize = net->DetermineActualMBSizeFromFeatures();
+
+        return true;
+    }
+
+    // -------------------------------------------------------------------
+    // DecimateMinibatch - decimate minibatch for parallelization
+    // -------------------------------------------------------------------
+    // non-inplace decimation, to be used in the subminibatch implementation
+    // returns a subset of parallel sequences
+    template <class ElemType>
+    static pair<size_t, size_t> DecimateMinibatch(const std::map<std::wstring, Matrix<ElemType>*> MB,     // input matrices
+                                                  std::map<std::wstring, Matrix<ElemType>*>& decimatedMB, // output decimated matrices.
+                                                  MBLayoutPtr pMBLayout,                                  // input MBLayout
+                                                  MBLayoutPtr& pDecimateMBLayout,                         // output decimated MBLayout (note: cannot work in-place)
+                                                  int numWorker, int rank)
+    {
+        size_t numParallelSequences = pMBLayout->GetNumParallelSequences();
+        size_t nT = pMBLayout->GetNumTimeSteps();
+
+        // decide start column and end column
+        size_t st = numParallelSequences * (size_t) rank / numWorker;
+        size_t en = numParallelSequences * (size_t)(rank + 1) / numWorker;
+        en = en > numParallelSequences ? numParallelSequences : en; // TODO: why are these two tests necessary?
+        en = (rank == numWorker - 1) ? numParallelSequences : en;
+        size_t numNewParallelSequence = en - st;
+
+        // begin decimate matrices
+        size_t rv = 0;
+        for (const auto& it : MB)
         {
-            size_t numParallelSequences = pMBLayout->GetNumParallelSequences();
-            size_t nT = pMBLayout->GetNumTimeSteps();
-
-            // decide start column and end column 
-            size_t st = numParallelSequences * (size_t)rank / numWorker;
-            size_t en = numParallelSequences * (size_t)(rank + 1) / numWorker;
-            en = en > numParallelSequences ? numParallelSequences : en; // TODO: why are these two tests necessary?
-            en = (rank == numWorker - 1) ? numParallelSequences : en;
-            size_t numNewParallelSequence = en - st;
-
-            // begin decimate matrices 
-            size_t rv = 0;
-            for (const auto& it : MB)
-            {
-                wstring name = it.first;
+            wstring name = it.first;
             MSR::CNTK::Matrix<ElemType>& mat = *it.second;
-                size_t numRows = mat.GetNumRows();
-                size_t numCols = mat.GetNumCols();
-                int devID = mat.GetDeviceId();
+            size_t numRows = mat.GetNumRows();
+            size_t numCols = mat.GetNumCols();
+            int devID = mat.GetDeviceId();
 
-                if (rv == 0)
-                    rv = numCols;
-                else if (rv != numCols)
+            if (rv == 0)
+                rv = numCols;
+            else if (rv != numCols)
                 LogicError("DecimateMinibatch: Inconsistent number of columns among inputs (found %d and %d).", (int) rv, (int) numCols);
 
-                if (nT != numCols / numParallelSequences)
-                    LogicError("ERROR: MBLayout borked, GetNumTimeSteps() mismatches minibatch number of columns\n");
+            if (nT != numCols / numParallelSequences)
+                LogicError("ERROR: MBLayout borked, GetNumTimeSteps() mismatches minibatch number of columns\n");
 
-                decimatedMB[name] = new Matrix<ElemType>(devID);
+            decimatedMB[name] = new Matrix<ElemType>(devID);
             decimatedMB[name]->AssignRowSliceValuesOf(mat.Reshaped(numRows * numParallelSequences, nT), st * numRows, (en - st) * numRows);
             decimatedMB[name]->Reshape(numRows, numNewParallelSequence * nT);
-                // If we had a RowSlice function, we would like to write in this way 
-                // decimatedMB[name]->SetValue(mat.Reshaped(nRows*nSequence, nT).RowSlice( st*nRows , (en-st)*nRows).Reshaped(nRows, nNewParallelSequence*nT));
-            }
-            // decimate MBLayout as well 
-            pDecimateMBLayout = make_shared<MBLayout>(numNewParallelSequence, nT);
+            // If we had a RowSlice function, we would like to write in this way
+            // decimatedMB[name]->SetValue(mat.Reshaped(nRows*nSequence, nT).RowSlice( st*nRows , (en-st)*nRows).Reshaped(nRows, nNewParallelSequence*nT));
+        }
+        // decimate MBLayout as well
+        pDecimateMBLayout = make_shared<MBLayout>(numNewParallelSequence, nT);
 #if 1
-            // now copy over all sequence info records that are inside the range, with adjusted 's'
-            const auto& sequences = pMBLayout->GetAllSequences();
-            for (const auto& seq : sequences)
+        // now copy over all sequence info records that are inside the range, with adjusted 's'
+        const auto& sequences = pMBLayout->GetAllSequences();
+        for (const auto& seq : sequences)
+        {
+            if (seq.s >= st && seq.s < en)
             {
-                if (seq.s >= st && seq.s < en)
-                {
-                    auto shiftedSeq = seq;
+                auto shiftedSeq = seq;
                 shiftedSeq.s -= st; // these sequences have shifted up by 'st' sequences
-                    pDecimateMBLayout->AddSequence(shiftedSeq);
-                }
+                pDecimateMBLayout->AddSequence(shiftedSeq);
             }
+        }
 #else
         for (size_t t = 0; t < nT; t++)
             for (size_t id = 0; id < numNewParallelSequence; id++)
                 pDecimateMBLayout->Set(id, t, pMBLayout->Get(id + st, t));
 #endif
 
-            return pair<size_t, size_t>(st, en);
-        }
+        return pair<size_t, size_t>(st, en);
+    }
 
-        // in-place decimation, for use with data-parallel processing
-        // returns a subset of parallel sequences
-        template <class ElemType>
-        static pair<size_t, size_t> DecimateMinibatch(std::map<std::wstring, Matrix<ElemType>*>& mb, // matrix to be decimated
-                                                      int numprocs, int rank,                        // rank info
-                                                      MBLayoutPtr pMBLayout)                         // get decimated as well
+    // in-place decimation, for use with data-parallel processing
+    // returns a subset of parallel sequences
+    template <class ElemType>
+    static pair<size_t, size_t> DecimateMinibatch(std::map<std::wstring, Matrix<ElemType>*>& mb, // matrix to be decimated
+                                                  int numprocs, int rank,                        // rank info
+                                                  MBLayoutPtr pMBLayout)                         // get decimated as well
+    {
+        if (numprocs == 1)
+            return pair<size_t, size_t>(0, pMBLayout->GetNumParallelSequences());
+        // no need to do inplace decimation if numproc == 1
+
+        // allocate space for non-inplace decimation
+        MBLayoutPtr pDecimatedMB = make_shared<MBLayout>();
+        std::map<wstring, Matrix<ElemType>*> decimatedMB;
+        // call the non-inplace decimation; its results are copied back into 'mb' and 'pMBLayout' below
+        pair<size_t, size_t> selected = DecimateMinibatch(mb, decimatedMB, pMBLayout, pDecimatedMB, numprocs, rank);
+        // move the data
+        for (auto k : mb)
         {
-            if (numprocs == 1)
-                return pair<size_t, size_t>(0, pMBLayout->GetNumParallelSequences());
-            // no need to do inplace decimation if numproc == 1 
-
-            // allocate space for non-inplace decimation 
-            MBLayoutPtr pDecimatedMB = make_shared<MBLayout>();
-            std::map<wstring, Matrix<ElemType>*> decimatedMB;
-            // call in-place decimation 
-            pair<size_t, size_t> selected = DecimateMinibatch(mb, decimatedMB, pMBLayout, pDecimatedMB, numprocs, rank);
-            // move the data 
-            for (auto k : mb)
-            {
-                auto name = k.first;
-                k.second->SetValue(*decimatedMB[name]);
-                delete decimatedMB[name];
-                decimatedMB[name] = nullptr;
-            }
-            pMBLayout->MoveFrom(pDecimatedMB);
-            return selected; 
+            auto name = k.first;
+            k.second->SetValue(*decimatedMB[name]);
+            delete decimatedMB[name];
+            decimatedMB[name] = nullptr;
         }
-
-        // ===================================================================
-        // SubminibatchHelpers -- helper for sub-minibatch implementation
-        // TODO: Can this just exist inside SGD.cpp?
-        // ===================================================================
-
-        // A sub-minibathc is a part of a minibatch which helps computing large minibatches that cannot load into GPU memory in one forward-backward computation 
-        // The usage would be : 
-        //        SubminibatchHelpers sbhelper;    
-        //        for (;;)
-        //        {
-        //            size_t nsb=sb.GetMinibatchIntoCache(...); 
-        //            for (size_t i=0; i<nsb; i++)
-        //            {
-        //                sbhelper.GetSubMinibatchToNet(i); 
-        //                net.Evaluate(criterionNodes[0]);
-        //                sbhelper.DoneWithCurrentSubMinibatch(); 
-        //            }
-        //            UpdateWeights(...);
-        //        }
+        pMBLayout->MoveFrom(pDecimatedMB);
+        return selected;
+    }
+
+    // ===================================================================
+    // SubminibatchHelpers -- helper for sub-minibatch implementation
+    // TODO: Can this just exist inside SGD.cpp?
+    // ===================================================================
+
+    // A sub-minibatch is a part of a minibatch; it helps compute large minibatches that cannot be loaded into GPU memory in one forward-backward computation
+    // The usage would be :
+    //        SubminibatchHelpers sbhelper;
+    //        for (;;)
+    //        {
+    //            size_t nsb=sbhelper.GetMinibatchIntoCache(...);
+    //            for (size_t i=0; i<nsb; i++)
+    //            {
+    //                sbhelper.GetSubMinibatchToNet(i);
+    //                net.Evaluate(criterionNodes[0]);
+    //                sbhelper.DoneWithCurrentSubMinibatch();
+    //            }
+    //            UpdateWeights(...);
+    //        }
 
     template <class ElemType>
     class SubminibatchDispatcher
     {
-        private:
+    private:
         typedef std::vector<shared_ptr<const msra::dbn::latticesource::latticepair>> Lattice;
         typedef std::vector<size_t> Uid;
         typedef std::vector<size_t> ExtrauttMap;
@@ -221,7 +221,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         typedef std::vector<size_t>* BoundariesPtr;
         typedef std::map<std::wstring, Matrix<ElemType>*> Matrices;
 
-            // member variables served as caching space 
+        // member variables served as caching space
         Matrices m_inputMatricesCache;
         MBLayoutPtr m_MBLayoutCache;
         Lattice m_LatticeCache;
@@ -234,16 +234,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         bool m_hasLattices;
 
         Matrices m_cachedGradient;
-            // we also need to remember where to put into the net
+        // we also need to remember where to put into the net
         MBLayoutPtr m_NetMBLayoutPtr;
         std::map<wstring, shared_ptr<ComputationNode<ElemType>>> m_LearnableNodePtr;
-            // followings are lattice-related 
+        // the following are lattice-related
         Matrices m_NetInputMatrixPtr; // TODO: camelCase for all m_Net...
         LatticePtr m_NetLatticePtr;
         UidPtr m_NetUidPtr;
         ExtrauttMapPtr m_NetExtrauttMapPtr;
         BoundariesPtr m_NetBoundariesPtr;
-            // we remember the pointer to the learnable Nodes so that we can accumulate the gradient once a sub-minibatch is done 
+        // we remember the pointer to the learnable Nodes so that we can accumulate the gradient once a sub-minibatch is done
 
         size_t m_numParallelSequences; // number of paralle sequence in the cached matrix and MBLayout
         size_t m_numSubminibatches;    // how many subminibatches we are going to use ?
@@ -252,196 +252,196 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         std::vector<shared_ptr<ComputationNode<ElemType>>> m_NetEvaluationNodes;
         std::map<wstring, shared_ptr<IStatefulNode>> m_NetStatefulNodes; // we need to Export/Import states of stateful nodes when we swtich subminibatches
 
-        private:
-            void EnumerateStatefulNodeWithRoot(ComputationNetwork& net, ComputationNodeBasePtr root, std::map<wstring, shared_ptr<IStatefulNode>>& statefulnode)
+    private:
+        void EnumerateStatefulNodeWithRoot(ComputationNetwork& net, ComputationNodeBasePtr root, std::map<wstring, shared_ptr<IStatefulNode>>& statefulnode)
+        {
+            const std::list<ComputationNodeBasePtr> evalorder = net.GetEvalOrder(root);
+            for (auto& x : evalorder)
             {
-                const std::list<ComputationNodeBasePtr> evalorder = net.GetEvalOrder(root);
-                for (auto& x : evalorder)
-                {
-                    wstring name = x->GetName();
+                wstring name = x->GetName();
                 if (statefulnode.find(name) != statefulnode.end())
                     continue; // already in the list
-                    shared_ptr<IStatefulNode> pNode = dynamic_pointer_cast<IStatefulNode>(x);
-                    if (pNode)
-                    {
-                        statefulnode[name] = pNode;
-                    }
+                shared_ptr<IStatefulNode> pNode = dynamic_pointer_cast<IStatefulNode>(x);
+                if (pNode)
+                {
+                    statefulnode[name] = pNode;
                 }
             }
+        }
 
-            std::map<wstring, shared_ptr<IStatefulNode>> EnumerateStatefulNode(ComputationNetwork& net,
-                                                                               const std::vector<ComputationNodeBasePtr>& criterionNode,
-                                                                               const std::vector<ComputationNodeBasePtr>& evaluationNode)
+        std::map<wstring, shared_ptr<IStatefulNode>> EnumerateStatefulNode(ComputationNetwork& net,
+                                                                           const std::vector<ComputationNodeBasePtr>& criterionNode,
+                                                                           const std::vector<ComputationNodeBasePtr>& evaluationNode)
+        {
+            std::map<wstring, shared_ptr<IStatefulNode>> statefulNodes;
+            for (auto& root : criterionNode)
             {
-                std::map<wstring, shared_ptr<IStatefulNode>> statefulNodes;
-                for (auto& root : criterionNode)
-                {
-                    EnumerateStatefulNodeWithRoot(net, root, statefulNodes);
-                }
-                for (auto& root : evaluationNode)
-                {
-                    EnumerateStatefulNodeWithRoot(net, root, statefulNodes);
-                }
-                return statefulNodes;
+                EnumerateStatefulNodeWithRoot(net, root, statefulNodes);
+            }
+            for (auto& root : evaluationNode)
+            {
+                EnumerateStatefulNodeWithRoot(net, root, statefulNodes);
             }
+            return statefulNodes;
+        }
 
-        public:
+    public:
         SubminibatchDispatcher()
             : m_MBLayoutCache(nullptr), m_NetLatticePtr(nullptr), m_NetExtrauttMapPtr(nullptr), m_NetUidPtr(nullptr), m_NetBoundariesPtr(nullptr)
         {
         }
 
         void Init(ComputationNetworkPtr& net,
-                      const std::list<ComputationNodeBasePtr>& learnableNodes,
-                      const std::vector<ComputationNodeBasePtr>& criterionNodes,
-                      const std::vector<ComputationNodeBasePtr>& evaluationNodes)
+                  const std::list<ComputationNodeBasePtr>& learnableNodes,
+                  const std::vector<ComputationNodeBasePtr>& criterionNodes,
+                  const std::vector<ComputationNodeBasePtr>& evaluationNodes)
+        {
+            m_MBLayoutCache = make_shared<MBLayout>();
+            m_NetCriterionAccumulator = make_shared<Matrix<ElemType>>(1, 1, net->GetDeviceId());
+            m_NetEvaluationAccumulator = make_shared<Matrix<ElemType>>(1, evaluationNodes.size(), net->GetDeviceId());
+            // remember ptrs to learnable nodes
+            for (auto x : learnableNodes)
             {
-                m_MBLayoutCache = make_shared<MBLayout>();
-                m_NetCriterionAccumulator = make_shared<Matrix<ElemType>>(1, 1, net->GetDeviceId());
-                m_NetEvaluationAccumulator = make_shared<Matrix<ElemType>>(1, evaluationNodes.size(), net->GetDeviceId());
-                // remember ptrs to learnable nodes
-                for (auto x : learnableNodes)
-                {
-                    shared_ptr<ComputationNode<ElemType>> pLearnableNode = dynamic_pointer_cast<ComputationNode<ElemType>>(x);
-                    wstring nodename = x->NodeName();
-                    m_LearnableNodePtr[nodename] = pLearnableNode;
-                }
-                for (auto& x : criterionNodes)
-                {
-                    m_NetCriterionNodes.push_back(dynamic_pointer_cast<ComputationNode<ElemType>>(x));
-                }
-                for (auto& x : evaluationNodes)
-                {
-                    m_NetEvaluationNodes.push_back(dynamic_pointer_cast<ComputationNode<ElemType>>(x));
-                }
+                shared_ptr<ComputationNode<ElemType>> pLearnableNode = dynamic_pointer_cast<ComputationNode<ElemType>>(x);
+                wstring nodename = x->NodeName();
+                m_LearnableNodePtr[nodename] = pLearnableNode;
+            }
+            for (auto& x : criterionNodes)
+            {
+                m_NetCriterionNodes.push_back(dynamic_pointer_cast<ComputationNode<ElemType>>(x));
+            }
+            for (auto& x : evaluationNodes)
+            {
+                m_NetEvaluationNodes.push_back(dynamic_pointer_cast<ComputationNode<ElemType>>(x));
+            }
             m_NetCriterionAccumulator->SetValue((ElemType) 0);
             m_NetEvaluationAccumulator->SetValue((ElemType) 0);
 
-                // emulate all the nodes, find nodes that have state 
-                m_NetStatefulNodes = EnumerateStatefulNode(*net, criterionNodes, evaluationNodes);
-                for (auto x : m_NetStatefulNodes)
-                {
-                    wstring name = x.first;
-                    m_NetStates[name] = vector<shared_ptr<INodeState>>();
-                }
-
-                // for sequence training 
-                if (criterionNodes[0]->OperationName() == L"SequenceWithSoftmax")
-                {
-                    auto node = dynamic_pointer_cast<SequenceWithSoftmaxNode<ElemType>>(criterionNodes[0]);
-                    assert(node);
-                    m_NetLatticePtr = node->getLatticePtr(); 
-                    m_NetExtrauttMapPtr = node->getextrauttmap(); 
-                    m_NetUidPtr = node->getuidprt();
-                    m_NetBoundariesPtr = node->getboundaryprt(); 
-                    m_hasLattices = true; 
-                }
-                else
-                {
-                    m_NetLatticePtr = nullptr; 
-                    m_NetExtrauttMapPtr = nullptr; 
-                    m_NetUidPtr = nullptr; 
-                    m_NetBoundariesPtr = nullptr; 
-                    m_hasLattices = false; 
-                }
+            // enumerate all the nodes, find nodes that have state
+            m_NetStatefulNodes = EnumerateStatefulNode(*net, criterionNodes, evaluationNodes);
+            for (auto x : m_NetStatefulNodes)
+            {
+                wstring name = x.first;
+                m_NetStates[name] = vector<shared_ptr<INodeState>>();
             }
 
-            ~SubminibatchDispatcher()
+            // for sequence training
+            if (criterionNodes[0]->OperationName() == L"SequenceWithSoftmax")
+            {
+                auto node = dynamic_pointer_cast<SequenceWithSoftmaxNode<ElemType>>(criterionNodes[0]);
+                assert(node);
+                m_NetLatticePtr = node->getLatticePtr();
+                m_NetExtrauttMapPtr = node->getextrauttmap();
+                m_NetUidPtr = node->getuidprt();
+                m_NetBoundariesPtr = node->getboundaryprt();
+                m_hasLattices = true;
+            }
+            else
             {
-                // TODO: remove these by using shared_ptr 
+                m_NetLatticePtr = nullptr;
+                m_NetExtrauttMapPtr = nullptr;
+                m_NetUidPtr = nullptr;
+                m_NetBoundariesPtr = nullptr;
+                m_hasLattices = false;
+            }
+        }
 
-                for (auto x : m_inputMatricesCache)
-                {
-                    delete x.second;
-                }
+        ~SubminibatchDispatcher()
+        {
+            // TODO: remove these by using shared_ptr
 
-                for (auto x : m_cachedGradient)
-                {
-                    delete x.second;
-                }
+            for (auto x : m_inputMatricesCache)
+            {
+                delete x.second;
             }
 
-            size_t GetMinibatchIntoCache(IDataReader<ElemType>& trainSetDataReader,
-                                         ComputationNetwork& net,
-                                     std::map<std::wstring, Matrix<ElemType>*>& inputMatrices,
-                                         size_t requestedSubminibatches)
+            for (auto x : m_cachedGradient)
             {
-                // first, remember interface to the net 
-                m_NetMBLayoutPtr = net.GetMBLayoutPtr();
-                m_NetInputMatrixPtr = inputMatrices;
+                delete x.second;
+            }
+        }
 
-                // second, get data from reader, stored it in cache 
-                // 1. for each key, allocate the specific matrix on device 
-                for (auto pa : inputMatrices)
+        size_t GetMinibatchIntoCache(IDataReader<ElemType>& trainSetDataReader,
+                                     ComputationNetwork& net,
+                                     std::map<std::wstring, Matrix<ElemType>*>& inputMatrices,
+                                     size_t requestedSubminibatches)
+        {
+            // first, remember interface to the net
+            m_NetMBLayoutPtr = net.GetMBLayoutPtr();
+            m_NetInputMatrixPtr = inputMatrices;
+
+            // second, get data from reader, store it in cache
+            // 1. for each key, allocate the specific matrix on device
+            for (auto pa : inputMatrices)
+            {
+                wstring name = pa.first;
+                Matrix<ElemType>* M = pa.second;
+                if (m_inputMatricesCache.find(name) == m_inputMatricesCache.end())
                 {
-                    wstring name = pa.first;
-                    Matrix<ElemType>* M = pa.second;
-                    if (m_inputMatricesCache.find(name) == m_inputMatricesCache.end())
-                    {
-                        m_inputMatricesCache[name] = new Matrix<ElemType>(*M, M->GetDeviceId()); // deep copy from M 
-                    }
-                    else
-                    {
-                        m_inputMatricesCache[name]->SetValue(*M);
-                    }
+                    m_inputMatricesCache[name] = new Matrix<ElemType>(*M, M->GetDeviceId()); // deep copy from M
                 }
-                // 2. MBlayout 
-                m_MBLayoutCache->CopyFrom(net.GetMBLayoutPtr());
-                size_t nParallelSequences = m_MBLayoutCache->GetNumParallelSequences();
-
-                // 3. for bits in seq. training 
-                if (m_hasLattices)
+                else
                 {
-                    m_LatticeCache.clear(); 
-                    m_uidCache.clear(); 
-                    m_extrauttmapCache.clear(); 
-                    m_BoundariesCache.clear();
-
-                    m_LatticeCache = *m_NetLatticePtr; 
-                    m_uidCache = *m_NetUidPtr; 
-                    m_extrauttmapCache = *m_NetExtrauttMapPtr; 
-                    m_BoundariesCache = *m_NetBoundariesPtr; 
+                    m_inputMatricesCache[name]->SetValue(*M);
                 }
+            }
+            // 2. MBlayout
+            m_MBLayoutCache->CopyFrom(net.GetMBLayoutPtr());
+            size_t nParallelSequences = m_MBLayoutCache->GetNumParallelSequences();
 
-                // subminibatches are cutted at the parallel sequence level; 
-                // if #requested subminibatch is larger than #parallel sequence, 
-                // we cannot split further; instead, each subsequence become a subminibatch 
-                size_t actualnumSubminibatches = requestedSubminibatches > nParallelSequences ? nParallelSequences : requestedSubminibatches;
+            // 3. for bits in seq. training
+            if (m_hasLattices)
+            {
+                m_LatticeCache.clear();
+                m_uidCache.clear();
+                m_extrauttmapCache.clear();
+                m_BoundariesCache.clear();
+
+                m_LatticeCache = *m_NetLatticePtr;
+                m_uidCache = *m_NetUidPtr;
+                m_extrauttmapCache = *m_NetExtrauttMapPtr;
+                m_BoundariesCache = *m_NetBoundariesPtr;
+            }
 
-                // 4. third, allocate space for accumulated gradient 
-                for (auto& n : m_LearnableNodePtr)
+            // subminibatches are cut at the parallel sequence level;
+            // if #requested subminibatches is larger than #parallel sequences,
+            // we cannot split further; instead, each subsequence becomes a subminibatch
+            size_t actualnumSubminibatches = requestedSubminibatches > nParallelSequences ? nParallelSequences : requestedSubminibatches;
+
+            // 4. third, allocate space for accumulated gradient
+            for (auto& n : m_LearnableNodePtr)
+            {
+                auto node = n.second;
+                if (node->IsParameterUpdateRequired())
                 {
-                    auto node = n.second;
-                    if (node->IsParameterUpdateRequired())
-                    {
-                        wstring nodeName = node->GetName();
+                    wstring nodeName = node->GetName();
                     shared_ptr<ComputationNode<ElemType>> pLearnableNode = node;
                     auto funvalue = pLearnableNode->Value(); // gradient may not be allocated when this function is first called
-                        size_t nrow = funvalue.GetNumRows();
-                        size_t ncol = funvalue.GetNumCols();
-                        if (m_cachedGradient.find(nodeName) == m_cachedGradient.end())
-                        {
-                            // not allocated yet 
-                            m_cachedGradient[nodeName] = new Matrix<ElemType>(nrow, ncol, funvalue.GetDeviceId());
+                    size_t nrow = funvalue.GetNumRows();
+                    size_t ncol = funvalue.GetNumCols();
+                    if (m_cachedGradient.find(nodeName) == m_cachedGradient.end())
+                    {
+                        // not allocated yet
+                        m_cachedGradient[nodeName] = new Matrix<ElemType>(nrow, ncol, funvalue.GetDeviceId());
                         m_cachedGradient[nodeName]->SetValue((ElemType) 0);
-                        }
                     }
                 }
-                // 5. for stateful node 
-                for (auto x : m_NetStatefulNodes)
+            }
+            // 5. for stateful node
+            for (auto x : m_NetStatefulNodes)
+            {
+                wstring name = x.first;
+                if (m_NetStates[name].empty())
                 {
-                    wstring name = x.first;
-                    if (m_NetStates[name].empty())
-                    {
-                        // this only happens in the first minibatch in an epoch
-                        m_NetStates[name].resize(actualnumSubminibatches);
-                    }
+                    // this only happens in the first minibatch in an epoch
+                    m_NetStates[name].resize(actualnumSubminibatches);
                 }
-
-                return (m_numSubminibatches = actualnumSubminibatches);
             }
 
-            void DecimateLattices(
+            return (m_numSubminibatches = actualnumSubminibatches);
+        }
+
+        void DecimateLattices(
             LatticePtr decimatedLattices,         /* output: lattices after decimation*/
             BoundariesPtr decimatedBoundaryPtr,   /* output: boundary after decimation*/
             ExtrauttMapPtr decimatedExtraMapPtr,  /* output: extramap after decimation*/
@@ -451,138 +451,138 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             const ExtrauttMap extraMaps,          /* input: extra map to be decimated */
             const Uid uids,                       /* input: uid to be decimated*/
             pair<size_t, size_t> parallelSeqRange /* input: what parallel sequence range we are looking at */
-                )
-            {
-                size_t parallelSeqStId = parallelSeqRange.first; 
-                size_t parallelSeqEnId = parallelSeqRange.second; 
+            )
+        {
+            size_t parallelSeqStId = parallelSeqRange.first;
+            size_t parallelSeqEnId = parallelSeqRange.second;
 
-                decimatedLattices->clear();
-                decimatedBoundaryPtr->clear(); 
-                decimatedExtraMapPtr->clear(); 
-                decimatedUidPtr->clear();
+            decimatedLattices->clear();
+            decimatedBoundaryPtr->clear();
+            decimatedExtraMapPtr->clear();
+            decimatedUidPtr->clear();
 
-                size_t stFrame = 0; 
-                for (size_t iUtt = 0; iUtt < extraMaps.size(); iUtt++)
-                {
-                    size_t numFramesInThisUtterance = lattices[iUtt]->getnumframes();
+            size_t stFrame = 0;
+            for (size_t iUtt = 0; iUtt < extraMaps.size(); iUtt++)
+            {
+                size_t numFramesInThisUtterance = lattices[iUtt]->getnumframes();
                 size_t iParallelSeq = extraMaps[iUtt]; // i-th utterance belongs to iParallelSeq-th parallel sequence
-                    if (iParallelSeq >= parallelSeqStId && iParallelSeq < parallelSeqEnId)
-                    {
-                        // this utterance has been selected 
-                        decimatedLattices->push_back(lattices[iUtt]);
+                if (iParallelSeq >= parallelSeqStId && iParallelSeq < parallelSeqEnId)
+                {
+                    // this utterance has been selected
+                    decimatedLattices->push_back(lattices[iUtt]);
                     decimatedBoundaryPtr->insert(decimatedBoundaryPtr->end(), boundaries.begin() + stFrame, boundaries.begin() + stFrame + numFramesInThisUtterance);
-                        decimatedUidPtr->insert(decimatedUidPtr->end(), uids.begin() + stFrame, uids.begin() + stFrame + numFramesInThisUtterance); 
-                        decimatedExtraMapPtr->push_back(extraMaps[iUtt] - parallelSeqStId);
-                    }
-                    stFrame += numFramesInThisUtterance; 
+                    decimatedUidPtr->insert(decimatedUidPtr->end(), uids.begin() + stFrame, uids.begin() + stFrame + numFramesInThisUtterance);
+                    decimatedExtraMapPtr->push_back(extraMaps[iUtt] - parallelSeqStId);
                 }
+                stFrame += numFramesInThisUtterance;
             }
+        }
 
-            void GetSubMinibatchToNet(size_t iSubminibatch)
-            {
-                Matrices decimatedMatrices;
-                MBLayoutPtr decimatedLayout;
-                pair<size_t, size_t> seqRange = DataReaderHelpers::DecimateMinibatch(m_inputMatricesCache, decimatedMatrices, m_MBLayoutCache, decimatedLayout, m_numSubminibatches, iSubminibatch);
-                //  NOTE: deimatedMatrices must be released by caller
+        void GetSubMinibatchToNet(size_t iSubminibatch)
+        {
+            Matrices decimatedMatrices;
+            MBLayoutPtr decimatedLayout;
+            pair<size_t, size_t> seqRange = DataReaderHelpers::DecimateMinibatch(m_inputMatricesCache, decimatedMatrices, m_MBLayoutCache, decimatedLayout, m_numSubminibatches, iSubminibatch);
+            //  NOTE: decimatedMatrices must be released by caller
 
-                // base on the seqRange, we do the decimation for lattices and related variables 
+            // based on the seqRange, we do the decimation for lattices and related variables
             if (m_hasLattices)
             {
-                    DecimateLattices(
-                        /*output */ 
-                                    m_NetLatticePtr, m_NetBoundariesPtr, m_NetExtrauttMapPtr, m_NetUidPtr, 
-                        /*input to be decimated */
+                DecimateLattices(
+                    /*output */
+                    m_NetLatticePtr, m_NetBoundariesPtr, m_NetExtrauttMapPtr, m_NetUidPtr,
+                    /*input to be decimated */
                     m_LatticeCache, m_BoundariesCache, m_extrauttmapCache, m_uidCache,
-                        /* what range we want ? */
+                    /* what range we want ? */
                     seqRange);
-                }
+            }
 
-                //m_NetInputMatrixPtr = decimatedMatrices;
-                for (auto& x : decimatedMatrices)
-                {
-                    wstring name = x.first;
-                    m_NetInputMatrixPtr[name]->SetValue(*x.second);
+            //m_NetInputMatrixPtr = decimatedMatrices;
+            for (auto& x : decimatedMatrices)
+            {
+                wstring name = x.first;
+                m_NetInputMatrixPtr[name]->SetValue(*x.second);
                 delete x.second; // TODO: is it safe to delete here ? Yes! SetValue call cuda memcpy so it is a blocking call
-                    x.second = nullptr;
-                }
+                x.second = nullptr;
+            }
 
-                m_NetMBLayoutPtr->CopyFrom(decimatedLayout);
+            m_NetMBLayoutPtr->CopyFrom(decimatedLayout);
 
-                for (auto& x : m_NetStatefulNodes)
-                {
-                    wstring name = x.first;
+            for (auto& x : m_NetStatefulNodes)
+            {
+                wstring name = x.first;
                 shared_ptr<IStatefulNode> pNode = x.second;
-                    if (m_NetStates[name][iSubminibatch])
-                        pNode->ImportState(std::move(m_NetStates[name][iSubminibatch]));
-                }
+                if (m_NetStates[name][iSubminibatch])
+                    pNode->ImportState(std::move(m_NetStates[name][iSubminibatch]));
             }
+        }
 
-            // TODO: encapsulate it into a destructor? Note: Cannot throw exceptions in destructor.
-            void DoneWithCurrentSubMinibatch(size_t iSubminibatch)
+        // TODO: encapsulate it into a destructor? Note: Cannot throw exceptions in destructor.
+        void DoneWithCurrentSubMinibatch(size_t iSubminibatch)
+        {
+            // accumulate gradient here
+            for (auto x : m_cachedGradient)
             {
-                // accumulate gradient here 
-                for (auto x : m_cachedGradient)
+                wstring nodename = x.first;
+                if (m_LearnableNodePtr.find(nodename) == m_LearnableNodePtr.end())
                 {
-                    wstring nodename = x.first;
-                    if (m_LearnableNodePtr.find(nodename) == m_LearnableNodePtr.end())
-                    {
-                        RuntimeError("ERROR: in DoneWithCurrentSubMinibatch: node %ls not found in LeanrableNode", nodename.c_str());
-                    }
-                    shared_ptr<ComputationNode<ElemType>> pNode = m_LearnableNodePtr[nodename];
-                    m_cachedGradient[nodename]->operator+=(pNode->Gradient());
-                pNode->Gradient().SetValue((ElemType) 0);
+                    RuntimeError("ERROR: in DoneWithCurrentSubMinibatch: node %ls not found in LeanrableNode", nodename.c_str());
                 }
-                // accumulate criterion value 
-                Matrix<ElemType>::AddElementToElement(m_NetCriterionNodes[0]->Value(), 0, 0,
-                                                      *m_NetCriterionAccumulator, 0, 0);
+                shared_ptr<ComputationNode<ElemType>> pNode = m_LearnableNodePtr[nodename];
+                m_cachedGradient[nodename]->operator+=(pNode->Gradient());
+                pNode->Gradient().SetValue((ElemType) 0);
+            }
+            // accumulate criterion value
+            Matrix<ElemType>::AddElementToElement(m_NetCriterionNodes[0]->Value(), 0, 0,
+                                                  *m_NetCriterionAccumulator, 0, 0);
             m_NetCriterionNodes[0]->Value().SetValue((ElemType) 0);
-                // accumulate evaluation value 
-                for (size_t i = 0; i < m_NetEvaluationNodes.size(); i++)
-                {
-                    Matrix<ElemType>::AddElementToElement(m_NetEvaluationNodes[i]->Value(), 0, 0,
-                                                          *m_NetEvaluationAccumulator, 0, i);
+            // accumulate evaluation value
+            for (size_t i = 0; i < m_NetEvaluationNodes.size(); i++)
+            {
+                Matrix<ElemType>::AddElementToElement(m_NetEvaluationNodes[i]->Value(), 0, 0,
+                                                      *m_NetEvaluationAccumulator, 0, i);
                 m_NetEvaluationNodes[i]->Value().SetValue((ElemType) 0);
-                }
+            }
 
-                // Export node state 
-                for (auto& x : m_NetStatefulNodes)
-                {
-                    wstring name = x.first;
-                    m_NetStates[name][iSubminibatch] = x.second->ExportState();
-                }
+            // Export node state
+            for (auto& x : m_NetStatefulNodes)
+            {
+                wstring name = x.first;
+                m_NetStates[name][iSubminibatch] = x.second->ExportState();
             }
+        }
 
-            void DoneWithCurrentMinibatch()
+        void DoneWithCurrentMinibatch()
+        {
+            for (auto& x : m_cachedGradient)
             {
-                for (auto& x : m_cachedGradient)
-                {
-                    wstring name = x.first;
-                    Matrix<ElemType>* accumulategrad = x.second;
+                wstring name = x.first;
+                Matrix<ElemType>* accumulategrad = x.second;
 
-                    if (m_LearnableNodePtr.find(name) == m_LearnableNodePtr.end())
-                    {
-                        // should never happen, remove this code later
-                        RuntimeError("ERROR: in DoneWithCurrentSubMinibatch: node %ls not found in LearnableNode", name.c_str());
-                    }
-                    m_LearnableNodePtr[name]->Gradient().SetValue(*accumulategrad);
-                x.second->SetValue((ElemType) 0);
+                if (m_LearnableNodePtr.find(name) == m_LearnableNodePtr.end())
+                {
+                    // should never happen, remove this code later
+                    RuntimeError("ERROR: in DoneWithCurrentSubMinibatch: node %ls not found in LearnableNode", name.c_str());
                 }
-                // also revert net.m_MBLayoutPtr
-                m_NetMBLayoutPtr->CopyFrom(m_MBLayoutCache);
+                m_LearnableNodePtr[name]->Gradient().SetValue(*accumulategrad);
+                x.second->SetValue((ElemType) 0);
+            }
+            // also revert net.m_MBLayoutPtr
+            m_NetMBLayoutPtr->CopyFrom(m_MBLayoutCache);
 
-                //m_NetCriterionNodes[0]->Value().SetValue((ElemType)0);
-                Matrix<ElemType>::AddElementToElement(*m_NetCriterionAccumulator, 0, 0,
-                                                      m_NetCriterionNodes[0]->Value(), 0, 0);
+            //m_NetCriterionNodes[0]->Value().SetValue((ElemType)0);
+            Matrix<ElemType>::AddElementToElement(*m_NetCriterionAccumulator, 0, 0,
+                                                  m_NetCriterionNodes[0]->Value(), 0, 0);
             m_NetCriterionAccumulator->SetValue((ElemType) 0);
 
-                for (size_t i = 0; i < m_NetEvaluationNodes.size(); i++)
-                {
-                    //m_NetEvaluationNodes[i]->Value().SetValue((ElemType)0);
-                    Matrix<ElemType>::AddElementToElement(*m_NetEvaluationAccumulator, 0, i,
-                                                          m_NetEvaluationNodes[i]->Value(), 0, 0);
-                }
-            m_NetEvaluationAccumulator->SetValue((ElemType) 0);
+            for (size_t i = 0; i < m_NetEvaluationNodes.size(); i++)
+            {
+                //m_NetEvaluationNodes[i]->Value().SetValue((ElemType)0);
+                Matrix<ElemType>::AddElementToElement(*m_NetEvaluationAccumulator, 0, i,
+                                                      m_NetEvaluationNodes[i]->Value(), 0, 0);
             }
-        };
+            m_NetEvaluationAccumulator->SetValue((ElemType) 0);
+        }
     };
+};
 } } }
diff --git a/Source/SGDLib/IDistGradAggregator.h b/Source/SGDLib/IDistGradAggregator.h
index 87616642c833..0a4c1124022c 100644
--- a/Source/SGDLib/IDistGradAggregator.h
+++ b/Source/SGDLib/IDistGradAggregator.h
@@ -41,6 +41,7 @@ class IDistGradAggregator
 };
 
 #define UsingIDistGradAggregatorMembers           \
+    \
 protected:                                        \
     using IDistGradAggregator<ElemType>::m_mpi;   \
     using IDistGradAggregator<ElemType>::NumProc; \
diff --git a/Source/SGDLib/MultiNetworksEvaluator.h b/Source/SGDLib/MultiNetworksEvaluator.h
index 55554da8777a..2b9604d295c5 100644
--- a/Source/SGDLib/MultiNetworksEvaluator.h
+++ b/Source/SGDLib/MultiNetworksEvaluator.h
@@ -509,9 +509,9 @@ class MultiNetworksEvaluator : public SimpleEvaluator<ElemType>
                     (*ptr)->ForwardProp(*ptr2);
             }
 
-            /// not the sentence begining, because the initial hidden layer activity is from the encoder network
-                //decoderNet->ResizeAllFeatureNodes(actualMBSize);  // BUGBUG: Function was deleted, but this may be necessary.
-#if 0           // What this ^^ used to be:
+/// not the sentence beginning, because the initial hidden layer activity is from the encoder network
+//decoderNet->ResizeAllFeatureNodes(actualMBSize);  // BUGBUG: Function was deleted, but this may be necessary.
+#if 0 // What this ^^ used to be:
                 // only called from MultiNetworksEvaluator
                 // a helper function for some places that like to hack the features directly
                 // This is for a few places (FindBestPath stuff) that don't follow the normal pattern but instead called the old SetFeaturesMiniBatchSize() function with a value of their choosing.
@@ -650,7 +650,7 @@ class MultiNetworksEvaluator : public SimpleEvaluator<ElemType>
         size_t bSize = best_path.size();
         for (int i = 0; i < outputNodes.size(); i++)
         {
-#if 0       // This call no longer exists. This must be updated to make it functional again.
+#if 0 // This call no longer exists. This must be updated to make it functional again.
             outputNodes[i]->SetNumCols(bSize);
 #endif
             dynamic_pointer_cast<ComputationNode<ElemType>>(outputNodes[i])->UpdateFunctionValuesSize();
@@ -781,9 +781,9 @@ class MultiNetworksEvaluator : public SimpleEvaluator<ElemType>
         ResetPreCompute();
         EvaluateBatchModeNodes(*evalnet, featureNodes);
 
-        /// need to set the minibatch size to 1, and initialize evalnet's sentence start information to let it know that this
-        /// is the begining of sentence
-#if 0       // This call no longer exists. This must be updated to make it functional again.
+/// need to set the minibatch size to 1, and initialize evalnet's sentence start information to let it know that this
+/// is the beginning of sentence
+#if 0 // This call no longer exists. This must be updated to make it functional again.
         for (auto ptr = featureNodes.begin(); ptr != featureNodes.end(); ptr++)
             (*ptr)->SetNumCols(1);
 #endif
@@ -925,7 +925,7 @@ class MultiNetworksEvaluator : public SimpleEvaluator<ElemType>
         size_t mbSize = inputLength;
         /// use reader to initialize evalnet's sentence start information to let it know that this
         /// is the beginning of sentence
-            //evalnet->ResizeAllFeatureNodes(mbSize);    // BUGBUG: Function was deleted, but this may be necessary.
+        //evalnet->ResizeAllFeatureNodes(mbSize);    // BUGBUG: Function was deleted, but this may be necessary.
         //evalnet->SetActualMiniBatchSizeFromFeatures();
         // TODO: not setting MBLayout?
         evalnet->VerifyActualNumParallelSequences(dataReader->GetNumParallelSequences());
@@ -957,7 +957,7 @@ class MultiNetworksEvaluator : public SimpleEvaluator<ElemType>
         /// is the begining of sentence
         // BUGBUG: This is almost certainly wrong; slice != MB size
         //evalnet->SetActualMiniBatchSize(dataReader->GetNumParallelSequences());
-            //evalnet->ResizeAllFeatureNodes(1);    // BUGBUG: Function was deleted, but this may be necessary.
+        //evalnet->ResizeAllFeatureNodes(1);    // BUGBUG: Function was deleted, but this may be necessary.
         //evalnet->SetActualMiniBatchSizeFromFeatures();
 
         double best_score = -numeric_limits<double>::infinity();
diff --git a/Source/SGDLib/MultiNetworksSGD.h b/Source/SGDLib/MultiNetworksSGD.h
index e0bf8a2e4612..166f757573db 100644
--- a/Source/SGDLib/MultiNetworksSGD.h
+++ b/Source/SGDLib/MultiNetworksSGD.h
@@ -256,7 +256,7 @@ class MultiNetworksSGD : SGD<ElemType>
             learnableNodes.push_back(*nodeIter);
 
         std::list<Matrix<ElemType>> smoothedGradients;
-#if 0   // No longer functional due to lack of GetNumCols().
+#if 0 // No longer functional due to lack of GetNumCols().
         for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
         {
             ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
@@ -556,7 +556,7 @@ class MultiNetworksSGD : SGD<ElemType>
         }
 
         std::list<Matrix<ElemType>> smoothedGradients;
-#if 0   // No longer functional due to lack of GetNumCols().
+#if 0 // No longer functional due to lack of GetNumCols().
         for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
         {
             ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp
index 0dc2de758211..e062f0cc4304 100644
--- a/Source/SGDLib/SGD.cpp
+++ b/Source/SGDLib/SGD.cpp
@@ -20,1211 +20,1211 @@
 
 namespace Microsoft { namespace MSR { namespace CNTK {
 
-    using namespace std;
+using namespace std;
 
-    // =======================================================================
-    // class SGD
-    // =======================================================================
+// =======================================================================
+// class SGD
+// =======================================================================
 
 template SGD<float>::SGD(const ConfigParameters&);
 template SGD<double>::SGD(const ConfigParameters&);
 template SGD<float>::SGD(const ScriptableObjects::IConfigRecord&);
 template SGD<double>::SGD(const ScriptableObjects::IConfigRecord&);
 
-    // -----------------------------------------------------------------------
-    // Train() -- perform a multi-epoch training end-to-end with checkpointing
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// Train() -- perform a multi-epoch training end-to-end with checkpointing
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    void SGD<ElemType>::Train(function<ComputationNetworkPtr(DEVICEID_TYPE)> createNetworkFn, DEVICEID_TYPE deviceId,
-                              IDataReader<ElemType>* trainSetDataReader,
-                              IDataReader<ElemType>* validationSetDataReader,
-                              const bool makeMode)
+void SGD<ElemType>::Train(function<ComputationNetworkPtr(DEVICEID_TYPE)> createNetworkFn, DEVICEID_TYPE deviceId,
+                          IDataReader<ElemType>* trainSetDataReader,
+                          IDataReader<ElemType>* validationSetDataReader,
+                          const bool makeMode)
+{
+    // determine which epoch to start with, including recovering a checkpoint if any and 'makeMode' enabled
+    int startEpoch = DetermineStartEpoch(makeMode);
+    if (startEpoch == m_maxEpochs)
     {
-        // determine which epoch to start with, including recoveing a checkpoint if any and 'makeMode' enabled
-        int startEpoch = DetermineStartEpoch(makeMode);
-        if (startEpoch == m_maxEpochs)
-        {
-            fprintf(stderr, "No further training is necessary.\n");
-            return;
-        }
+        fprintf(stderr, "No further training is necessary.\n");
+        return;
+    }
 
-        wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1);
-        if (startEpoch >= 0)
-            fprintf(stderr, "Starting from checkpoint. Load Network From File %ls.\n", modelFileName.c_str());
+    wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1);
+    if (startEpoch >= 0)
+        fprintf(stderr, "Starting from checkpoint. Load Network From File %ls.\n", modelFileName.c_str());
 
-        // create or load from checkpoint
-        shared_ptr<ComputationNetwork> net = startEpoch < 0 ? createNetworkFn(deviceId) : ComputationNetwork::CreateFromFile<ElemType>(deviceId, modelFileName);
+    // create or load from checkpoint
+    shared_ptr<ComputationNetwork> net = startEpoch < 0 ? createNetworkFn(deviceId) : ComputationNetwork::CreateFromFile<ElemType>(deviceId, modelFileName);
 
-        // log the device we are computing on
-        if (net->GetDeviceId() < 0)
-            fprintf(stderr, "\nSGD using CPU.\n");
-        else
+    // log the device we are computing on
+    if (net->GetDeviceId() < 0)
+        fprintf(stderr, "\nSGD using CPU.\n");
+    else
         fprintf(stderr, "\nSGD using GPU %d.\n", (int) net->GetDeviceId());
 
-        // TODO: BUGBUG: if not starting from checkpoint, need to synchronize initial model
-        // strategy should be to run the initializer above on mpiRank==0, and then broadcast parameters.
+    // TODO: BUGBUG: if not starting from checkpoint, need to synchronize initial model
+    // strategy should be to run the initializer above on mpiRank==0, and then broadcast parameters.
 
-        startEpoch = max(startEpoch, 0);
-        m_needAdaptRegularization = false;
+    startEpoch = max(startEpoch, 0);
+    m_needAdaptRegularization = false;
 
-        TrainOrAdaptModel(startEpoch, net, net, nullptr, trainSetDataReader, validationSetDataReader);
-    }
+    TrainOrAdaptModel(startEpoch, net, net, nullptr, trainSetDataReader, validationSetDataReader);
+}
 
-    // -----------------------------------------------------------------------
-    // Adapt() -- similar to Train(), but for purpose of adapting
-    // -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
+// Adapt() -- similar to Train(), but for purpose of adapting
+// -----------------------------------------------------------------------
 
 template <class ElemType>
-    void SGD<ElemType>::Adapt(wstring origModelFileName, wstring refNodeName,
-                              IDataReader<ElemType>* trainSetDataReader,
-                              IDataReader<ElemType>* validationSetDataReader,
-                              const DEVICEID_TYPE deviceId, const bool makeMode)
+void SGD<ElemType>::Adapt(wstring origModelFileName, wstring refNodeName,
+                          IDataReader<ElemType>* trainSetDataReader,
+                          IDataReader<ElemType>* validationSetDataReader,
+                          const DEVICEID_TYPE deviceId, const bool makeMode)
+{
+    int startEpoch = DetermineStartEpoch(makeMode);
+    if (startEpoch == m_maxEpochs)
     {
-        int startEpoch = DetermineStartEpoch(makeMode);
-        if (startEpoch == m_maxEpochs)
-        {
-            fprintf(stderr, "No further training is necessary.\n");
-            return;
-        }
-
-        ComputationNetworkPtr net;
-        if (startEpoch >= 0)
-        {
-            wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1);
-            fprintf(stderr, "Starting from checkpoint. Load Network From File %ls.\n", modelFileName.c_str());
-            net = ComputationNetwork::CreateFromFile<ElemType>(deviceId, modelFileName);
-        }
-        else
-        {
-            fprintf(stderr, "Load Network From the original model file %ls.\n", origModelFileName.c_str());
-            net = ComputationNetwork::CreateFromFile<ElemType>(deviceId, origModelFileName);
-        }
+        fprintf(stderr, "No further training is necessary.\n");
+        return;
+    }
 
-        startEpoch = max(startEpoch, 0);
+    ComputationNetworkPtr net;
+    if (startEpoch >= 0)
+    {
+        wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1);
+        fprintf(stderr, "Starting from checkpoint. Load Network From File %ls.\n", modelFileName.c_str());
+        net = ComputationNetwork::CreateFromFile<ElemType>(deviceId, modelFileName);
+    }
+    else
+    {
+        fprintf(stderr, "Load Network From the original model file %ls.\n", origModelFileName.c_str());
+        net = ComputationNetwork::CreateFromFile<ElemType>(deviceId, origModelFileName);
+    }
 
-        ComputationNetworkPtr refNet;
-        m_needAdaptRegularization = m_adaptationRegType != AdaptationRegType::None && m_adaptationRegWeight > 0;
-        if (m_needAdaptRegularization)
-        {
-            fprintf(stderr, "Load reference Network From the original model file %ls.\n", origModelFileName.c_str());
-            refNet = ComputationNetwork::CreateFromFile<ElemType>(deviceId, origModelFileName);
-        }
+    startEpoch = max(startEpoch, 0);
 
-        ComputationNodeBasePtr refNode;
-        if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL)
-        {
-            fprintf(stderr, "Checking refNodeName %ls.\n", origModelFileName.c_str());
-            if (refNodeName == L"")
-                InvalidArgument("refNodeName does not exist and is needed when adaptationRegType is KL.");
-            refNode = refNet->GetNodeFromName(refNodeName);
-        }
+    ComputationNetworkPtr refNet;
+    m_needAdaptRegularization = m_adaptationRegType != AdaptationRegType::None && m_adaptationRegWeight > 0;
+    if (m_needAdaptRegularization)
+    {
+        fprintf(stderr, "Load reference Network From the original model file %ls.\n", origModelFileName.c_str());
+        refNet = ComputationNetwork::CreateFromFile<ElemType>(deviceId, origModelFileName);
+    }
 
-        TrainOrAdaptModel(startEpoch, net, refNet, refNode, trainSetDataReader, validationSetDataReader);
+    ComputationNodeBasePtr refNode;
+    if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL)
+    {
+        fprintf(stderr, "Checking refNodeName %ls.\n", origModelFileName.c_str());
+        if (refNodeName == L"")
+            InvalidArgument("refNodeName does not exist and is needed when adaptationRegType is KL.");
+        refNode = refNet->GetNodeFromName(refNodeName);
     }
 
-    // -----------------------------------------------------------------------
-    // TrainOrAdaptModel() -- main training end-to-end, given a start model
-    // -----------------------------------------------------------------------
+    TrainOrAdaptModel(startEpoch, net, refNet, refNode, trainSetDataReader, validationSetDataReader);
+}
+
+// -----------------------------------------------------------------------
+// TrainOrAdaptModel() -- main training end-to-end, given a start model
+// -----------------------------------------------------------------------
 
-    static double MomentumPerMB(double momentumPerSample, size_t minibatchSize);
+static double MomentumPerMB(double momentumPerSample, size_t minibatchSize);
 
 template <class ElemType>
-    void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
-                                          ComputationNetworkPtr refNet,
-                                          ComputationNodeBasePtr refNode,
-                                          IDataReader<ElemType>* trainSetDataReader,
-                                          IDataReader<ElemType>* validationSetDataReader)
-    {
+void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
+                                      ComputationNetworkPtr refNet,
+                                      ComputationNodeBasePtr refNode,
+                                      IDataReader<ElemType>* trainSetDataReader,
+                                      IDataReader<ElemType>* validationSetDataReader)
+{
     auto& featureNodes = net->FeatureNodes();
     auto& labelNodes = net->LabelNodes();
     auto& criterionNodes = GetTrainCriterionNodes(net);
 
-        fprintf(stderr, "\nTraining criterion node(s):\n");
+    fprintf(stderr, "\nTraining criterion node(s):\n");
     for (const auto& node : criterionNodes)
-            fprintf(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
+        fprintf(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
 
-        // determine evaluationNodes from GetEvalCriterionNodes(), ensuring each criterion is only logged once
-        std::vector<ComputationNodeBasePtr> evaluationNodes;
-        {
-            auto originalEvaluationNodes = GetEvalCriterionNodes(net);
-            set<ComputationNodeBasePtr> criteriaLogged; // set to make sure we don't double-log criteria
+    // determine evaluationNodes from GetEvalCriterionNodes(), ensuring each criterion is only logged once
+    std::vector<ComputationNodeBasePtr> evaluationNodes;
+    {
+        auto originalEvaluationNodes = GetEvalCriterionNodes(net);
+        set<ComputationNodeBasePtr> criteriaLogged; // set to make sure we don't double-log criteria
         for (const auto& node : criterionNodes)
-                criteriaLogged.insert(node);
+            criteriaLogged.insert(node);
         for (const auto& node : originalEvaluationNodes)
-                if (criteriaLogged.insert(node).second)
-                    evaluationNodes.push_back(node);
+            if (criteriaLogged.insert(node).second)
+                evaluationNodes.push_back(node);
 
-            if (!evaluationNodes.empty())
-            {
-                fprintf(stderr, "\nEvaluation criterion node(s):\n");
+        if (!evaluationNodes.empty())
+        {
+            fprintf(stderr, "\nEvaluation criterion node(s):\n");
             for (const auto& node : evaluationNodes)
-                    fprintf(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
-            }
+                fprintf(stderr, "\t%ls = %ls\n", node->NodeName().c_str(), node->OperationName().c_str());
         }
+    }
 
-        std::vector<ComputationNodeBasePtr> additionalNodesToEvaluate;
-        auto& outputNodes = net->OutputNodes();
-        additionalNodesToEvaluate.insert(additionalNodesToEvaluate.end(), outputNodes.cbegin(), outputNodes.cend());
-        
-        auto preComputeNodesList = net->GetNodesRequiringPreComputation();
-        additionalNodesToEvaluate.insert(additionalNodesToEvaluate.end(), preComputeNodesList.cbegin(), preComputeNodesList.cend());
+    std::vector<ComputationNodeBasePtr> additionalNodesToEvaluate;
+    auto& outputNodes = net->OutputNodes();
+    additionalNodesToEvaluate.insert(additionalNodesToEvaluate.end(), outputNodes.cbegin(), outputNodes.cend());
 
-        // allocate memory for forward and backward computation
-        net->AllocateAllMatrices(evaluationNodes, additionalNodesToEvaluate, criterionNodes[0]);
+    auto preComputeNodesList = net->GetNodesRequiringPreComputation();
+    additionalNodesToEvaluate.insert(additionalNodesToEvaluate.end(), preComputeNodesList.cbegin(), preComputeNodesList.cend());
 
-        // get feature and label nodes into an array of matrices that will be passed to GetMinibatch()
-        // TODO: instead, remember the nodes directly, to be able to handle both float and double nodes; current version will crash for mixed networks
-        std::map<std::wstring, Matrix<ElemType>*>* inputMatrices = new std::map<std::wstring, Matrix<ElemType>*>();
-        for (size_t pass = 0; pass < 2; pass++)
-        {
+    // allocate memory for forward and backward computation
+    net->AllocateAllMatrices(evaluationNodes, additionalNodesToEvaluate, criterionNodes[0]);
+
+    // get feature and label nodes into an array of matrices that will be passed to GetMinibatch()
+    // TODO: instead, remember the nodes directly, to be able to handle both float and double nodes; current version will crash for mixed networks
+    std::map<std::wstring, Matrix<ElemType>*>* inputMatrices = new std::map<std::wstring, Matrix<ElemType>*>();
+    for (size_t pass = 0; pass < 2; pass++)
+    {
         auto& nodes = (pass == 0) ? featureNodes : labelNodes;
-            for (size_t i = 0; i < nodes.size(); i++)
-            {
+        for (size_t i = 0; i < nodes.size(); i++)
+        {
             auto& node = nodes[i];
             auto* functionValues = &dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Value();
-                assert(functionValues->GetNumCols() == net->GetMBLayoutPtr()->GetNumTimeSteps());
-                (*inputMatrices)[node->NodeName()] = functionValues;
-            }
+            assert(functionValues->GetNumCols() == net->GetMBLayoutPtr()->GetNumTimeSteps());
+            (*inputMatrices)[node->NodeName()] = functionValues;
         }
+    }
 
-        //get hmm file for sequence training
-        bool isSequenceTrainingCriterion = (criterionNodes[0]->OperationName() == L"SequenceWithSoftmax");
-        if (isSequenceTrainingCriterion)
-        {
-            //SequenceWithSoftmaxNode<ElemType>* node = static_cast<SequenceWithSoftmaxNode<ElemType>*>(criterionNodes[0]);
-            auto node = dynamic_pointer_cast<SequenceWithSoftmaxNode<ElemType>>(criterionNodes[0]);
+    //get hmm file for sequence training
+    bool isSequenceTrainingCriterion = (criterionNodes[0]->OperationName() == L"SequenceWithSoftmax");
+    if (isSequenceTrainingCriterion)
+    {
+        //SequenceWithSoftmaxNode<ElemType>* node = static_cast<SequenceWithSoftmaxNode<ElemType>*>(criterionNodes[0]);
+        auto node = dynamic_pointer_cast<SequenceWithSoftmaxNode<ElemType>>(criterionNodes[0]);
         auto hmm = node->gethmm();
-            trainSetDataReader->GetHmmData(hmm);
-        }
+        trainSetDataReader->GetHmmData(hmm);
+    }
 
-        // used for KLD regularized adaptation. For all other adaptation techniques
-        // use MEL to edit the model and using normal training algorithm
-        // TODO: Should this be done in SGD::Adapt()?
-        // TODO: Redo this leveraging that we now have shared_ptrs. It is probably even OK if both networks share feature nodes.
-        // TODO: Then we can also share the MBLayout; which currently is copied by value.
-        std::vector<ComputationNodeBasePtr> refFeatureNodes;
-        if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr)
+    // used for KLD regularized adaptation. For all other adaptation techniques
+    // use MEL to edit the model and using normal training algorithm
+    // TODO: Should this be done in SGD::Adapt()?
+    // TODO: Redo this leveraging that we now have shared_ptrs. It is probably even OK if both networks share feature nodes.
+    // TODO: Then we can also share the MBLayout; which currently is copied by value.
+    std::vector<ComputationNodeBasePtr> refFeatureNodes;
+    if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr)
+    {
+        // replace input nodes in ref network by input nodes of the main network
+        refFeatureNodes.resize(featureNodes.size());
+        for (size_t i = 0; i < featureNodes.size(); i++)
         {
-            // replace input nodes in ref network by input nodes of the main network
-            refFeatureNodes.resize(featureNodes.size());
-            for (size_t i = 0; i < featureNodes.size(); i++)
-            {
-                // we need to keep this info to undo this later
-                // TODO: After the change to shared_ptrs, this may no longer be necessary.
-                refFeatureNodes[i] = refNet->GetNodeFromName(featureNodes[i]->NodeName());
-                refNet->ChangeNode(featureNodes[i]->NodeName(), featureNodes[i]);
-            }
-            refNet->InvalidateCompiledNetwork(); // prepare to re-compile
-            refNet->CompileNetwork();
+            // we need to keep this info to undo this later
+            // TODO: After the change to shared_ptrs, this may no longer be necessary.
+            refFeatureNodes[i] = refNet->GetNodeFromName(featureNodes[i]->NodeName());
+            refNet->ChangeNode(featureNodes[i]->NodeName(), featureNodes[i]);
+        }
+        refNet->InvalidateCompiledNetwork(); // prepare to re-compile
+        refNet->CompileNetwork();
 
-            // allocate memory for forward computation
+        // allocate memory for forward computation
         refNet->AllocateAllMatrices({refNode}, {}, nullptr);
-        }
+    }
 
-        // initializing weights and gradient holder
-        // only one criterion so far TODO: support multiple ones?
+    // initializing weights and gradient holder
+    // only one criterion so far TODO: support multiple ones?
     auto& learnableNodes = net->LearnableParameterNodes(criterionNodes[0]);
-        std::list<Matrix<ElemType>> smoothedGradients;
+    std::list<Matrix<ElemType>> smoothedGradients;
 
-        for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
-        {
-            ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
-            smoothedGradients.push_back(Matrix<ElemType>(node->Value().GetNumRows(),
-                                                         node->Value().GetNumCols(),
-                                                         net->GetDeviceId()));
-        }
+    for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
+    {
+        ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
+        smoothedGradients.push_back(Matrix<ElemType>(node->Value().GetNumRows(),
+                                                     node->Value().GetNumCols(),
+                                                     net->GetDeviceId()));
+    }
 
-        double epochCriterion, avgCriterion, prevCriterion, lrControlCriterion;
-        lrControlCriterion = epochCriterion = avgCriterion = prevCriterion = std::numeric_limits<double>::infinity();
-        size_t epochsNotCountedInAvgCriterion = startEpoch % m_learnRateAdjustInterval;
+    double epochCriterion, avgCriterion, prevCriterion, lrControlCriterion;
+    lrControlCriterion = epochCriterion = avgCriterion = prevCriterion = std::numeric_limits<double>::infinity();
+    size_t epochsNotCountedInAvgCriterion = startEpoch % m_learnRateAdjustInterval;
 
-        std::vector<double> epochEvalErrors(evaluationNodes.size(), std::numeric_limits<double>::infinity());
+    std::vector<double> epochEvalErrors(evaluationNodes.size(), std::numeric_limits<double>::infinity());
 
-        std::vector<wstring> evalNodeNames;
-        for (size_t i = 0; i < evaluationNodes.size(); i++)
-        {
-            evalNodeNames.push_back(evaluationNodes[i]->NodeName());
-        }
+    std::vector<wstring> evalNodeNames;
+    for (size_t i = 0; i < evaluationNodes.size(); i++)
+    {
+        evalNodeNames.push_back(evaluationNodes[i]->NodeName());
+    }
 
-        size_t totalSamplesSeen = 0;
-        double learnRatePerSample = 0.5f / m_mbSize[startEpoch];
+    size_t totalSamplesSeen = 0;
+    double learnRatePerSample = 0.5f / m_mbSize[startEpoch];
 
-        double learningRateAdjustmentFactor = 1.0f;
-        vector<double> prevLearnRates;
-        prevLearnRates.resize(m_numPrevLearnRates);
-        for (int i = 0; i < m_numPrevLearnRates; i++)
-        {
-             prevLearnRates[i] = -1.0;
-        }
+    double learningRateAdjustmentFactor = 1.0f;
+    vector<double> prevLearnRates;
+    prevLearnRates.resize(m_numPrevLearnRates);
+    for (int i = 0; i < m_numPrevLearnRates; i++)
+    {
+        prevLearnRates[i] = -1.0;
+    }
 
-        if (m_parallelizationMethod == ParallelizationMethod::DataParallelSGD)
+    if (m_parallelizationMethod == ParallelizationMethod::DataParallelSGD)
+    {
+        InitDistGradAgg(evaluationNodes.size(), m_traceLevel);
+    }
+    //precompute mean and invStdDev nodes and save initial model
+    if (PreCompute(net, trainSetDataReader, featureNodes, labelNodes, inputMatrices) || startEpoch == 0)
+    {
+        // Synchronize all ranks before writing the model to ensure that
+        // everyone is done loading the model
+        if (g_mpi != nullptr)
         {
-            InitDistGradAgg(evaluationNodes.size(), m_traceLevel);
+            g_mpi->WaitAll();
         }
-        //precompute mean and invStdDev nodes and save initial model
-        if (PreCompute(net, trainSetDataReader, featureNodes, labelNodes, inputMatrices) || startEpoch == 0)
-        {
-            // Synchronize all ranks before writing the model to ensure that 
-            // everyone is done loading the model
-            if (g_mpi != nullptr)
-            {
-                g_mpi->WaitAll();
-            }
 
-            net->Save(GetModelNameForEpoch(int(startEpoch) - 1));
-        }
+        net->Save(GetModelNameForEpoch(int(startEpoch) - 1));
+    }
 
-        bool learnRateInitialized = false;
-        if (startEpoch > 0)
-        {
-            learnRateInitialized = LoadCheckPointInfo(startEpoch - 1,
-                                                      /*out*/ totalSamplesSeen,
-                                                      /*out*/ learnRatePerSample,
-                                                      smoothedGradients,
-                                                      /*out*/ prevCriterion,
-                                                      /*out*/ m_prevChosenMinibatchSize);
-            if (learnRateInitialized)
-                prevLearnRates[startEpoch % m_numPrevLearnRates] = learnRatePerSample;
-        }
+    bool learnRateInitialized = false;
+    if (startEpoch > 0)
+    {
+        learnRateInitialized = LoadCheckPointInfo(startEpoch - 1,
+                                                  /*out*/ totalSamplesSeen,
+                                                  /*out*/ learnRatePerSample,
+                                                  smoothedGradients,
+                                                  /*out*/ prevCriterion,
+                                                  /*out*/ m_prevChosenMinibatchSize);
+        if (learnRateInitialized)
+            prevLearnRates[startEpoch % m_numPrevLearnRates] = learnRatePerSample;
+    }
 
-        if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch &&
-            !learnRateInitialized && m_learningRatesParam.size() <= startEpoch)
-        {
-            InvalidArgument(
-                "When using \"AdjustAfterEpoch\", there must either exist a checkpoint file, "
-                "or an explicit learning rate must be specified in config for the starting epoch.");
-        }
+    if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch &&
+        !learnRateInitialized && m_learningRatesParam.size() <= startEpoch)
+    {
+        InvalidArgument(
+            "When using \"AdjustAfterEpoch\", there must either exist a checkpoint file, "
+            "or an explicit learning rate must be specified in config for the starting epoch.");
+    }
 
-        unsigned long dropOutSeed = 1;
-        double prevDropoutRate = 0;
+    unsigned long dropOutSeed = 1;
+    double prevDropoutRate = 0;
 
-        bool learnRateReduced = false;
+    bool learnRateReduced = false;
 
-        // pass user config on memory allocation for convolution operations to the Network
-        ComputationNetwork::SetMaxTempMemSizeForCNN(net, criterionNodes[0], m_maxTempMemSizeInSamplesForCNN);
-        if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode)
-        {
-            ComputationNetwork::SetMaxTempMemSizeForCNN(refNet, refNode, m_maxTempMemSizeInSamplesForCNN);
-        }
+    // pass user config on memory allocation for convolution operations to the Network
+    ComputationNetwork::SetMaxTempMemSizeForCNN(net, criterionNodes[0], m_maxTempMemSizeInSamplesForCNN);
+    if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode)
+    {
+        ComputationNetwork::SetMaxTempMemSizeForCNN(refNet, refNode, m_maxTempMemSizeInSamplesForCNN);
+    }
 
-        // likewise for sequence training parameters
-        if (isSequenceTrainingCriterion)
-        {
-            ComputationNetwork::SetSeqParam<ElemType>(net, criterionNodes[0], m_hSmoothingWeight, m_frameDropThresh, m_doReferenceAlign, 
+    // likewise for sequence training parameters
+    if (isSequenceTrainingCriterion)
+    {
+        ComputationNetwork::SetSeqParam<ElemType>(net, criterionNodes[0], m_hSmoothingWeight, m_frameDropThresh, m_doReferenceAlign,
                                                   m_seqGammarCalcAMF, m_seqGammarCalcLMF, m_seqGammarCalcWP, m_seqGammarCalcbMMIFactor, m_seqGammarCalcUsesMBR);
-        }
+    }
 
-        // --- MAIN EPOCH LOOP
+    // --- MAIN EPOCH LOOP
     for (int i = startEpoch; i < (int) m_maxEpochs; i++) // TODO: why is this an int, and not a size_t?
+    {
+        // Synchronize all ranks before proceeding to ensure that
+        // rank 0 has finished writing the previous model file
+        if (g_mpi != nullptr)
         {
-            // Synchronize all ranks before proceeding to ensure that 
-            // rank 0 has finished writing the previous model file
-            if (g_mpi != nullptr)
-            {
-                g_mpi->WaitAll();
-            }
+            g_mpi->WaitAll();
+        }
 
-            Timer timer;
-            timer.Start();
+        Timer timer;
+        timer.Start();
 
-            // set dropout rate for this epoch
-            ComputationNetwork::SetDropoutRate<ElemType>(net, criterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
+        // set dropout rate for this epoch
+        ComputationNetwork::SetDropoutRate<ElemType>(net, criterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
 
-            // learning rate adjustment
-            if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || i < m_learningRatesParam.size())
-            {
-                // BUGBUG: GetNumParallelSequences() returns 1 under certain situations; it seems when restarting from checkpoint
+        // learning rate adjustment
+        if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || i < m_learningRatesParam.size())
+        {
+            // BUGBUG: GetNumParallelSequences() returns 1 under certain situations; it seems when restarting from checkpoint
             learnRatePerSample = GetLearningRatePerSample(i /*BUGBUG workaround:*/, trainSetDataReader->GetNumParallelSequences());
-            }
-            else if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
+        }
+        else if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
+        {
+            double largestPrevLearnRatePerSample = prevLearnRates[0];
+            for (int j = 1; j < m_numPrevLearnRates; j++)
             {
-                double largestPrevLearnRatePerSample = prevLearnRates[0];
-                for (int j = 1; j < m_numPrevLearnRates; j++)
-                {
-                    largestPrevLearnRatePerSample = max(largestPrevLearnRatePerSample, prevLearnRates[j]);
-                }
-
-                // return a reasonable learning rate based on the initial minibatchSize
-                double newLearningRatePerSample = SearchForBestLearnRate(net, refNet, refNode, i, learnRatePerSample,
-                                                                           trainSetDataReader, featureNodes, labelNodes,
-                                                                           criterionNodes, evaluationNodes, inputMatrices,
-                                                                           learnableNodes, smoothedGradients,
-                                                                           learnRateInitialized, largestPrevLearnRatePerSample);
-                learningRateAdjustmentFactor = newLearningRatePerSample / learnRatePerSample;
-                learnRatePerSample = newLearningRatePerSample;
-
-                // save per sample learn rate to support changeable minibatchSize
-                prevLearnRates[i % m_numPrevLearnRates] = learnRatePerSample;
+                largestPrevLearnRatePerSample = max(largestPrevLearnRatePerSample, prevLearnRates[j]);
             }
 
-            learnRateInitialized = true;
+            // return a reasonable learning rate based on the initial minibatchSize
+            double newLearningRatePerSample = SearchForBestLearnRate(net, refNet, refNode, i, learnRatePerSample,
+                                                                     trainSetDataReader, featureNodes, labelNodes,
+                                                                     criterionNodes, evaluationNodes, inputMatrices,
+                                                                     learnableNodes, smoothedGradients,
+                                                                     learnRateInitialized, largestPrevLearnRatePerSample);
+            learningRateAdjustmentFactor = newLearningRatePerSample / learnRatePerSample;
+            learnRatePerSample = newLearningRatePerSample;
 
-            if (learnRatePerSample < m_minLearnRate)
-            {
-                fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training complete.\n",
-                        i + 1, learnRatePerSample, m_minLearnRate);
-                if (m_autoLearnRateSearchType != LearningRateSearchAlgorithm::None)
-                {
-                    net->Save(m_modelPath);
-                }
-                break;
-            }
+            // save per sample learn rate to support changeable minibatchSize
+            prevLearnRates[i % m_numPrevLearnRates] = learnRatePerSample;
+        }
 
-            size_t chosenMinibatchSize;
-            size_t actualMinibatchSize;
+        learnRateInitialized = true;
 
-            // Through the command line or config file the user can set minibatch sizes on a per epoch
-            // basis for a set number of epochs.  For epochs after that point, m_mbSize.size(), either
-            // we just keep using
-            // the last minibatch size, or we use tuning to try and find a better one.
-            if (m_autoAdjustMinibatch && i >= m_mbSize.size())
+        if (learnRatePerSample < m_minLearnRate)
+        {
+            fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training complete.\n",
+                    i + 1, learnRatePerSample, m_minLearnRate);
+            if (m_autoLearnRateSearchType != LearningRateSearchAlgorithm::None)
             {
-                size_t numFramesToUseInSearch = m_numMiniBatch4LRSearch[i] * m_mbSize[i];
-                if (m_epochSize != requestDataSize)
-                {
-                    // ensure the numFramesToUseInSearch does not exceed the total number of frames in the epoch
-                    numFramesToUseInSearch = min(numFramesToUseInSearch, m_epochSize);
-                }
-
-                // Use tuning to try and find a better minibatch size
-                chosenMinibatchSize = AdaptiveMinibatchSizing(net, refNet, refNode, i,
-                                                              numFramesToUseInSearch,
-                                                              trainSetDataReader, learnRatePerSample,
-                                                              m_mbSize[i], featureNodes, labelNodes,
-                                                              criterionNodes, evaluationNodes,
-                                                              inputMatrices, learnableNodes,
-                                                              smoothedGradients, learningRateAdjustmentFactor);
-                m_prevChosenMinibatchSize = chosenMinibatchSize;
+                net->Save(m_modelPath);
             }
-            else
+            break;
+        }
+
+        size_t chosenMinibatchSize;
+        size_t actualMinibatchSize;
+
+        // Through the command line or config file the user can set minibatch sizes on a per epoch
+        // basis for a set number of epochs.  For epochs after that point, m_mbSize.size(), either
+        // we just keep using
+        // the last minibatch size, or we use tuning to try and find a better one.
+        if (m_autoAdjustMinibatch && i >= m_mbSize.size())
+        {
+            size_t numFramesToUseInSearch = m_numMiniBatch4LRSearch[i] * m_mbSize[i];
+            if (m_epochSize != requestDataSize)
             {
-                // use the explicitly set minibatch size
-                chosenMinibatchSize = m_mbSize[i];
+                // ensure the numFramesToUseInSearch does not exceed the total number of frames in the epoch
+                numFramesToUseInSearch = min(numFramesToUseInSearch, m_epochSize);
             }
 
+            // Use tuning to try and find a better minibatch size
+            chosenMinibatchSize = AdaptiveMinibatchSizing(net, refNet, refNode, i,
+                                                          numFramesToUseInSearch,
+                                                          trainSetDataReader, learnRatePerSample,
+                                                          m_mbSize[i], featureNodes, labelNodes,
+                                                          criterionNodes, evaluationNodes,
+                                                          inputMatrices, learnableNodes,
+                                                          smoothedGradients, learningRateAdjustmentFactor);
+            m_prevChosenMinibatchSize = chosenMinibatchSize;
+        }
+        else
+        {
+            // use the explicitly set minibatch size
+            chosenMinibatchSize = m_mbSize[i];
+        }
+
         actualMinibatchSize = FixUpEffectiveMBSize(chosenMinibatchSize /*BUGBUG workaround:*/, trainSetDataReader->GetNumParallelSequences());
 
         double momentumPerSample = GetMomentumPerSample(i /*BUGBUG workaround:*/, trainSetDataReader->GetNumParallelSequences());
-            // time constant = number of samples after which a contribution has been reduced to e^-1
-            double momentumAsTimeConstant = momentumPerSample == 0.0 ? 0.0
-                                          : momentumPerSample >= 1.0 ? 0.0
-                                          : -1.0 / log(momentumPerSample);
-            fprintf(stderr, "Starting Epoch %d: learning rate per sample = %f  effective momentum = %f  momentum as time constant = %.1f samples\n",
-                    i + 1, learnRatePerSample, MomentumPerMB(momentumPerSample, actualMinibatchSize), momentumAsTimeConstant);
-
-            TrainOneEpoch(net,
-                          refNet, 
-                          refNode, 
-                          i, 
-                          m_epochSize,
-                          trainSetDataReader, 
-                          learnRatePerSample, 
-                          chosenMinibatchSize, 
-                          featureNodes,
-                          labelNodes, 
-                          criterionNodes, 
-                          evaluationNodes,
-                          inputMatrices, 
-                          learnableNodes, smoothedGradients,
-                          epochCriterion, epochEvalErrors, totalSamplesSeen);
-
-            timer.Stop();
-            double epochTime = timer.ElapsedSeconds();
-
-            if (m_useEvalCriterionControlLR && epochEvalErrors.size() > 0)
-            {
-                lrControlCriterion = epochEvalErrors[0];
-            }
-            else
-            {
-                lrControlCriterion = epochCriterion;
-            }
+        // time constant = number of samples after which a contribution has been reduced to e^-1
+        double momentumAsTimeConstant = momentumPerSample == 0.0 ? 0.0
+                                                                 : momentumPerSample >= 1.0 ? 0.0
+                                                                                            : -1.0 / log(momentumPerSample);
+        fprintf(stderr, "Starting Epoch %d: learning rate per sample = %f  effective momentum = %f  momentum as time constant = %.1f samples\n",
+                i + 1, learnRatePerSample, MomentumPerMB(momentumPerSample, actualMinibatchSize), momentumAsTimeConstant);
+
+        TrainOneEpoch(net,
+                      refNet,
+                      refNode,
+                      i,
+                      m_epochSize,
+                      trainSetDataReader,
+                      learnRatePerSample,
+                      chosenMinibatchSize,
+                      featureNodes,
+                      labelNodes,
+                      criterionNodes,
+                      evaluationNodes,
+                      inputMatrices,
+                      learnableNodes, smoothedGradients,
+                      epochCriterion, epochEvalErrors, totalSamplesSeen);
+
+        timer.Stop();
+        double epochTime = timer.ElapsedSeconds();
+
+        if (m_useEvalCriterionControlLR && epochEvalErrors.size() > 0)
+        {
+            lrControlCriterion = epochEvalErrors[0];
+        }
+        else
+        {
+            lrControlCriterion = epochCriterion;
+        }
 
-            fprintf(stderr,
-                    "Finished Epoch[%2d of %d]: [Training Set] TrainLossPerSample = %.8g; ",
-                    i + 1, (int) m_maxEpochs, epochCriterion);
-            m_lastFinishedEpochTrainLoss = epochCriterion;
+        fprintf(stderr,
+                "Finished Epoch[%2d of %d]: [Training Set] TrainLossPerSample = %.8g; ",
+                i + 1, (int) m_maxEpochs, epochCriterion);
+        m_lastFinishedEpochTrainLoss = epochCriterion;
         if (epochEvalErrors.size() == 0) // no eval criterion, only train criterion itself
+        {
+            fprintf(stderr,
+                    "AvgLearningRatePerSample = %.8g; EpochTime=%.6g\n",
+                    learnRatePerSample, epochTime);
+        }
+        else if (epochEvalErrors.size() == 1)
+        {
+            fprintf(stderr,
+                    "EvalErrPerSample = %.8g; AvgLearningRatePerSample = %.8g; EpochTime=%.6g\n",
+                    epochEvalErrors[0], learnRatePerSample, epochTime);
+        }
+        else
+        {
+            fprintf(stderr, "EvalErrPerSample ");
+            for (size_t j = 0; j < epochEvalErrors.size(); j++)
             {
-                fprintf(stderr,
-                        "AvgLearningRatePerSample = %.8g; EpochTime=%.6g\n",
-                        learnRatePerSample, epochTime);
+                fprintf(stderr, "[%lu]=%.8g; ", j, epochEvalErrors[j]);
             }
-            else if (epochEvalErrors.size() == 1)
+
+            fprintf(stderr, "AvgLearningRatePerSample = %.8g; EpochTime=%.6g\n",
+                    learnRatePerSample, epochTime);
+
+            // TODO: why these extra log messages here and not for 1 eval criterion?
+            fprintf(stderr, "Finished Epoch[%2d of %d]: Criterion Node [%ls] Per Sample = %.8g\n",
+                    i + 1, (int) m_maxEpochs, criterionNodes[0]->NodeName().c_str(), epochCriterion);
+
+            for (size_t j = 0; j < epochEvalErrors.size(); j++)
             {
-                fprintf(stderr,
-                        "EvalErrPerSample = %.8g; AvgLearningRatePerSample = %.8g; EpochTime=%.6g\n",
-                        epochEvalErrors[0], learnRatePerSample, epochTime);
+                fprintf(stderr, "Finished Epoch[%2d of %d]: Evaluation Node [%ls] Per Sample = %.8g\n",
+                        i + 1, (int) m_maxEpochs, evalNodeNames[j].c_str(), epochEvalErrors[j]);
             }
-            else
+        }
+
+        if ((g_mpi == nullptr) || g_mpi->IsMainNode())
+        {
+            if (validationSetDataReader != trainSetDataReader && validationSetDataReader != nullptr)
             {
-                fprintf(stderr, "EvalErrPerSample ");
-                for (size_t j = 0; j < epochEvalErrors.size(); j++)
+                SimpleEvaluator<ElemType> evalforvalidation(net);
+                vector<wstring> cvSetTrainAndEvalNodes;
+                if (criterionNodes.size() > 0)
                 {
-                    fprintf(stderr, "[%lu]=%.8g; ", j, epochEvalErrors[j]);
+                    cvSetTrainAndEvalNodes.push_back(criterionNodes[0]->NodeName());
+                }
+                if (evaluationNodes.size() > 0)
+                {
+                    cvSetTrainAndEvalNodes.push_back(evaluationNodes[0]->NodeName());
                 }
 
-                fprintf(stderr, "AvgLearningRatePerSample = %.8g; EpochTime=%.6g\n",
-                        learnRatePerSample, epochTime);
-
-                // TODO: why these extra log messages here and not for 1 eval criterion?
-                fprintf(stderr, "Finished Epoch[%2d of %d]: Criterion Node [%ls] Per Sample = %.8g\n",
-                                i + 1, (int) m_maxEpochs, criterionNodes[0]->NodeName().c_str(), epochCriterion);
-
-                for (size_t j = 0; j < epochEvalErrors.size(); j++)
+                vector<double> vScore = evalforvalidation.Evaluate(validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i]);
+                fprintf(stderr, "Finished Epoch[%2d of %d]: [Validation Set] TrainLossPerSample = %.8g", i + 1, (int) m_maxEpochs, vScore[0]);
+                if (vScore.size() > 1)
                 {
-                    fprintf(stderr, "Finished Epoch[%2d of %d]: Evaluation Node [%ls] Per Sample = %.8g\n",
-                            i + 1, (int) m_maxEpochs, evalNodeNames[j].c_str(), epochEvalErrors[j]);
+                    fprintf(stderr, "; EvalErrPerSample = %.8g", vScore[1]);
                 }
-            }
+                fprintf(stderr, "\n");
 
-            if ((g_mpi == nullptr) || g_mpi->IsMainNode())
-            {
-                if (validationSetDataReader != trainSetDataReader && validationSetDataReader != nullptr)
+                if (m_useCVSetControlLRIfCVExists)
                 {
-                    SimpleEvaluator<ElemType> evalforvalidation(net);
-                    vector<wstring> cvSetTrainAndEvalNodes;
-                    if (criterionNodes.size() > 0)
+                    if (m_useEvalCriterionControlLR && vScore.size() > 1)
                     {
-                        cvSetTrainAndEvalNodes.push_back(criterionNodes[0]->NodeName());
+                        lrControlCriterion = vScore[1];
                     }
-                    if (evaluationNodes.size() > 0)
-                    {
-                        cvSetTrainAndEvalNodes.push_back(evaluationNodes[0]->NodeName());
-                    }
-
-                    vector<double> vScore = evalforvalidation.Evaluate(validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i]);
-                    fprintf(stderr, "Finished Epoch[%2d of %d]: [Validation Set] TrainLossPerSample = %.8g", i + 1, (int) m_maxEpochs, vScore[0]);
-                    if (vScore.size() > 1)
+                    else
                     {
-                        fprintf(stderr, "; EvalErrPerSample = %.8g", vScore[1]);
-                    }
-                    fprintf(stderr, "\n");
-
-                    if (m_useCVSetControlLRIfCVExists)
-                    {
-                        if (m_useEvalCriterionControlLR && vScore.size() > 1)
-                        {
-                            lrControlCriterion = vScore[1];
-                        }
-                        else
-                        {
-                            lrControlCriterion = vScore[0]; // the first one is the training criterion
-                        }
+                        lrControlCriterion = vScore[0]; // the first one is the training criterion
                     }
                 }
             }
+        }
 
-            // broadcast epochCriterion to make sure each processor will have the same learning rate schedule
-            if ((m_parallelizationMethod == ParallelizationMethod::ModelAveragingSGD) && (g_mpi->NumNodesInUse() > 1))
-            {
-                g_mpi->Bcast(&epochCriterion, 1, g_mpi->MainNodeRank());
-                g_mpi->Bcast(&lrControlCriterion, 1, g_mpi->MainNodeRank());
-            }
+        // broadcast epochCriterion to make sure each processor will have the same learning rate schedule
+        if ((m_parallelizationMethod == ParallelizationMethod::ModelAveragingSGD) && (g_mpi->NumNodesInUse() > 1))
+        {
+            g_mpi->Bcast(&epochCriterion, 1, g_mpi->MainNodeRank());
+            g_mpi->Bcast(&lrControlCriterion, 1, g_mpi->MainNodeRank());
+        }
 
-            bool loadedPrevModel = false;
-            size_t epochsSinceLastLearnRateAdjust = i % m_learnRateAdjustInterval + 1;
-            if (avgCriterion == std::numeric_limits<double>::infinity())
-            {
-                avgCriterion = lrControlCriterion;
-            }
-            else
-            {
-                avgCriterion = ((epochsSinceLastLearnRateAdjust - 1 - epochsNotCountedInAvgCriterion) *
+        bool loadedPrevModel = false;
+        size_t epochsSinceLastLearnRateAdjust = i % m_learnRateAdjustInterval + 1;
+        if (avgCriterion == std::numeric_limits<double>::infinity())
+        {
+            avgCriterion = lrControlCriterion;
+        }
+        else
+        {
+            avgCriterion = ((epochsSinceLastLearnRateAdjust - 1 - epochsNotCountedInAvgCriterion) *
                                 avgCriterion +
                             lrControlCriterion) /
-                                (epochsSinceLastLearnRateAdjust - epochsNotCountedInAvgCriterion);
-            }
+                           (epochsSinceLastLearnRateAdjust - epochsNotCountedInAvgCriterion);
+        }
 
-            if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch &&
-                m_learningRatesParam.size() <= i && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval)
+        if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch &&
+            m_learningRatesParam.size() <= i && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval)
+        {
+            if (std::isnan(avgCriterion) || (prevCriterion - avgCriterion < 0 && prevCriterion != std::numeric_limits<double>::infinity()))
             {
-                if (std::isnan(avgCriterion) || (prevCriterion - avgCriterion < 0 && prevCriterion != std::numeric_limits<double>::infinity()))
+                if (m_loadBestModel)
                 {
-                    if (m_loadBestModel)
-                    {
-                        auto bestModelPath = GetModelNameForEpoch(i - m_learnRateAdjustInterval);
-                        fprintf(stderr, "Loading previous model with best training-criterion value: %ls.\n", bestModelPath.c_str());
-                        net->RereadPersistableParameters<ElemType>(bestModelPath);
-                        LoadCheckPointInfo(i - m_learnRateAdjustInterval,
-                                           /*out*/ totalSamplesSeen,
-                                           /*out*/ learnRatePerSample,
-                                           smoothedGradients,
-                                           /*out*/ prevCriterion,
-                                           /*out*/ m_prevChosenMinibatchSize);
-                        loadedPrevModel = true;
-                    }
+                    auto bestModelPath = GetModelNameForEpoch(i - m_learnRateAdjustInterval);
+                    fprintf(stderr, "Loading previous model with best training-criterion value: %ls.\n", bestModelPath.c_str());
+                    net->RereadPersistableParameters<ElemType>(bestModelPath);
+                    LoadCheckPointInfo(i - m_learnRateAdjustInterval,
+                                       /*out*/ totalSamplesSeen,
+                                       /*out*/ learnRatePerSample,
+                                       smoothedGradients,
+                                       /*out*/ prevCriterion,
+                                       /*out*/ m_prevChosenMinibatchSize);
+                    loadedPrevModel = true;
                 }
+            }
 
-                if (m_continueReduce)
+            if (m_continueReduce)
+            {
+                if (std::isnan(avgCriterion) ||
+                    (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion &&
+                     prevCriterion != std::numeric_limits<double>::infinity()))
                 {
-                    if (std::isnan(avgCriterion) || 
-                        (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion &&
-                        prevCriterion != std::numeric_limits<double>::infinity()))
+                    if (learnRateReduced == false)
                     {
-                        if (learnRateReduced == false)
-                        {
-                            learnRateReduced = true;
-                        }
-                        else
-                        {
-                            net->Save(GetModelNameForEpoch(i, true));
-
-                            fprintf(stderr, "Finished training and saved final model\n\n");
-                            break;
-                        }
+                        learnRateReduced = true;
                     }
-
-                    if (learnRateReduced)
+                    else
                     {
-                        learnRatePerSample *= m_learnRateDecreaseFactor;
-                        fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
+                        net->Save(GetModelNameForEpoch(i, true));
+
+                        fprintf(stderr, "Finished training and saved final model\n\n");
+                        break;
                     }
                 }
-                else
-                {
-                    if (std::isnan(avgCriterion) || 
-                        (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion &&
-                        prevCriterion != std::numeric_limits<double>::infinity()))
-                    {
 
-                        learnRatePerSample *= m_learnRateDecreaseFactor;
-                        fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
-                    }
-                    else if (prevCriterion - avgCriterion > m_increaseLearnRateIfImproveMoreThan * prevCriterion &&
-                             prevCriterion != std::numeric_limits<double>::infinity())
-                    {
-                        learnRatePerSample *= m_learnRateIncreaseFactor;
-                        fprintf(stderr, "learnRatePerSample increased to %.8g\n", learnRatePerSample);
-                    }
+                if (learnRateReduced)
+                {
+                    learnRatePerSample *= m_learnRateDecreaseFactor;
+                    fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
                 }
             }
             else
             {
-                if (std::isnan(avgCriterion))
-                    RuntimeError("The training criterion is not a number (NAN). Stop\n");
-                }
+                if (std::isnan(avgCriterion) ||
+                    (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion &&
+                     prevCriterion != std::numeric_limits<double>::infinity()))
+                {
 
-            // not loading previous values then set them
-            if (!loadedPrevModel && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval)
-            {
-                prevCriterion = avgCriterion;
-                epochsNotCountedInAvgCriterion = 0;
+                    learnRatePerSample *= m_learnRateDecreaseFactor;
+                    fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
+                }
+                else if (prevCriterion - avgCriterion > m_increaseLearnRateIfImproveMoreThan * prevCriterion &&
+                         prevCriterion != std::numeric_limits<double>::infinity())
+                {
+                    learnRatePerSample *= m_learnRateIncreaseFactor;
+                    fprintf(stderr, "learnRatePerSample increased to %.8g\n", learnRatePerSample);
+                }
             }
+        }
+        else
+        {
+            if (std::isnan(avgCriterion))
+                RuntimeError("The training criterion is not a number (NAN). Stop\n");
+        }
 
-            // Synchronize all ranks before proceeding to ensure that 
-            // nobody tries reading the checkpoint file at the same time
-            // as rank 0 deleting it below
-            if (g_mpi != nullptr)
-            {
-                g_mpi->WaitAll();
-            }
+        // not loading previous values then set them
+        if (!loadedPrevModel && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval)
+        {
+            prevCriterion = avgCriterion;
+            epochsNotCountedInAvgCriterion = 0;
+        }
+
+        // Synchronize all ranks before proceeding to ensure that
+        // nobody tries reading the checkpoint file at the same time
+        // as rank 0 deleting it below
+        if (g_mpi != nullptr)
+        {
+            g_mpi->WaitAll();
+        }
 
-            // persist model and check-point info
-            if ((g_mpi == nullptr) || g_mpi->IsMainNode())
+        // persist model and check-point info
+        if ((g_mpi == nullptr) || g_mpi->IsMainNode())
+        {
+            net->Save(GetModelNameForEpoch(i));
+            SaveCheckPointInfo(i, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion, chosenMinibatchSize);
+            if (!m_keepCheckPointFiles)
             {
-                net->Save(GetModelNameForEpoch(i));
-                SaveCheckPointInfo(i, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion, chosenMinibatchSize);
-                if (!m_keepCheckPointFiles)
+                // delete previous checkpoint file to save space
+                if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && m_loadBestModel)
                 {
-                    // delete previous checkpoint file to save space
-                    if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && m_loadBestModel)
+                    if (epochsSinceLastLearnRateAdjust != 1)
                     {
-                        if (epochsSinceLastLearnRateAdjust != 1)
-                        {
-                            _wunlink(GetCheckPointFileNameForEpoch(i - 1).c_str());
-                        }
-                        if (epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval)
-                        {
-                            _wunlink(GetCheckPointFileNameForEpoch(i - m_learnRateAdjustInterval).c_str());
-                        }
+                        _wunlink(GetCheckPointFileNameForEpoch(i - 1).c_str());
                     }
-                    else
+                    if (epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval)
                     {
-                        _wunlink(GetCheckPointFileNameForEpoch(i - 1).c_str());
+                        _wunlink(GetCheckPointFileNameForEpoch(i - m_learnRateAdjustInterval).c_str());
                     }
                 }
-            }
-
-            if (learnRatePerSample < 1e-12)
-            {
-                fprintf(stderr, "learnRate per sample is reduced to %.8g which is below 1e-12. stop training.\n",
-                        learnRatePerSample);
+                else
+                {
+                    _wunlink(GetCheckPointFileNameForEpoch(i - 1).c_str());
+                }
             }
         }
-        // --- END OF MAIN EPOCH LOOP
 
-        // Synchronize all ranks before proceeding to ensure that 
-        // rank 0 has finished writing the model file
-        if (g_mpi != nullptr)
+        if (learnRatePerSample < 1e-12)
         {
-            g_mpi->WaitAll();
+            fprintf(stderr, "learnRate per sample is reduced to %.8g which is below 1e-12. stop training.\n",
+                    learnRatePerSample);
         }
+    }
+    // --- END OF MAIN EPOCH LOOP
+
+    // Synchronize all ranks before proceeding to ensure that
+    // rank 0 has finished writing the model file
+    if (g_mpi != nullptr)
+    {
+        g_mpi->WaitAll();
+    }
 
-        // progress tracing for compute cluster management
-        ProgressTracing::TraceProgressPercentage(m_maxEpochs, 0.0, true);
-        ProgressTracing::TraceTrainLoss(m_lastFinishedEpochTrainLoss);
+    // progress tracing for compute cluster management
+    ProgressTracing::TraceProgressPercentage(m_maxEpochs, 0.0, true);
+    ProgressTracing::TraceTrainLoss(m_lastFinishedEpochTrainLoss);
 
-        // since we linked feature nodes. we need to remove it from the deletion
-        if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr)
+    // since we linked feature nodes. we need to remove it from the deletion
+    if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr)
+    {
+        for (size_t i = 0; i < refFeatureNodes.size(); i++)
         {
-            for (size_t i = 0; i < refFeatureNodes.size(); i++)
-            {
-                // note we need to handle deletion carefully
-                refNet->ChangeNode(refFeatureNodes[i]->NodeName(), refFeatureNodes[i]);
-            }
+            // note we need to handle deletion carefully
+            refNet->ChangeNode(refFeatureNodes[i]->NodeName(), refFeatureNodes[i]);
         }
-
-        delete inputMatrices;
     }
 
-    // -----------------------------------------------------------------------
-    // TrainOneEpoch() -- train one epoch
-    // -----------------------------------------------------------------------
+    delete inputMatrices;
+}
+
+// -----------------------------------------------------------------------
+// TrainOneEpoch() -- train one epoch
+// -----------------------------------------------------------------------
 
-    static string GeneratePaddedFloatOrExpFormat(int padSize, int precision, double value);
+static string GeneratePaddedFloatOrExpFormat(int padSize, int precision, double value);
 
 template <class ElemType>
-    size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
-                                        ComputationNetworkPtr refNet,
-                                        const ComputationNodeBasePtr& refNode,
-                                        const int epochNumber,
-                                        const size_t epochSize,
-                                        IDataReader<ElemType>* trainSetDataReader,
-                                        const double learnRatePerSample,
-                                        size_t tunedMBSize,
+size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
+                                    ComputationNetworkPtr refNet,
+                                    const ComputationNodeBasePtr& refNode,
+                                    const int epochNumber,
+                                    const size_t epochSize,
+                                    IDataReader<ElemType>* trainSetDataReader,
+                                    const double learnRatePerSample,
+                                    size_t tunedMBSize,
                                     const std::vector<ComputationNodeBasePtr>& featureNodes,
                                     const std::vector<ComputationNodeBasePtr>& labelNodes,
                                     const std::vector<ComputationNodeBasePtr>& criterionNodes,
                                     const std::vector<ComputationNodeBasePtr>& evaluationNodes,
                                     std::map<std::wstring, Matrix<ElemType>*>* inputMatrices, // TODO: why is this a pointer?
                                     const std::list<ComputationNodeBasePtr>& learnableNodes,
-                                        std::list<Matrix<ElemType>>& smoothedGradients,
-                                        /*out*/ double& epochCriterion,
-                                        /*out*/ std::vector<double>& epochEvalErrors,
-                                        /*out*/ size_t& totalSamplesSeen,
-                                        std::string prefixMsg)
-    {
+                                    std::list<Matrix<ElemType>>& smoothedGradients,
+                                    /*out*/ double& epochCriterion,
+                                    /*out*/ std::vector<double>& epochEvalErrors,
+                                    /*out*/ size_t& totalSamplesSeen,
+                                    std::string prefixMsg)
+{
     double totalTimeInMBs = 0; // use double since timer has sub-microsecond time resolution
-        double epochCriterionLastMBs = 0;
+    double epochCriterionLastMBs = 0;
 
-        int numSamplesLastMBs = 0;
-        std::vector<double> epochEvalErrorsLastMBs(epochEvalErrors.size(), 0);
+    int numSamplesLastMBs = 0;
+    std::vector<double> epochEvalErrorsLastMBs(epochEvalErrors.size(), 0);
 
-        // initialize statistics
-        size_t totalEpochSamples = 0;
+    // initialize statistics
+    size_t totalEpochSamples = 0;
 
-        int numMBsRun = 0;
+    int numMBsRun = 0;
 
-        // NOTE: the following two local matrices are not used in distGradAgg path
-        // assume only one training criterion node for each epoch.
-        // The criterion values are accumulated here over the minibatches (without having to pull them off the GPU).
-        Matrix<ElemType> localEpochCriterion(1, 1, net->GetDeviceId());
-        Matrix<ElemType> localEpochEvalErrors(1, epochEvalErrors.size(), net->GetDeviceId());
+    // NOTE: the following two local matrices are not used in distGradAgg path
+    // assume only one training criterion node for each epoch.
+    // The criterion values are accumulated here over the minibatches (without having to pull them off the GPU).
+    Matrix<ElemType> localEpochCriterion(1, 1, net->GetDeviceId());
+    Matrix<ElemType> localEpochEvalErrors(1, epochEvalErrors.size(), net->GetDeviceId());
 
-        localEpochCriterion.SetValue(0);
-        localEpochEvalErrors.SetValue(0);
+    localEpochCriterion.SetValue(0);
+    localEpochEvalErrors.SetValue(0);
 
-        bool useGradientAggregation = ((m_parallelizationMethod == ParallelizationMethod::DataParallelSGD) &&
-                                       (epochNumber >= m_parallelizationStartEpochNum));
-        bool useModelAveraging = ((m_parallelizationMethod == ParallelizationMethod::ModelAveragingSGD) &&
-                                  (epochNumber >= m_parallelizationStartEpochNum));
-        bool useParallelTrain = useGradientAggregation || useModelAveraging; 
+    bool useGradientAggregation = ((m_parallelizationMethod == ParallelizationMethod::DataParallelSGD) &&
+                                   (epochNumber >= m_parallelizationStartEpochNum));
+    bool useModelAveraging = ((m_parallelizationMethod == ParallelizationMethod::ModelAveragingSGD) &&
+                              (epochNumber >= m_parallelizationStartEpochNum));
+    bool useParallelTrain = useGradientAggregation || useModelAveraging;
 
-        // MA-related variables
-        size_t nSamplesSinceLastModelSync = 0;
-        size_t nSynced = 0; 
+    // MA-related variables
+    size_t nSamplesSinceLastModelSync = 0;
+    size_t nSynced = 0;
     float nSecondsOnMASync = 0;
     float nSecondsSinceLastMAPerfReport = 0;
 
-        std::vector<Matrix<ElemType>*> learnParamsGradients;
-        if (useGradientAggregation)
-        {
-            epochCriterion = double(0.0);
-            epochEvalErrors.assign(epochEvalErrors.size(), double(0.0));
-        }
+    std::vector<Matrix<ElemType>*> learnParamsGradients;
+    if (useGradientAggregation)
+    {
+        epochCriterion = double(0.0);
+        epochEvalErrors.assign(epochEvalErrors.size(), double(0.0));
+    }
 
-        Profiler profiler(m_numMBsToCUDAProfile);
+    Profiler profiler(m_numMBsToCUDAProfile);
 
-        // resetting this, so profiling is performed for one epoch only
-        m_numMBsToCUDAProfile = 0;
+    // resetting this, so profiling is performed for one epoch only
+    m_numMBsToCUDAProfile = 0;
 
-        bool useDistributedMBReading = useParallelTrain &&
-                                       m_enableDistributedMBReading &&
-                                       trainSetDataReader->SupportsDistributedMBRead();
-        if (useDistributedMBReading)
-        {
-            trainSetDataReader->StartDistributedMinibatchLoop(tunedMBSize, epochNumber, g_mpi->CurrentNodeRank(),
-                                                              g_mpi->NumNodesInUse(), epochSize);
-        }
-        else
-        {
-            trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, epochSize);
-        }
+    bool useDistributedMBReading = useParallelTrain &&
+                                   m_enableDistributedMBReading &&
+                                   trainSetDataReader->SupportsDistributedMBRead();
+    if (useDistributedMBReading)
+    {
+        trainSetDataReader->StartDistributedMinibatchLoop(tunedMBSize, epochNumber, g_mpi->CurrentNodeRank(),
+                                                          g_mpi->NumNodesInUse(), epochSize);
+    }
+    else
+    {
+        trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, epochSize);
+    }
 
-        net->StartEvaluateMinibatchLoop(evaluationNodes);
-        net->StartEvaluateMinibatchLoop(criterionNodes);
-        if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode)
-        {
-            refNet->StartEvaluateMinibatchLoop(refNode);
-        }
+    net->StartEvaluateMinibatchLoop(evaluationNodes);
+    net->StartEvaluateMinibatchLoop(criterionNodes);
+    if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode)
+    {
+        refNet->StartEvaluateMinibatchLoop(refNode);
+    }
 
-        // prepare for sub-minibatching
-        // Sub-minibatching is used if a single minibatch is too large to fit into GPU RAM.
-        DataReaderHelpers::SubminibatchDispatcher<ElemType> smbDispatcher;
-        size_t numSubminibatchesNeeded = 0; 
+    // prepare for sub-minibatching
+    // Sub-minibatching is used if a single minibatch is too large to fit into GPU RAM.
+    DataReaderHelpers::SubminibatchDispatcher<ElemType> smbDispatcher;
+    size_t numSubminibatchesNeeded = 0;
     if (m_maxSamplesInRAM < SIZE_MAX || m_numSubminiBatches > 1) // user-specified maximum number of samples that fit into GPU RAM; or 0 if not enabled
+    {
+        if (m_maxSamplesInRAM < SIZE_MAX)
         {
-            if (m_maxSamplesInRAM < SIZE_MAX)
-            {
-                // into how many pieces would we need to break the minibatch?
-                // TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed.
-                size_t numParallelSequences = trainSetDataReader->GetNumParallelSequences();
-                size_t estimatedMBSize = tunedMBSize * numParallelSequences;
+            // into how many pieces would we need to break the minibatch?
+            // TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed.
+            size_t numParallelSequences = trainSetDataReader->GetNumParallelSequences();
+            size_t estimatedMBSize = tunedMBSize * numParallelSequences;
             numSubminibatchesNeeded = (size_t) std::ceil((float) estimatedMBSize / m_maxSamplesInRAM);
-            }
-            if (m_numSubminiBatches > 1)
-            {
-                numSubminibatchesNeeded = m_numSubminiBatches;
-            }
         }
-        // this is non-trivial, we need a manager object to handle this
-        if (numSubminibatchesNeeded > 1)
-            smbDispatcher.Init(net, learnableNodes, criterionNodes, evaluationNodes);
+        if (m_numSubminiBatches > 1)
+        {
+            numSubminibatchesNeeded = m_numSubminiBatches;
+        }
+    }
+    // this is non-trivial, we need a manager object to handle this
+    if (numSubminibatchesNeeded > 1)
+        smbDispatcher.Init(net, learnableNodes, criterionNodes, evaluationNodes);
 
-        // The following is a special feature only supported by the Kaldi2Reader for more efficient sequence training.
-        // This attemps to compute the error signal for the whole utterance, which will
-        // be fed to the neural network as features. Currently it is a workaround
-        // for the two-forward-pass sequence and ctc training, which allows
-        // processing more utterances at the same time.
-        // TODO: move the two-forward-pass support out of the reader, make a first-class citizen.
-        AttemptUtteranceDerivativeFeatures(net, trainSetDataReader, featureNodes, inputMatrices);
+    // The following is a special feature only supported by the Kaldi2Reader for more efficient sequence training.
+    // This attemps to compute the error signal for the whole utterance, which will
+    // be fed to the neural network as features. Currently it is a workaround
+    // for the two-forward-pass sequence and ctc training, which allows
+    // processing more utterances at the same time.
+    // TODO: move the two-forward-pass support out of the reader, make a first-class citizen.
+    AttemptUtteranceDerivativeFeatures(net, trainSetDataReader, featureNodes, inputMatrices);
 
-        fprintf(stderr, "\nStarting minibatch loop");
-        if (useGradientAggregation)
-        {
-            fprintf(stderr, ", DataParallelSGD training (MyRank = %d, NumNodes = %d, NumGradientBits = %d)",
+    fprintf(stderr, "\nStarting minibatch loop");
+    if (useGradientAggregation)
+    {
+        fprintf(stderr, ", DataParallelSGD training (MyRank = %d, NumNodes = %d, NumGradientBits = %d)",
                 (int) g_mpi->CurrentNodeRank(), (int) g_mpi->NumNodesInUse(), (int) m_numGradientBits);
-            if (m_bufferedAsyncGradientAggregation)
-            {
-                fprintf(stderr, ", BufferedAsyncGradientAggregation is ENABLED");
-            }
-        }
-        if (useDistributedMBReading)
+        if (m_bufferedAsyncGradientAggregation)
         {
-            fprintf(stderr, ", distributed reading is ENABLED");
+            fprintf(stderr, ", BufferedAsyncGradientAggregation is ENABLED");
         }
-        if (numSubminibatchesNeeded > 1)
-        {
-            if (m_maxSamplesInRAM < SIZE_MAX)
+    }
+    if (useDistributedMBReading)
+    {
+        fprintf(stderr, ", distributed reading is ENABLED");
+    }
+    if (numSubminibatchesNeeded > 1)
+    {
+        if (m_maxSamplesInRAM < SIZE_MAX)
             fprintf(stderr, ", with maximum %d samples in RAM", (int) m_maxSamplesInRAM);
-            else
+        else
             fprintf(stderr, ", with %d subminibatch", (int) numSubminibatchesNeeded);
-        }
-        fprintf(stderr, ".\n");
+    }
+    fprintf(stderr, ".\n");
 
-        Timer timer;
-        timer.Start();
+    Timer timer;
+    timer.Start();
 
-        // --- MAIN MINIBATCH LOOP
+    // --- MAIN MINIBATCH LOOP
 
-        bool noMoreSamplesToProcess = false;
-        for (;;)
-        {
-            // get minibatch
-            // TODO: is it guaranteed that the GPU is already completed at this point, is it safe to overwrite the buffers?
-            size_t actualMBSize = 0;
-            bool wasDataRead = DataReaderHelpers::GetMinibatchIntoNetwork(*trainSetDataReader, net, criterionNodes[0],
-                                                                          useDistributedMBReading, useParallelTrain, *inputMatrices, actualMBSize);
+    bool noMoreSamplesToProcess = false;
+    for (;;)
+    {
+        // get minibatch
+        // TODO: is it guaranteed that the GPU is already completed at this point, is it safe to overwrite the buffers?
+        size_t actualMBSize = 0;
+        bool wasDataRead = DataReaderHelpers::GetMinibatchIntoNetwork(*trainSetDataReader, net, criterionNodes[0],
+                                                                      useDistributedMBReading, useParallelTrain, *inputMatrices, actualMBSize);
         if (!wasDataRead && (!useDistributedMBReading || noMoreSamplesToProcess)) // in case of distributed reading, we do a few more loops until all ranks have completed
             break;                                                                // end of epoch
 
-            // Note: If !wasDataRead then the data that GetMinibatchIntoNetwork() was supposed to full in are undefined.
-            // Must not touch them.
+        // Note: If !wasDataRead then the data that GetMinibatchIntoNetwork() was supposed to full in are undefined.
+        // Must not touch them.
 
-            if (!wasDataRead)
+        if (!wasDataRead)
             actualMBSize = 0; // (undefined if !wasDataRead)
 
-            nSamplesSinceLastModelSync += actualMBSize;
+        nSamplesSinceLastModelSync += actualMBSize;
 
-            // node data was changed
-            // TODO: move this to that function as well--just tired to pass everything as arguments
-            // TODO: We should do this right after the GetMinibatch() call, since that's where these changed.
-            //       Need to check whether that would cause unintended side effects.
-            // TODO: original code did not call this for actualMBSize == 0
-            ComputationNetwork::BumpEvalTimeStamp(featureNodes);
-            ComputationNetwork::BumpEvalTimeStamp(labelNodes);
+        // node data was changed
+        // TODO: move this to that function as well--just tired to pass everything as arguments
+        // TODO: We should do this right after the GetMinibatch() call, since that's where these changed.
+        //       Need to check whether that would cause unintended side effects.
+        // TODO: original code did not call this for actualMBSize == 0
+        ComputationNetwork::BumpEvalTimeStamp(featureNodes);
+        ComputationNetwork::BumpEvalTimeStamp(labelNodes);
 
-            if (actualMBSize > 0)
-            {
-                assert(wasDataRead);
+        if (actualMBSize > 0)
+        {
+            assert(wasDataRead);
 #ifndef EVALDLL
-                if (m_doGradientCheck && GradientCheck(net, criterionNodes, learnableNodes, 0) == false)
-                    LogicError("cannot pass gradient checker");
+            if (m_doGradientCheck && GradientCheck(net, criterionNodes, learnableNodes, 0) == false)
+                LogicError("cannot pass gradient checker");
 #endif
-                // TODO: currently we only support one node for regularization
-                if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode)
-                {
-                    size_t actualMBSize2 = refNet->DetermineActualMBSizeFromFeatures();
+            // TODO: currently we only support one node for regularization
+            if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode)
+            {
+                size_t actualMBSize2 = refNet->DetermineActualMBSizeFromFeatures();
                 refNet->GetMBLayoutPtr()->CopyFrom(net->GetMBLayoutPtr()); // TODO: This is UNTESTED (before this was missing, seemingly inconsistently)
-                    refNet->VerifyActualNumParallelSequences(trainSetDataReader->GetNumParallelSequences());
+                refNet->VerifyActualNumParallelSequences(trainSetDataReader->GetNumParallelSequences());
 
-                    if (actualMBSize2 != actualMBSize)
-                        LogicError("TrainOneEpoch: refNet has different MB size than main net??");
+                if (actualMBSize2 != actualMBSize)
+                    LogicError("TrainOneEpoch: refNet has different MB size than main net??");
 
-                    refNet->ForwardProp(refNode);
+                refNet->ForwardProp(refNode);
                 Matrix<ElemType>::ScaleAndAdd((ElemType) m_adaptationRegWeight,
-                                                  dynamic_pointer_cast<ComputationNode<ElemType>>(refNode)->Value(),
-                                                  (ElemType)(1.0 - m_adaptationRegWeight),
-                                                  dynamic_pointer_cast<ComputationNode<ElemType>>(labelNodes[0])->Value());
-                }
+                                              dynamic_pointer_cast<ComputationNode<ElemType>>(refNode)->Value(),
+                                              (ElemType)(1.0 - m_adaptationRegWeight),
+                                              dynamic_pointer_cast<ComputationNode<ElemType>>(labelNodes[0])->Value());
+            }
 
-                // do forward and back propagation
+            // do forward and back propagation
 
-                // We optionally break the minibatch into sub-minibatches.
-                // This, when enabled, is used when a full minibatch does not fit into GPU RAM.
-                size_t actualNumSubminibatches = numSubminibatchesNeeded <= 1 ? 1 : smbDispatcher.GetMinibatchIntoCache(*trainSetDataReader, *net, *inputMatrices, numSubminibatchesNeeded);
-                for (size_t ismb = 0; ismb < actualNumSubminibatches; ismb++)
+            // We optionally break the minibatch into sub-minibatches.
+            // This, when enabled, is used when a full minibatch does not fit into GPU RAM.
+            size_t actualNumSubminibatches = numSubminibatchesNeeded <= 1 ? 1 : smbDispatcher.GetMinibatchIntoCache(*trainSetDataReader, *net, *inputMatrices, numSubminibatchesNeeded);
+            for (size_t ismb = 0; ismb < actualNumSubminibatches; ismb++)
+            {
+                if (actualNumSubminibatches > 1)
                 {
-                    if (actualNumSubminibatches > 1)
-                    {
                     smbDispatcher.GetSubMinibatchToNet(ismb); // get sub-minibatch from full-size one
-                        ComputationNetwork::BumpEvalTimeStamp(featureNodes);
-                        ComputationNetwork::BumpEvalTimeStamp(labelNodes);
-                    }
+                    ComputationNetwork::BumpEvalTimeStamp(featureNodes);
+                    ComputationNetwork::BumpEvalTimeStamp(labelNodes);
+                }
 
-                    // ===========================================================
-                    // forward prop for evaluate eval nodes
-                    // ===========================================================
+                // ===========================================================
+                // forward prop for evaluate eval nodes
+                // ===========================================================
 
-                    // compute eval node first since when gradient is computed the forward function values
-                    // may be changed and need to be recomputed when gradient and function value share the same matrix
+                // compute eval node first since when gradient is computed the forward function values
+                // may be changed and need to be recomputed when gradient and function value share the same matrix
                 net->ForwardProp(evaluationNodes); // the bulk of this evaluation is reused in ComputeGradient() below
 
-                    // ===========================================================
-                    // forward prop for training criterion
-                    // ===========================================================
+                // ===========================================================
+                // forward prop for training criterion
+                // ===========================================================
 
-                    net->ForwardProp(criterionNodes[0]);
+                net->ForwardProp(criterionNodes[0]);
 
-                    // ===========================================================
-                    // backprop
-                    // ===========================================================
+                // ===========================================================
+                // backprop
+                // ===========================================================
 
                 if (learnRatePerSample > 0.01 * m_minLearnRate) // only compute gradient when learning rate is large enough
-                        net->Backprop(criterionNodes[0]);
+                    net->Backprop(criterionNodes[0]);
 
-                    // house-keeping for sub-minibatching
-                    if (actualNumSubminibatches > 1)
+                // house-keeping for sub-minibatching
+                if (actualNumSubminibatches > 1)
                     smbDispatcher.DoneWithCurrentSubMinibatch(ismb); // page state out
             }                                                        // end sub-minibatch loop
-                if (actualNumSubminibatches > 1)
-                    smbDispatcher.DoneWithCurrentMinibatch(); 
-            } // if (actualMBSize > 0)
+            if (actualNumSubminibatches > 1)
+                smbDispatcher.DoneWithCurrentMinibatch();
+        } // if (actualMBSize > 0)
 
-            // for progress and statistics, we should only count frames that are not gaps
-            size_t numSamplesWithLabel = wasDataRead ? net->GetNumSamplesWithLabel(actualMBSize) : 0;
+        // for progress and statistics, we should only count frames that are not gaps
+        size_t numSamplesWithLabel = wasDataRead ? net->GetNumSamplesWithLabel(actualMBSize) : 0;
 
-            // Sum of actualMBSize across all nodes when using parallel training
-            size_t aggregateNumSamples = actualMBSize;
-            size_t aggregateNumSamplesWithLabel = numSamplesWithLabel;
+        // Sum of actualMBSize across all nodes when using parallel training
+        size_t aggregateNumSamples = actualMBSize;
+        size_t aggregateNumSamplesWithLabel = numSamplesWithLabel;
 
-            if (!useGradientAggregation)
+        if (!useGradientAggregation)
+        {
+            // accumulate criterion values (objective, eval)
+            if (actualMBSize != 0)
             {
-                // accumulate criterion values (objective, eval)
-                if (actualMBSize != 0)
+                assert(wasDataRead);
+                // criteria are in Value()(0,0), we accumulate into another 1x1 Matrix (to avoid having to pull the values off the GPU)
+                Matrix<ElemType>::AddElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(criterionNodes[0])->Value(),
+                                                      0, 0, localEpochCriterion, 0, 0);
+                for (size_t i = 0; i < evaluationNodes.size(); i++)
                 {
-                    assert(wasDataRead);
-                    // criteria are in Value()(0,0), we accumulate into another 1x1 Matrix (to avoid having to pull the values off the GPU)
-                    Matrix<ElemType>::AddElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(criterionNodes[0])->Value(),
-                                                          0, 0, localEpochCriterion, 0, 0);
-                    for (size_t i = 0; i < evaluationNodes.size(); i++)
-                    {
-                        Matrix<ElemType>::AddElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(evaluationNodes[i])->Value(),
-                                                              0, 0, localEpochEvalErrors, 0, i);
-                    }
+                    Matrix<ElemType>::AddElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(evaluationNodes[i])->Value(),
+                                                          0, 0, localEpochEvalErrors, 0, i);
                 }
             }
-            else
+        }
+        else
+        {
+            // distributed gradient aggregation
+            if (learnParamsGradients.size() == 0)
             {
-                // distributed gradient aggregation
-                if (learnParamsGradients.size() == 0)
+                learnParamsGradients.reserve(learnableNodes.size());
+                for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
                 {
-                    learnParamsGradients.reserve(learnableNodes.size());
-                    for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
+                    ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
+                    if (node->IsParameterUpdateRequired())
                     {
-                        ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
-                        if (node->IsParameterUpdateRequired())
-                        {
-                            Matrix<ElemType>* currParamsGradient = &(node->Gradient());
+                        Matrix<ElemType>* currParamsGradient = &(node->Gradient());
 
-                            // Sometimes, in parallel training, the current node may not get any samples to process
-                            // In this case, the gradient matrix may not have been sized yet. If so, lets size it.
-                            if (currParamsGradient->GetNumCols() == 0)
-                            {
-                                Matrix<ElemType>* currParamsValues = &(node->Value());
-                                currParamsGradient->Resize(currParamsValues->GetNumRows(), currParamsValues->GetNumCols());
-                            }
-
-                            learnParamsGradients.push_back(currParamsGradient);
+                        // Sometimes, in parallel training, the current node may not get any samples to process
+                        // In this case, the gradient matrix may not have been sized yet. If so, lets size it.
+                        if (currParamsGradient->GetNumCols() == 0)
+                        {
+                            Matrix<ElemType>* currParamsValues = &(node->Value());
+                            currParamsGradient->Resize(currParamsValues->GetNumRows(), currParamsValues->GetNumCols());
                         }
+
+                        learnParamsGradients.push_back(currParamsGradient);
                     }
                 }
+            }
 
-                // prepare the header
-                m_gradHeader->numEvalNode = evaluationNodes.size();
-                m_gradHeader->numSamples = actualMBSize;
-                m_gradHeader->numSamplesWithLabel = numSamplesWithLabel;
-                m_gradHeader->criterion = actualMBSize > 0 ? criterionNodes[0]->Get00Element() : 0.0;
-                for (size_t i = 0; i < evaluationNodes.size(); i++)
-                    m_gradHeader->evalErrors[i] = actualMBSize > 0 ? evaluationNodes[i]->Get00Element() : 0.0;
+            // prepare the header
+            m_gradHeader->numEvalNode = evaluationNodes.size();
+            m_gradHeader->numSamples = actualMBSize;
+            m_gradHeader->numSamplesWithLabel = numSamplesWithLabel;
+            m_gradHeader->criterion = actualMBSize > 0 ? criterionNodes[0]->Get00Element() : 0.0;
+            for (size_t i = 0; i < evaluationNodes.size(); i++)
+                m_gradHeader->evalErrors[i] = actualMBSize > 0 ? evaluationNodes[i]->Get00Element() : 0.0;
 
-                bool samplesProcessed = m_distGradAgg->AggregateGradients(learnParamsGradients, m_gradHeader, epochNumber);
-                noMoreSamplesToProcess = !samplesProcessed;
+            bool samplesProcessed = m_distGradAgg->AggregateGradients(learnParamsGradients, m_gradHeader, epochNumber);
+            noMoreSamplesToProcess = !samplesProcessed;
 
-                aggregateNumSamples = m_gradHeader->numSamples;
-                aggregateNumSamplesWithLabel = m_gradHeader->numSamplesWithLabel;
-                epochCriterion += m_gradHeader->criterion;
+            aggregateNumSamples = m_gradHeader->numSamples;
+            aggregateNumSamplesWithLabel = m_gradHeader->numSamplesWithLabel;
+            epochCriterion += m_gradHeader->criterion;
             for (size_t i = 0; i < epochEvalErrors.size(); i++)
-                    epochEvalErrors[i] += m_gradHeader->evalErrors[i];
-            }
+                epochEvalErrors[i] += m_gradHeader->evalErrors[i];
+        }
 
-            // update model parameters
-            if ((aggregateNumSamples > 0) && (learnRatePerSample > m_minLearnRate * 0.01))
+        // update model parameters
+        if ((aggregateNumSamples > 0) && (learnRatePerSample > m_minLearnRate * 0.01))
+        {
+            auto smoothedGradientIter = smoothedGradients.begin();
+            for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, smoothedGradientIter++)
             {
-                auto smoothedGradientIter = smoothedGradients.begin();
-                for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, smoothedGradientIter++)
+                ComputationNodeBasePtr node = *nodeIter;
+                if (node->IsParameterUpdateRequired())
                 {
-                    ComputationNodeBasePtr node = *nodeIter;
-                    if (node->IsParameterUpdateRequired())
-                    {
-                        Matrix<ElemType>& smoothedGradient = *smoothedGradientIter;
+                    Matrix<ElemType>& smoothedGradient = *smoothedGradientIter;
 #ifdef _DEBUG
-                        if (smoothedGradient.HasNan("TrainOneEpoch/UpdateWeights(): "))
-                            LogicError("%ls %ls operation has NaNs in smoothedGradient.", node->NodeName().c_str(), node->OperationName().c_str());
+                    if (smoothedGradient.HasNan("TrainOneEpoch/UpdateWeights(): "))
+                        LogicError("%ls %ls operation has NaNs in smoothedGradient.", node->NodeName().c_str(), node->OperationName().c_str());
 #endif
-                        UpdateWeights(node, smoothedGradient, learnRatePerSample,
+                    UpdateWeights(node, smoothedGradient, learnRatePerSample,
                                   GetMomentumPerSample(epochNumber /*BUGBUG workaround:*/, net->GetMBLayoutPtr()->GetNumParallelSequences()), aggregateNumSamples,
-                                      m_L2RegWeight, m_L1RegWeight,
-                                      m_needAveMultiplier, m_useNesterovMomentum);
+                                  m_L2RegWeight, m_L1RegWeight,
+                                  m_needAveMultiplier, m_useNesterovMomentum);
 #ifdef _DEBUG
-                        if (dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Value().HasNan("TrainOneEpoch/UpdateWeights(): "))
-                            LogicError("%ls %ls operation has NaNs in functionValues after parameter update.", node->NodeName().c_str(), node->OperationName().c_str());
+                    if (dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Value().HasNan("TrainOneEpoch/UpdateWeights(): "))
+                        LogicError("%ls %ls operation has NaNs in functionValues after parameter update.", node->NodeName().c_str(), node->OperationName().c_str());
 #endif
-                    }
                 }
             }
+        }
 
-            // aggregation by model averaging
-            // TODO: this does not happen each MB, does it?
-            if (useModelAveraging)
+        // aggregation by model averaging
+        // TODO: this does not happen each MB, does it?
+        if (useModelAveraging)
+        {
+            // Determine if any samples were processed across any of the ranks
+            if (useDistributedMBReading)
             {
-                // Determine if any samples were processed across any of the ranks
-                if (useDistributedMBReading)
-                {
-                    std::array<int, 1> numNodesWithDataToProcess;
-                    numNodesWithDataToProcess[0] = wasDataRead ? 1 : 0;
-                    g_mpi->AllReduce(numNodesWithDataToProcess);
+                std::array<int, 1> numNodesWithDataToProcess;
+                numNodesWithDataToProcess[0] = wasDataRead ? 1 : 0;
+                g_mpi->AllReduce(numNodesWithDataToProcess);
 
-                    if (numNodesWithDataToProcess[0] == 0)
-                        noMoreSamplesToProcess = true;
-                }
+                if (numNodesWithDataToProcess[0] == 0)
+                    noMoreSamplesToProcess = true;
+            }
 
-                if (g_mpi->NumNodesInUse() > 1)
+            if (g_mpi->NumNodesInUse() > 1)
+            {
+                size_t processedSamples = 0;
+                float secondsSinceLastSyncFinished = 0;
+                float secondsSpentOnSync = 0;
+                if (ModelAveragingProcessing(nSamplesSinceLastModelSync, learnableNodes, processedSamples,
+                                             secondsSinceLastSyncFinished, secondsSpentOnSync))
                 {
-                    size_t processedSamples = 0; 
-                    float secondsSinceLastSyncFinished = 0; 
-                    float secondsSpentOnSync = 0;
-                    if (ModelAveragingProcessing(nSamplesSinceLastModelSync, learnableNodes, processedSamples,
-                                                 secondsSinceLastSyncFinished, secondsSpentOnSync))
+                    // if a sync happens, do some extra work
+                    nSamplesSinceLastModelSync = 0;
+                    nSynced++;
+
+                    nSecondsOnMASync += secondsSpentOnSync;
+                    nSecondsSinceLastMAPerfReport += secondsSinceLastSyncFinished;
+
+                    if (m_syncStatsTrace > 0)
                     {
-                        // if a sync happens, do some extra work
-                        nSamplesSinceLastModelSync = 0; 
-                        nSynced++;
-
-                        nSecondsOnMASync += secondsSpentOnSync; 
-                        nSecondsSinceLastMAPerfReport += secondsSinceLastSyncFinished; 
-                    
-                        if (m_syncStatsTrace > 0)
+                        if (nSynced % m_syncStatsTrace == 0)
                         {
-                            if (nSynced % m_syncStatsTrace == 0)
-                            {
-                                fprintf(stderr, "\t\t-----(model averaging stats) %d-th sync, %8.2f seconds since last report, %5.2f seconds on communication\n",
+                            fprintf(stderr, "\t\t-----(model averaging stats) %d-th sync, %8.2f seconds since last report, %5.2f seconds on communication\n",
                                     (int) nSynced, nSecondsSinceLastMAPerfReport, nSecondsOnMASync);
-                                nSecondsOnMASync = 0; 
-                                nSecondsSinceLastMAPerfReport = 0; 
-                            }
+                            nSecondsOnMASync = 0;
+                            nSecondsSinceLastMAPerfReport = 0;
                         }
                     }
-                    aggregateNumSamplesWithLabel = processedSamples;
                 }
+                aggregateNumSamplesWithLabel = processedSamples;
             }
+        }
 
-            timer.Stop();
-            numMBsRun++;
+        timer.Stop();
+        numMBsRun++;
 
-            totalTimeInMBs += timer.ElapsedSeconds();
-            numSamplesLastMBs += useModelAveraging ? int(actualMBSize) : int(aggregateNumSamplesWithLabel);
+        totalTimeInMBs += timer.ElapsedSeconds();
+        numSamplesLastMBs += useModelAveraging ? int(actualMBSize) : int(aggregateNumSamplesWithLabel);
 
-            if (numMBsRun % m_numMBsToShowResult == 0)
+        if (numMBsRun % m_numMBsToShowResult == 0)
+        {
+            // get the epoch Values updated
+            if (!useGradientAggregation)
             {
-                // get the epoch Values updated
-                if (!useGradientAggregation)
+                timer.Restart();
+                epochCriterion = localEpochCriterion.Get00Element();
+                for (size_t i = 0; i < epochEvalErrors.size(); i++)
                 {
-                    timer.Restart();
-                    epochCriterion = localEpochCriterion.Get00Element();
-                    for (size_t i = 0; i < epochEvalErrors.size(); i++)
-                    {
-                        epochEvalErrors[i] = localEpochEvalErrors(0, i);
-                    }
-                    timer.Stop();
-
-                    // Add the last trailing compute
-                    totalTimeInMBs += timer.ElapsedSeconds();
+                    epochEvalErrors[i] = localEpochEvalErrors(0, i);
                 }
+                timer.Stop();
+
+                // Add the last trailing compute
+                totalTimeInMBs += timer.ElapsedSeconds();
+            }
 
-                double trainLossPerSample = (numSamplesLastMBs != 0) ? ((epochCriterion - epochCriterionLastMBs) / numSamplesLastMBs) : 0.0;
-                bool wasProgressPrinted = false;
+            double trainLossPerSample = (numSamplesLastMBs != 0) ? ((epochCriterion - epochCriterionLastMBs) / numSamplesLastMBs) : 0.0;
+            bool wasProgressPrinted = false;
 
-                if (epochNumber > 0 || (int) epochSize > 0)
+            if (epochNumber > 0 || (int) epochSize > 0)
+            {
+                // progress tracing for compute cluster management
+                double mbProg = 0.0;
+                int mbProgNumPrecision = 2;
+                if (m_maxComputedEpochSize != 0)
                 {
-                    // progress tracing for compute cluster management
-                    double mbProg = 0.0;
-                    int mbProgNumPrecision = 2;
-                    if (m_maxComputedEpochSize != 0)
-                    {
                     double numMBPerEpoch = (double) m_maxComputedEpochSize / (double) tunedMBSize;
                     mbProg = (double) numMBsRun / numMBPerEpoch;
                     mbProgNumPrecision = (int) ceil(log10(numMBPerEpoch / (double) m_numMBsToShowResult));
-                        mbProgNumPrecision = max(mbProgNumPrecision - 2, 2);
-                    }
-                    wasProgressPrinted = ProgressTracing::TraceProgressPercentage(epochNumber, mbProg, false);
-
-                    // progress tracing for regular log
-                    string formatString = "%s Epoch[%2d of %d]-Minibatch[%4d-%4d, %2." + std::to_string(mbProgNumPrecision) + "f%%]: SamplesSeen = %d; TrainLossPerSample = " +
-                                          GeneratePaddedFloatOrExpFormat(11, 8, trainLossPerSample) + "; ";
-                    SGDTrace(stderr, formatString.c_str(),
-                             prefixMsg.c_str(), epochNumber + 1, m_maxEpochs, numMBsRun - m_numMBsToShowResult + 1,
-                             numMBsRun, mbProg * 100, numSamplesLastMBs, trainLossPerSample);
-                }
-                else
-                {
-                    string formatString = "%s Epoch[%2d of %d]-Minibatch[%4d-%4d]: SamplesSeen = %d; TrainLossPerSample = " +
-                                          GeneratePaddedFloatOrExpFormat(11, 8, trainLossPerSample) + "; ";
-                    SGDTrace(stderr, formatString.c_str(),
-                             prefixMsg.c_str(), epochNumber + 1, m_maxEpochs, numMBsRun - m_numMBsToShowResult + 1,
-                             numMBsRun, numSamplesLastMBs, trainLossPerSample);
-                    m_maxComputedEpochSize = numMBsRun * numSamplesLastMBs / m_numMBsToShowResult;
-                }
-
-                double evalError = 0.0;
-                for (size_t i = 0; i < epochEvalErrors.size(); i++)
-                {
-                    evalError = (epochEvalErrors[i] - epochEvalErrorsLastMBs[i]) / numSamplesLastMBs;
-                    string formatString = "EvalErr[%lu]PerSample = " + GeneratePaddedFloatOrExpFormat(0, 8, evalError) + "; ";
-                    SGDTrace(stderr, formatString.c_str(), i, evalError);
+                    mbProgNumPrecision = max(mbProgNumPrecision - 2, 2);
                 }
+                wasProgressPrinted = ProgressTracing::TraceProgressPercentage(epochNumber, mbProg, false);
+
+                // progress tracing for regular log
+                string formatString = "%s Epoch[%2d of %d]-Minibatch[%4d-%4d, %2." + std::to_string(mbProgNumPrecision) + "f%%]: SamplesSeen = %d; TrainLossPerSample = " +
+                                      GeneratePaddedFloatOrExpFormat(11, 8, trainLossPerSample) + "; ";
+                SGDTrace(stderr, formatString.c_str(),
+                         prefixMsg.c_str(), epochNumber + 1, m_maxEpochs, numMBsRun - m_numMBsToShowResult + 1,
+                         numMBsRun, mbProg * 100, numSamplesLastMBs, trainLossPerSample);
+            }
+            else
+            {
+                string formatString = "%s Epoch[%2d of %d]-Minibatch[%4d-%4d]: SamplesSeen = %d; TrainLossPerSample = " +
+                                      GeneratePaddedFloatOrExpFormat(11, 8, trainLossPerSample) + "; ";
+                SGDTrace(stderr, formatString.c_str(),
+                         prefixMsg.c_str(), epochNumber + 1, m_maxEpochs, numMBsRun - m_numMBsToShowResult + 1,
+                         numMBsRun, numSamplesLastMBs, trainLossPerSample);
+                m_maxComputedEpochSize = numMBsRun * numSamplesLastMBs / m_numMBsToShowResult;
+            }
 
-                string formatString = "TotalTime = " + GeneratePaddedFloatOrExpFormat(0, 4, totalTimeInMBs) + "s; SamplesPerSecond = %.1f\n";
-                SGDTrace(stderr, formatString.c_str(), totalTimeInMBs, numSamplesLastMBs / totalTimeInMBs);
+            double evalError = 0.0;
+            for (size_t i = 0; i < epochEvalErrors.size(); i++)
+            {
+                evalError = (epochEvalErrors[i] - epochEvalErrorsLastMBs[i]) / numSamplesLastMBs;
+                string formatString = "EvalErr[%lu]PerSample = " + GeneratePaddedFloatOrExpFormat(0, 8, evalError) + "; ";
+                SGDTrace(stderr, formatString.c_str(), i, evalError);
+            }
 
-                // progress tracing for compute cluster management
-                if (wasProgressPrinted)
-                {
-                    ProgressTracing::TraceTrainLoss(trainLossPerSample);
-                }
+            string formatString = "TotalTime = " + GeneratePaddedFloatOrExpFormat(0, 4, totalTimeInMBs) + "s; SamplesPerSecond = %.1f\n";
+            SGDTrace(stderr, formatString.c_str(), totalTimeInMBs, numSamplesLastMBs / totalTimeInMBs);
 
-                if (m_traceLevel > 0)
-                {
-                    fflush(stderr);
-                }
+            // progress tracing for compute cluster management
+            if (wasProgressPrinted)
+            {
+                ProgressTracing::TraceTrainLoss(trainLossPerSample);
+            }
 
-                // reset statistics
-                totalTimeInMBs = 0;
-                numSamplesLastMBs = 0;
+            if (m_traceLevel > 0)
+            {
+                fflush(stderr);
+            }
 
-                epochCriterionLastMBs = epochCriterion;
-                for (size_t i = 0; i < epochEvalErrorsLastMBs.size(); i++)
-                {
-                    epochEvalErrorsLastMBs[i] = epochEvalErrors[i];
-                }
+            // reset statistics
+            totalTimeInMBs = 0;
+            numSamplesLastMBs = 0;
 
-                if (std::isnan(epochCriterion))
-                {
-                    RuntimeError("The training criterion is not a number (NAN). Stop\n");
-                }
+            epochCriterionLastMBs = epochCriterion;
+            for (size_t i = 0; i < epochEvalErrorsLastMBs.size(); i++)
+            {
+                epochEvalErrorsLastMBs[i] = epochEvalErrors[i];
+            }
+
+            if (std::isnan(epochCriterion))
+            {
+                RuntimeError("The training criterion is not a number (NAN). Stop\n");
             }
+        }
 
-            timer.Restart();
-            totalEpochSamples += aggregateNumSamplesWithLabel;
-            totalSamplesSeen += aggregateNumSamplesWithLabel;
+        timer.Restart();
+        totalEpochSamples += aggregateNumSamplesWithLabel;
+        totalSamplesSeen += aggregateNumSamplesWithLabel;
 
-            // call DataEnd function
-            // This signals something from SGD to the reader.
-            // DataEnd does reader specific process if sentence ending is reached
-            trainSetDataReader->DataEnd(EndDataType::endDataSentence);
+        // call DataEnd function
+        // This signals something from SGD to the reader.
+        // DataEnd does reader specific process if sentence ending is reached
+        trainSetDataReader->DataEnd(EndDataType::endDataSentence);
 
-            // Attempts to compute the error signal for the whole utterance, which will
-            // be fed to the neural network as features. Currently it is a workaround
-            // for the two-forward-pass sequence and ctc training, which allows
-            // processing more utterances at the same time. Only used in Kaldi2Reader.
-            // TODO: move the two-forward-pass support out of the reader.
-            AttemptUtteranceDerivativeFeatures(net, trainSetDataReader, featureNodes, inputMatrices);
+        // Attempts to compute the error signal for the whole utterance, which will
+        // be fed to the neural network as features. Currently it is a workaround
+        // for the two-forward-pass sequence and ctc training, which allows
+        // processing more utterances at the same time. Only used in Kaldi2Reader.
+        // TODO: move the two-forward-pass support out of the reader.
+        AttemptUtteranceDerivativeFeatures(net, trainSetDataReader, featureNodes, inputMatrices);
 
-            profiler.NextSample();
-        }
+        profiler.NextSample();
+    }
 
-        // --- END MAIN MINIBATCH LOOP
+    // --- END MAIN MINIBATCH LOOP
 
     if (useModelAveraging && (g_mpi->NumNodesInUse() > 1))
-        {
-            // may not be synced after epoch finished, so do the sync here 
+    {
+        // may not be synced after epoch finished, so do the sync here
         int residualSampels = (int) nSamplesSinceLastModelSync;
-            g_mpi->AllReduce(&residualSampels, 1);
-            totalSamplesSeen += residualSampels; 
-            totalEpochSamples += residualSampels;
-            ModelAveragingSync(nSamplesSinceLastModelSync, learnableNodes);
-            nSynced++;
-            nSamplesSinceLastModelSync = 0;
-        }
+        g_mpi->AllReduce(&residualSampels, 1);
+        totalSamplesSeen += residualSampels;
+        totalEpochSamples += residualSampels;
+        ModelAveragingSync(nSamplesSinceLastModelSync, learnableNodes);
+        nSynced++;
+        nSamplesSinceLastModelSync = 0;
+    }
 
-        // compute final criterion values
-        if (useGradientAggregation)
-        {
-            // with parallelization, we have them in regular variables
-            epochCriterion /= float(totalEpochSamples);
+    // compute final criterion values
+    if (useGradientAggregation)
+    {
+        // with parallelization, we have them in regular variables
+        epochCriterion /= float(totalEpochSamples);
         for (size_t i = 0; i < epochEvalErrors.size(); i++)
-            {
-                epochEvalErrors[i] /= totalEpochSamples;
-            }
-        }
-        else
         {
-            // without, we have them in Matrix objects that possibly live on the GPU--get them over now
-            localEpochCriterion /= float(totalEpochSamples);
-            localEpochEvalErrors /= float(totalEpochSamples);
-
-            epochCriterion = localEpochCriterion.Get00Element();
-            for (size_t i = 0; i < epochEvalErrors.size(); i++)
-            {
-                epochEvalErrors[i] = localEpochEvalErrors(0, i);
-            }
+            epochEvalErrors[i] /= totalEpochSamples;
         }
+    }
+    else
+    {
+        // without, we have them in Matrix objects that possibly live on the GPU--get them over now
+        localEpochCriterion /= float(totalEpochSamples);
+        localEpochEvalErrors /= float(totalEpochSamples);
 
-        // in case of model averaging, do one more final aggregation of criteria
-        if (useModelAveraging && (g_mpi->NumNodesInUse() > 1))
+        epochCriterion = localEpochCriterion.Get00Element();
+        for (size_t i = 0; i < epochEvalErrors.size(); i++)
         {
-            // merge epochCriterion and epochEvalErrors over nodes 
-            g_mpi->AllReduce(&epochCriterion, 1);
-            g_mpi->AllReduce(epochEvalErrors);
+            epochEvalErrors[i] = localEpochEvalErrors(0, i);
         }
-        return totalEpochSamples;
     }
 
-    // -----------------------------------------------------------------------
-    // sub-routines and helpers follow below
-    // -----------------------------------------------------------------------
+    // in case of model averaging, do one more final aggregation of criteria
+    if (useModelAveraging && (g_mpi->NumNodesInUse() > 1))
+    {
+        // merge epochCriterion and epochEvalErrors over nodes
+        g_mpi->AllReduce(&epochCriterion, 1);
+        g_mpi->AllReduce(epochEvalErrors);
+    }
+    return totalEpochSamples;
+}
+
+// -----------------------------------------------------------------------
+// sub-routines and helpers follow below
+// -----------------------------------------------------------------------
 
 #if 0
     // TODO: per discussion with Dong Yu, Guoguo Chen, and Yu Zhang, this function can be removed.
@@ -1309,1242 +1309,1242 @@ template <class ElemType>
     }
 #endif
 
-    static double MomentumPerMB(double momentumPerSample, size_t minibatchSize)
-    {
-        return pow(momentumPerSample, minibatchSize);
-    }
+static double MomentumPerMB(double momentumPerSample, size_t minibatchSize)
+{
+    return pow(momentumPerSample, minibatchSize);
+}
 
-    // Get{Train,Eval}CriterionNodes() return a reference that is, unfortunately, dependent on the network.
-    // So we hold those inside here. Not very nice. Also not thread-safe. This may go away once we fix sequence-to-sequence models properly.
-    // TODO: merge them into one.
-    static map<ComputationNetworkPtr, vector<ComputationNodeBasePtr>> tmpCriterionNodeSets;
-    // TODO: test this, then remove this comment
+// Get{Train,Eval}CriterionNodes() return a reference that is, unfortunately, dependent on the network.
+// So we hold those inside here. Not very nice. Also not thread-safe. This may go away once we fix sequence-to-sequence models properly.
+// TODO: merge them into one.
+static map<ComputationNetworkPtr, vector<ComputationNodeBasePtr>> tmpCriterionNodeSets;
+// TODO: test this, then remove this comment
 
 template <class ElemType>
 std::vector<ComputationNodeBasePtr>& SGD<ElemType>::GetTrainCriterionNodes(ComputationNetworkPtr net)
+{
+    if (!m_trainCriterionNodeName.empty())
     {
-        if (!m_trainCriterionNodeName.empty())
-        {
-            tmpCriterionNodeSets[net] = net->CriterionNodesFrom(m_trainCriterionNodeName);
-            return tmpCriterionNodeSets[net];
-        }
-        else
-            return net->FinalCriterionNodes();
+        tmpCriterionNodeSets[net] = net->CriterionNodesFrom(m_trainCriterionNodeName);
+        return tmpCriterionNodeSets[net];
     }
+    else
+        return net->FinalCriterionNodes();
+}
 
 template <class ElemType>
 std::vector<ComputationNodeBasePtr>& SGD<ElemType>::GetEvalCriterionNodes(ComputationNetworkPtr net)
+{
+    if (!m_evalCriterionNodeName.empty())
     {
-        if (!m_evalCriterionNodeName.empty())
-        {
-            tmpCriterionNodeSets[net] = net->CriterionNodesFrom(m_evalCriterionNodeName);
-            return tmpCriterionNodeSets[net];
-        }
-        else
-            return net->EvaluationNodes();
+        tmpCriterionNodeSets[net] = net->CriterionNodesFrom(m_evalCriterionNodeName);
+        return tmpCriterionNodeSets[net];
     }
+    else
+        return net->EvaluationNodes();
+}
 
-    // return true if precomputation is executed.
+// return true if precomputation is executed.
 template <class ElemType>
-    bool SGD<ElemType>::PreCompute(ComputationNetworkPtr net,
-                    IDataReader<ElemType>* trainSetDataReader,
+bool SGD<ElemType>::PreCompute(ComputationNetworkPtr net,
+                               IDataReader<ElemType>* trainSetDataReader,
                                std::vector<ComputationNodeBasePtr>& featureNodes,
                                std::vector<ComputationNodeBasePtr>& labelNodes,
-                    std::map<std::wstring, Matrix<ElemType>*>* inputMatrices)
-    {
+                               std::map<std::wstring, Matrix<ElemType>*>* inputMatrices)
+{
     std::list<ComputationNodeBasePtr> nodes = net->GetNodesRequiringPreComputation(); // this tests all HasComputed() flags
 
-        if (nodes.size() == 0)
-        {
-            fprintf(stderr, "No PreCompute nodes found, skipping PreCompute step\n");
-            return false;
-        }
+    if (nodes.size() == 0)
+    {
+        fprintf(stderr, "No PreCompute nodes found, skipping PreCompute step\n");
+        return false;
+    }
 
-        fprintf(stderr, "\nPrecomputing --> %lu PreCompute nodes found.\n\n", nodes.size());
-        for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
-        {
-            auto node = static_pointer_cast<PreComputedNodeBase<ElemType>>(*nodeIter);
-            fprintf(stderr, "\tNodeName: %ls\n", (node->NodeName()).c_str());
-        }
+    fprintf(stderr, "\nPrecomputing --> %lu PreCompute nodes found.\n\n", nodes.size());
+    for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
+    {
+        auto node = static_pointer_cast<PreComputedNodeBase<ElemType>>(*nodeIter);
+        fprintf(stderr, "\tNodeName: %ls\n", (node->NodeName()).c_str());
+    }
 
-        //compute
-        //trainSetDataReader->StartMinibatchLoop(m_mbSize[0],  0 , requestDataSize);
-        // trainSetDataReader->StartMinibatchLoop(m_mbSize[0],  0 , m_epochSize); // only based on one epoch
-        // [1/12/2015 erw] to support large dataset, we usually partition whole dataset into several epoch's,
-        // so we need to use all the data to do precomputing
+    //compute
+    //trainSetDataReader->StartMinibatchLoop(m_mbSize[0],  0 , requestDataSize);
+    // trainSetDataReader->StartMinibatchLoop(m_mbSize[0],  0 , m_epochSize); // only based on one epoch
+    // [1/12/2015 erw] to support large dataset, we usually partition whole dataset into several epoch's,
+    // so we need to use all the data to do precomputing
     if (m_useAllDataForPreComputedNode) // using all the data
-            trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0);
+        trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0);
     else // using only one epoch
-            trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, m_epochSize);
-        net->StartEvaluateMinibatchLoop(nodes);
+        trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, m_epochSize);
+    net->StartEvaluateMinibatchLoop(nodes);
 
-        // initialize
-        for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
-        {
-            auto node = static_pointer_cast<PreComputedNodeBase<ElemType>>(*nodeIter);
+    // initialize
+    for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
+    {
+        auto node = static_pointer_cast<PreComputedNodeBase<ElemType>>(*nodeIter);
         node->MarkComputed(false /*begin accumulating*/);
-        }
+    }
 
-        const size_t numIterationsBeforePrintingProgress = 100;
-        size_t numItersSinceLastPrintOfProgress = 0;
-        size_t actualMBSizeDummy;
-        while (DataReaderHelpers::GetMinibatchIntoNetwork(*trainSetDataReader, net, nullptr, false, false, *inputMatrices, actualMBSizeDummy))
-        {
-            // TODO: move these into GetMinibatchIntoNetwork()  --but those are passed around; necessary? Can't we get them from 'net'?
-            ComputationNetwork::BumpEvalTimeStamp(featureNodes);
-            ComputationNetwork::BumpEvalTimeStamp(labelNodes);
+    const size_t numIterationsBeforePrintingProgress = 100;
+    size_t numItersSinceLastPrintOfProgress = 0;
+    size_t actualMBSizeDummy;
+    while (DataReaderHelpers::GetMinibatchIntoNetwork(*trainSetDataReader, net, nullptr, false, false, *inputMatrices, actualMBSizeDummy))
+    {
+        // TODO: move these into GetMinibatchIntoNetwork()  --but those are passed around; necessary? Can't we get them from 'net'?
+        ComputationNetwork::BumpEvalTimeStamp(featureNodes);
+        ComputationNetwork::BumpEvalTimeStamp(labelNodes);
 
-            net->ForwardProp(nodes);
+        net->ForwardProp(nodes);
 
-            if (ProgressTracing::IsEnabled())
+        if (ProgressTracing::IsEnabled())
+        {
+            numItersSinceLastPrintOfProgress++;
+            if (numItersSinceLastPrintOfProgress >= numIterationsBeforePrintingProgress)
             {
-                numItersSinceLastPrintOfProgress++;
-                if (numItersSinceLastPrintOfProgress >= numIterationsBeforePrintingProgress)
-                {
-                    // TODO: For now just print 0.0 instead of calculating actual progress
-                    printf("PROGRESS: %.2f%%\n", 0.0f);
-                    numItersSinceLastPrintOfProgress = 0;
-                }
+                // TODO: For now just print 0.0 instead of calculating actual progress
+                printf("PROGRESS: %.2f%%\n", 0.0f);
+                numItersSinceLastPrintOfProgress = 0;
             }
         }
+    }
 
-        // finalize
-        for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
-        {
-            auto node = static_pointer_cast<PreComputedNodeBase<ElemType>>(*nodeIter);
+    // finalize
+    for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
+    {
+        auto node = static_pointer_cast<PreComputedNodeBase<ElemType>>(*nodeIter);
         node->MarkComputed(true /*done accumulating*/);
-        }
-        fprintf(stderr, "\nPrecomputing --> Completed.\n\n");
-
-        return true;
     }
+    fprintf(stderr, "\nPrecomputing --> Completed.\n\n");
 
-    // return a reasonable initial learning rate based on the initial mbsize
+    return true;
+}
+
+// return a reasonable initial learning rate based on the initial mbsize
 template <class ElemType>
-    double SGD<ElemType>::SearchForBestLearnRate(ComputationNetworkPtr net,
-                                                 ComputationNetworkPtr refNet,
-                                                 const ComputationNodeBasePtr& refNode, const int epochNumber,
-                                                 const double curLearnRate,
-                                                 IDataReader<ElemType>* trainSetDataReader,
+double SGD<ElemType>::SearchForBestLearnRate(ComputationNetworkPtr net,
+                                             ComputationNetworkPtr refNet,
+                                             const ComputationNodeBasePtr& refNode, const int epochNumber,
+                                             const double curLearnRate,
+                                             IDataReader<ElemType>* trainSetDataReader,
                                              const std::vector<ComputationNodeBasePtr>& featureNodes,
                                              const std::vector<ComputationNodeBasePtr>& labelNodes,
                                              const std::vector<ComputationNodeBasePtr>& criterionNodes,
                                              const std::vector<ComputationNodeBasePtr>& evaluationNodes,
-                                                 std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
+                                             std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
                                              const std::list<ComputationNodeBasePtr>& learnableNodes,
-                                                 std::list<Matrix<ElemType>>& smoothedGradients,
-                                                 const bool learnRateInitialized,
-                                                 const double largestPrevLearnRatePerSample)
-    {
-        double epochCriterion = std::numeric_limits<double>::infinity();
-        double prevCriterion = std::numeric_limits<double>::infinity();
-        vector<double> epochEvalErrors(evaluationNodes.size(), std::numeric_limits<double>::infinity());
+                                             std::list<Matrix<ElemType>>& smoothedGradients,
+                                             const bool learnRateInitialized,
+                                             const double largestPrevLearnRatePerSample)
+{
+    double epochCriterion = std::numeric_limits<double>::infinity();
+    double prevCriterion = std::numeric_limits<double>::infinity();
+    vector<double> epochEvalErrors(evaluationNodes.size(), std::numeric_limits<double>::infinity());
 
-        size_t totalSamplesSeen = 0;
-        double bestLearnRatePerSample = curLearnRate;
+    size_t totalSamplesSeen = 0;
+    double bestLearnRatePerSample = curLearnRate;
 
-        size_t numFramesToUseInSearch = m_numMiniBatch4LRSearch[epochNumber] * m_mbSize[epochNumber];
-        if (m_epochSize != requestDataSize)
-        {
-            // ensure the numFramesToUseInSearch does not exceed the total number of frames in the epoch
-            numFramesToUseInSearch = min(numFramesToUseInSearch, m_epochSize);
-        }
+    size_t numFramesToUseInSearch = m_numMiniBatch4LRSearch[epochNumber] * m_mbSize[epochNumber];
+    if (m_epochSize != requestDataSize)
+    {
+        // ensure the numFramesToUseInSearch does not exceed the total number of frames in the epoch
+        numFramesToUseInSearch = min(numFramesToUseInSearch, m_epochSize);
+    }
 
-        double baseCriterion;
+    double baseCriterion;
 
-        double minLearnRate = m_minLearnRate * 0.3f;
+    double minLearnRate = m_minLearnRate * 0.3f;
     double learnRatePerSample = 1.0f / 8.0f / 0.618f / sqrt((double) m_mbSize[epochNumber]);
 
-        if (learnRateInitialized && largestPrevLearnRatePerSample > 0)
-        {
-            // largestPrevLearnRatePerSample is per sample, first 0.618f is for compensation, second one is for safety
-            learnRatePerSample = largestPrevLearnRatePerSample / 0.618f / 0.618f;
-        }
-
-        int baseModelEpoch = epochNumber - 1;
-        net->RereadPersistableParameters<ElemType>(GetModelNameForEpoch(baseModelEpoch));
-
-        double learnRate = learnRatePerSample;
-        size_t dummyMinibatchSize = 0;
-        LoadCheckPointInfo(baseModelEpoch,
-                           /*out*/ totalSamplesSeen,
-                           /*out*/ learnRate,
-                           smoothedGradients,
-                           /*out*/ prevCriterion,
-                           /*out*/ dummyMinibatchSize);
-
-        // if model is not changed this is what we will get
-        TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
-                                        numFramesToUseInSearch, trainSetDataReader, 0, m_mbSize[epochNumber],
-                                        featureNodes, labelNodes,
-                                        criterionNodes, evaluationNodes,
-                                        inputMatrices, learnableNodes,
-                                        smoothedGradients, /*out*/ baseCriterion,
-                                        /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen,
-                                        "BaseAdaptiveLearnRateSearch:");
+    if (learnRateInitialized && largestPrevLearnRatePerSample > 0)
+    {
+        // largestPrevLearnRatePerSample is per sample, first 0.618f is for compensation, second one is for safety
+        learnRatePerSample = largestPrevLearnRatePerSample / 0.618f / 0.618f;
+    }
 
-        if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
-        {
-            if (prevCriterion == std::numeric_limits<double>::infinity())
-                prevCriterion = baseCriterion;
+    int baseModelEpoch = epochNumber - 1;
+    net->RereadPersistableParameters<ElemType>(GetModelNameForEpoch(baseModelEpoch));
+
+    double learnRate = learnRatePerSample;
+    size_t dummyMinibatchSize = 0;
+    LoadCheckPointInfo(baseModelEpoch,
+                       /*out*/ totalSamplesSeen,
+                       /*out*/ learnRate,
+                       smoothedGradients,
+                       /*out*/ prevCriterion,
+                       /*out*/ dummyMinibatchSize);
+
+    // baseline: run the mini-epoch with learning rate 0, i.e. the criterion we get if the model is not changed
+    TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
+                                    numFramesToUseInSearch, trainSetDataReader, 0, m_mbSize[epochNumber],
+                                    featureNodes, labelNodes,
+                                    criterionNodes, evaluationNodes,
+                                    inputMatrices, learnableNodes,
+                                    smoothedGradients, /*out*/ baseCriterion,
+                                    /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen,
+                                    "BaseAdaptiveLearnRateSearch:");
+
+    if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
+    {
+        if (prevCriterion == std::numeric_limits<double>::infinity())
+            prevCriterion = baseCriterion;
 
-            double ratio = 0.3;
+        double ratio = 0.3;
 
-            if (m_epochSize != requestDataSize)
+        if (m_epochSize != requestDataSize)
             ratio = pow(((double) numFramesToUseInSearch) / m_epochSize, 1.0f / 2);
 
-            baseCriterion = max(ratio * prevCriterion + (1 - ratio) * baseCriterion, baseCriterion);
-        }
+        baseCriterion = max(ratio * prevCriterion + (1 - ratio) * baseCriterion, baseCriterion);
+    }
 
-        do
-        {
-            learnRatePerSample *= 0.618;
-            TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
-                                            numFramesToUseInSearch, trainSetDataReader,
-                                            learnRatePerSample, m_mbSize[epochNumber], featureNodes,
-                                            labelNodes, criterionNodes,
-                                            evaluationNodes, inputMatrices,
-                                            learnableNodes, smoothedGradients,
-                                            /*out*/ epochCriterion, /*out*/ epochEvalErrors,
-                                            /*out*/ totalSamplesSeen, "AdaptiveLearnRateSearch:");
+    do
+    {
+        learnRatePerSample *= 0.618;
+        TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
+                                        numFramesToUseInSearch, trainSetDataReader,
+                                        learnRatePerSample, m_mbSize[epochNumber], featureNodes,
+                                        labelNodes, criterionNodes,
+                                        evaluationNodes, inputMatrices,
+                                        learnableNodes, smoothedGradients,
+                                        /*out*/ epochCriterion, /*out*/ epochEvalErrors,
+                                        /*out*/ totalSamplesSeen, "AdaptiveLearnRateSearch:");
 
-        } while (std::isnan(epochCriterion) || (epochCriterion > baseCriterion && learnRatePerSample > minLearnRate));
+    } while (std::isnan(epochCriterion) || (epochCriterion > baseCriterion && learnRatePerSample > minLearnRate));
 
-        bestLearnRatePerSample = learnRatePerSample;
+    bestLearnRatePerSample = learnRatePerSample;
 
-        //grid search for the first m_numBestSearchEpoch  epochs
-        if (epochNumber < m_numBestSearchEpoch)
-        {
-            double leftLearnRatePerSample = 0.01 / m_mbSize[epochNumber];
-            double rightLearnRatePerSample = learnRatePerSample;
-            double leftCriterion, rightCriterion = epochCriterion;
+    // grid search for the first m_numBestSearchEpoch epochs
+    if (epochNumber < m_numBestSearchEpoch)
+    {
+        double leftLearnRatePerSample = 0.01 / m_mbSize[epochNumber];
+        double rightLearnRatePerSample = learnRatePerSample;
+        double leftCriterion, rightCriterion = epochCriterion;
 
-            TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
-                                            numFramesToUseInSearch, trainSetDataReader,
-                                            leftLearnRatePerSample, m_mbSize[epochNumber],
-                                            featureNodes, labelNodes,
-                                            criterionNodes, evaluationNodes,
-                                            inputMatrices, learnableNodes,
-                                            smoothedGradients, /*out*/ leftCriterion,
-                                            /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen,
-                                            "DetailBaseAdaptiveLearnRateSearch:");
+        TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
+                                        numFramesToUseInSearch, trainSetDataReader,
+                                        leftLearnRatePerSample, m_mbSize[epochNumber],
+                                        featureNodes, labelNodes,
+                                        criterionNodes, evaluationNodes,
+                                        inputMatrices, learnableNodes,
+                                        smoothedGradients, /*out*/ leftCriterion,
+                                        /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen,
+                                        "DetailBaseAdaptiveLearnRateSearch:");
 
-            while (rightLearnRatePerSample > leftLearnRatePerSample * 1.2)
+        while (rightLearnRatePerSample > leftLearnRatePerSample * 1.2)
+        {
+            if (rightCriterion > leftCriterion)
             {
-                if (rightCriterion > leftCriterion)
-                {
-                    rightLearnRatePerSample *= 0.618;
-
-                    TrainOneMiniEpochAndReloadModel(net, refNet, refNode,
-                                                    epochNumber, numFramesToUseInSearch,
-                                                    trainSetDataReader,
-                                                    rightLearnRatePerSample, m_mbSize[epochNumber],
-                                                    featureNodes, labelNodes,
-                                                    criterionNodes,
-                                                    evaluationNodes,
-                                                    inputMatrices,
-                                                    learnableNodes,
-                                                    smoothedGradients,
-                                                    /*out*/ rightCriterion,
-                                                    /*out*/ epochEvalErrors,
-                                                    /*out*/ totalSamplesSeen,
-                                                    "DetailRightAdaptiveLearnRateSearch:");
-                }
-                else
-                {
-                    leftLearnRatePerSample /= 0.618;
-
-                    TrainOneMiniEpochAndReloadModel(net, refNet, refNode,
-                                                    epochNumber, numFramesToUseInSearch,
-                                                    trainSetDataReader,
-                                                    leftLearnRatePerSample, m_mbSize[epochNumber],
-                                                    featureNodes, labelNodes,
-                                                    criterionNodes,
-                                                    evaluationNodes,
-                                                    inputMatrices,
-                                                    learnableNodes,
-                                                    smoothedGradients,
-                                                    /*out*/ leftCriterion,
-                                                    /*out*/ epochEvalErrors,
-                                                    /*out*/ totalSamplesSeen,
-                                                    "DetailLeftAdaptiveLearnRateSearch:");
-                }
+                rightLearnRatePerSample *= 0.618;
+
+                TrainOneMiniEpochAndReloadModel(net, refNet, refNode,
+                                                epochNumber, numFramesToUseInSearch,
+                                                trainSetDataReader,
+                                                rightLearnRatePerSample, m_mbSize[epochNumber],
+                                                featureNodes, labelNodes,
+                                                criterionNodes,
+                                                evaluationNodes,
+                                                inputMatrices,
+                                                learnableNodes,
+                                                smoothedGradients,
+                                                /*out*/ rightCriterion,
+                                                /*out*/ epochEvalErrors,
+                                                /*out*/ totalSamplesSeen,
+                                                "DetailRightAdaptiveLearnRateSearch:");
             }
+            else
+            {
+                leftLearnRatePerSample /= 0.618;
+
+                TrainOneMiniEpochAndReloadModel(net, refNet, refNode,
+                                                epochNumber, numFramesToUseInSearch,
+                                                trainSetDataReader,
+                                                leftLearnRatePerSample, m_mbSize[epochNumber],
+                                                featureNodes, labelNodes,
+                                                criterionNodes,
+                                                evaluationNodes,
+                                                inputMatrices,
+                                                learnableNodes,
+                                                smoothedGradients,
+                                                /*out*/ leftCriterion,
+                                                /*out*/ epochEvalErrors,
+                                                /*out*/ totalSamplesSeen,
+                                                "DetailLeftAdaptiveLearnRateSearch:");
+            }
+        }
 
         bestLearnRatePerSample = (leftCriterion < rightCriterion) ? leftLearnRatePerSample : rightLearnRatePerSample;
-        }
+    }
 
-        fprintf(stderr, "Best Learn Rate Per Sample for Epoch[%d] = %.10g  baseCriterion=%.10g\n",
-                epochNumber + 1, bestLearnRatePerSample, baseCriterion);
+    fprintf(stderr, "Best Learn Rate Per Sample for Epoch[%d] = %.10g  baseCriterion=%.10g\n",
+            epochNumber + 1, bestLearnRatePerSample, baseCriterion);
 
-        return bestLearnRatePerSample;
-    }
+    return bestLearnRatePerSample;
+}
 
 template <class ElemType>
-    void SGD<ElemType>::TrainOneMiniEpochAndReloadModel(ComputationNetworkPtr net,
-                                                        ComputationNetworkPtr refNet,
-                                                        const ComputationNodeBasePtr& refNode, const int epochNumber,
-                                                        const size_t epochSize, IDataReader<ElemType>* trainSetDataReader,
-                                                        const double learnRatePerSample,
-                                                        const size_t minibatchSize,
+void SGD<ElemType>::TrainOneMiniEpochAndReloadModel(ComputationNetworkPtr net,
+                                                    ComputationNetworkPtr refNet,
+                                                    const ComputationNodeBasePtr& refNode, const int epochNumber,
+                                                    const size_t epochSize, IDataReader<ElemType>* trainSetDataReader,
+                                                    const double learnRatePerSample,
+                                                    const size_t minibatchSize,
                                                     const std::vector<ComputationNodeBasePtr>& featureNodes,
                                                     const std::vector<ComputationNodeBasePtr>& labelNodes,
                                                     const std::vector<ComputationNodeBasePtr>& criterionNodes,
                                                     const std::vector<ComputationNodeBasePtr>& evaluationNodes,
-                                                        std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
+                                                    std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
                                                     const std::list<ComputationNodeBasePtr>& learnableNodes,
-                                                        std::list<Matrix<ElemType>>& smoothedGradients,
-                                                        /*out*/ double& epochCriterion,
-                                                        /*out*/ std::vector<double>& epochEvalErrors,
-                                                        /*out*/ size_t& totalSamplesSeen,
-                                                        std::string prefixMsg)
-    {
-        TrainOneEpoch(net, refNet, refNode, epochNumber, epochSize,
-                      trainSetDataReader, learnRatePerSample, minibatchSize, featureNodes,
-                      labelNodes, criterionNodes, evaluationNodes,
-                      inputMatrices, learnableNodes, smoothedGradients,
-                      /*out*/ epochCriterion, /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen,
-                      prefixMsg);
-
-        fprintf(stderr, "Finished Mini-Epoch For LearnRate Selection: TrainLossPerSample = %.8g;", epochCriterion);
-
-        if (epochEvalErrors.size() == 1)
-            fprintf(stderr, "EvalErrPerSample = %.8g; AvgLearningRatePerSample = %.8g\n", epochEvalErrors[0], learnRatePerSample);
-        else
+                                                    std::list<Matrix<ElemType>>& smoothedGradients,
+                                                    /*out*/ double& epochCriterion,
+                                                    /*out*/ std::vector<double>& epochEvalErrors,
+                                                    /*out*/ size_t& totalSamplesSeen,
+                                                    std::string prefixMsg)
+{
+    TrainOneEpoch(net, refNet, refNode, epochNumber, epochSize,
+                  trainSetDataReader, learnRatePerSample, minibatchSize, featureNodes,
+                  labelNodes, criterionNodes, evaluationNodes,
+                  inputMatrices, learnableNodes, smoothedGradients,
+                  /*out*/ epochCriterion, /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen,
+                  prefixMsg);
+
+    fprintf(stderr, "Finished Mini-Epoch For LearnRate Selection: TrainLossPerSample = %.8g;", epochCriterion);
+
+    if (epochEvalErrors.size() == 1)
+        fprintf(stderr, "EvalErrPerSample = %.8g; AvgLearningRatePerSample = %.8g\n", epochEvalErrors[0], learnRatePerSample);
+    else
+    {
+        fprintf(stderr, "EvalErrPerSample ");
+        for (size_t i = 0; i < epochEvalErrors.size(); i++)
         {
-            fprintf(stderr, "EvalErrPerSample ");
-            for (size_t i = 0; i < epochEvalErrors.size(); i++)
-            {
-                fprintf(stderr, "[%lu] = %.8g; ", i, epochEvalErrors[i]);
-            }
-            fprintf(stderr, "AvgLearningRatePerSample = %.8g\n", learnRatePerSample);
+            fprintf(stderr, "[%lu] = %.8g; ", i, epochEvalErrors[i]);
         }
-
-        int baseModelEpoch = epochNumber - 1;
-        net->RereadPersistableParameters<ElemType>(GetModelNameForEpoch(baseModelEpoch));
-
-        double dummyLearnRate;
-        double dummtPrevCriterion;
-        size_t dummyMinibatchSize = 0;
-        LoadCheckPointInfo(baseModelEpoch,
-                           /*out*/ totalSamplesSeen,
-                           /*out*/ dummyLearnRate,
-                           smoothedGradients,
-                           /*out*/ dummtPrevCriterion,
-                           /*out*/ dummyMinibatchSize);
+        fprintf(stderr, "AvgLearningRatePerSample = %.8g\n", learnRatePerSample);
     }
 
-    // AdaptiveMinibatchSizing() -- choose the largest feasible minibatch size
-    // This is necessary for data-parallel operation. The aim is to minimize model updates, and hence bandwidth
-    // This implements
-    //    F. Seide, H. Fu, J. Droppo, G. Li, and D. Yu:
-    //    "On Parallelizability of Stochastic Gradient Descent for Speech DNNs"
-    //    In Proc. ICASSP 2014.
+    int baseModelEpoch = epochNumber - 1;
+    net->RereadPersistableParameters<ElemType>(GetModelNameForEpoch(baseModelEpoch));
+
+    double dummyLearnRate;
+    double dummtPrevCriterion;
+    size_t dummyMinibatchSize = 0;
+    LoadCheckPointInfo(baseModelEpoch,
+                       /*out*/ totalSamplesSeen,
+                       /*out*/ dummyLearnRate,
+                       smoothedGradients,
+                       /*out*/ dummtPrevCriterion,
+                       /*out*/ dummyMinibatchSize);
+}
+
+// AdaptiveMinibatchSizing() -- choose the largest feasible minibatch size
+// This is necessary for data-parallel operation. The aim is to minimize model updates, and hence bandwidth
+// This implements
+//    F. Seide, H. Fu, J. Droppo, G. Li, and D. Yu:
+//    "On Parallelizability of Stochastic Gradient Descent for Speech DNNs"
+//    In Proc. ICASSP 2014.
 template <class ElemType>
-    size_t SGD<ElemType>::AdaptiveMinibatchSizing(ComputationNetworkPtr net,
-                                                  ComputationNetworkPtr refNet,
-                                                  const ComputationNodeBasePtr& refNode,
-                                                  const int epochNumber,
-                                                  const size_t numFramesToUseInSearch,
-                                                  IDataReader<ElemType>* trainSetDataReader,
-                                                  const double learnRatePerSample,
-                                                  const size_t initialMinibatchSize,
+size_t SGD<ElemType>::AdaptiveMinibatchSizing(ComputationNetworkPtr net,
+                                              ComputationNetworkPtr refNet,
+                                              const ComputationNodeBasePtr& refNode,
+                                              const int epochNumber,
+                                              const size_t numFramesToUseInSearch,
+                                              IDataReader<ElemType>* trainSetDataReader,
+                                              const double learnRatePerSample,
+                                              const size_t initialMinibatchSize,
                                               const std::vector<ComputationNodeBasePtr>& featureNodes,
                                               const std::vector<ComputationNodeBasePtr>& labelNodes,
                                               const std::vector<ComputationNodeBasePtr>& criterionNodes,
                                               const std::vector<ComputationNodeBasePtr>& evaluationNodes,
-                                                  std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
+                                              std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
                                               const std::list<ComputationNodeBasePtr>& learnableNodes,
-                                                  std::list<Matrix<ElemType>>& smoothedGradients,
-                                                  const double learningRateAdjustmentFactor)
-    {
-        size_t minMinibatchSize = initialMinibatchSize;
-        size_t chosenMinibatchSize = initialMinibatchSize;
+                                              std::list<Matrix<ElemType>>& smoothedGradients,
+                                              const double learningRateAdjustmentFactor)
+{
+    size_t minMinibatchSize = initialMinibatchSize;
+    size_t chosenMinibatchSize = initialMinibatchSize;
 
-        // do some pre-adjustment based on LR
-        // Basically we assume that the LR for epoch 1 is safe for mbsize.
-        // If LR control led to a smaller LR, then we can safely increase the lower bound of the MB size.
+    // do some pre-adjustment based on LR
+    // Basically we assume that the LR for epoch 1 is safe for mbsize.
+    // If LR control led to a smaller LR, then we can safely increase the lower bound of the MB size.
     double learningRateChangeSoFar = GetLearningRatePerSample(epochNumber /*BUGBUG workaround:*/, trainSetDataReader->GetNumParallelSequences()) / GetLearningRatePerSample(0 /*BUGBUG workaround:*/, trainSetDataReader->GetNumParallelSequences());
-        learningRateChangeSoFar *= learningRateAdjustmentFactor;
+    learningRateChangeSoFar *= learningRateAdjustmentFactor;
 
-        // increasing by the full factor is found to be too aggressive; sqrt() seems more robust
-        learningRateChangeSoFar = sqrt(learningRateChangeSoFar);
+    // increasing by the full factor is found to be too aggressive; sqrt() seems more robust
+    learningRateChangeSoFar = sqrt(learningRateChangeSoFar);
 
-        // LR was indeed reduced
-        if (learningRateChangeSoFar < 1.0f)
-        {
-            // we can safely increase MB size (note: this may be bigger than our max)
-            minMinibatchSize = (size_t)(minMinibatchSize / learningRateChangeSoFar);
-        }
+    // LR was indeed reduced
+    if (learningRateChangeSoFar < 1.0f)
+    {
+        // we can safely increase MB size (note: this may be bigger than our max)
+        minMinibatchSize = (size_t)(minMinibatchSize / learningRateChangeSoFar);
+    }
 
-        if (epochNumber < 2 && m_prevChosenMinibatchSize != 0)
-        {
-            // newly started training: any previous MB size stored in the model is to be ignored
-            fprintf(stderr, "before epoch .2, previous minibatchSize %zd is "
+    if (epochNumber < 2 && m_prevChosenMinibatchSize != 0)
+    {
+        // newly started training: any previous MB size stored in the model is to be ignored
+        fprintf(stderr, "before epoch .2, previous minibatchSize %zd is "
                         "considered invalid -> resetting\n",
                 m_prevChosenMinibatchSize);
-            m_prevChosenMinibatchSize = 0;
-        }
+        m_prevChosenMinibatchSize = 0;
+    }
 
-        // check if we need to skip
-        if (m_prevChosenMinibatchSize != 0 &&
-            (epochNumber + 1) > m_minibatchSizeTuningFrequency &&
-            (epochNumber + 1) % m_minibatchSizeTuningFrequency != 0)
+    // check if we need to skip
+    if (m_prevChosenMinibatchSize != 0 &&
+        (epochNumber + 1) > m_minibatchSizeTuningFrequency &&
+        (epochNumber + 1) % m_minibatchSizeTuningFrequency != 0)
+    {
+        fprintf(stderr, "AdaptiveMinibatchSearch: Search for a better minibatchSize "
+                        "in epoch %d skipped, keeping minibatchSize of %zd\n",
+                epochNumber + 1, m_prevChosenMinibatchSize);
+        chosenMinibatchSize = m_prevChosenMinibatchSize;
+    }
+    else
+    {
+        if (m_prevChosenMinibatchSize != 0)
         {
-            fprintf(stderr, "AdaptiveMinibatchSearch: Search for a better minibatchSize "
-                    "in epoch %d skipped, keeping minibatchSize of %zd\n",
-                    epochNumber + 1, m_prevChosenMinibatchSize);
-            chosenMinibatchSize = m_prevChosenMinibatchSize;
+            // if m_prevChosenMinibatchSize (the chosen minibatch size for the previous epoch) div 2
+            // is higher than initialMinibatchSize (the minibatch size we start with for this epoch),
+            // then start the search with m_prevChosenMinibatchSize/2 instead of initialMinibatchSize.
+            fprintf(stderr, "AdaptiveMinibatchSearch: Limiting minMinibatchSize to "
+                            "largest of previous minibatchSize = (%d / 2) or %d\n",
+                    (int) m_prevChosenMinibatchSize, (int) minMinibatchSize);
+            minMinibatchSize = max(minMinibatchSize, m_prevChosenMinibatchSize / 2);
         }
-        else
-        {
-            if (m_prevChosenMinibatchSize != 0)
-            {
-                // if m_prevChosenMinibatchSize (the chosen minibatch size for the previous epoch) div 2
-                // is higher than initialMinibatchSize (the minibatch size we start with for this epoch),
-                // then start the search with m_prevChosenMinibatchSize/2 instead of initialMinibatchSize.
-                fprintf(stderr, "AdaptiveMinibatchSearch: Limiting minMinibatchSize to "
-                        "largest of previous minibatchSize = (%d / 2) or %d\n",
-                        (int) m_prevChosenMinibatchSize, (int) minMinibatchSize);
-                minMinibatchSize = max(minMinibatchSize, m_prevChosenMinibatchSize / 2);
-            }
 
-            size_t maxMinibatchSize = m_minibatchSizeTuningMax;
+        size_t maxMinibatchSize = m_minibatchSizeTuningMax;
 
-            // only grow at most 2 x compared to previous step
-            if (m_prevChosenMinibatchSize != 0.0f)
-            {
-                assert(m_prevChosenMinibatchSize >= chosenMinibatchSize);
+        // only grow at most 2 x compared to previous step
+        if (m_prevChosenMinibatchSize != 0.0f)
+        {
+            assert(m_prevChosenMinibatchSize >= chosenMinibatchSize);
 
-                fprintf(stderr, "AdaptiveMinibatchSearch: Limiting maxMinibatchSize to "
+            fprintf(stderr, "AdaptiveMinibatchSearch: Limiting maxMinibatchSize to "
                             "previous minibatchSize %zd*2\n",
                     m_prevChosenMinibatchSize);
-                maxMinibatchSize = min(maxMinibatchSize, m_prevChosenMinibatchSize * 2);
-            }
-
-            chosenMinibatchSize = SearchForBestMinibatchSize(net, refNet, refNode, epochNumber,
-                                                             numFramesToUseInSearch, trainSetDataReader,
-                                                             learnRatePerSample, featureNodes,
-                                                             labelNodes, criterionNodes,
-                                                             evaluationNodes, inputMatrices,
-                                                             learnableNodes, smoothedGradients,
-                                                             minMinibatchSize, maxMinibatchSize);
+            maxMinibatchSize = min(maxMinibatchSize, m_prevChosenMinibatchSize * 2);
         }
 
-        return chosenMinibatchSize;
+        chosenMinibatchSize = SearchForBestMinibatchSize(net, refNet, refNode, epochNumber,
+                                                         numFramesToUseInSearch, trainSetDataReader,
+                                                         learnRatePerSample, featureNodes,
+                                                         labelNodes, criterionNodes,
+                                                         evaluationNodes, inputMatrices,
+                                                         learnableNodes, smoothedGradients,
+                                                         minMinibatchSize, maxMinibatchSize);
     }
 
-    static size_t RoundToMultipleOf64(float val)
-    {
-        return 64 * (size_t)((val + 32) / 64);
-    }
+    return chosenMinibatchSize;
+}
 
-    static size_t RoundToMultipleOf64(size_t val)
-    {
-        return 64 * ((val + 32) / 64);
-    }
+static size_t RoundToMultipleOf64(float val)
+{
+    return 64 * (size_t)((val + 32) / 64);
+}
 
-    // uses a small percentage of training data of minibatch to
-    // speculatively train with various MB sizes; then picks the best
+static size_t RoundToMultipleOf64(size_t val)
+{
+    return 64 * ((val + 32) / 64);
+}
+
+// Uses a small portion of the training data to speculatively train with increasing
+// minibatch sizes; returns the last size tried before the criterion degraded past the margin.
 template <class ElemType>
-    size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
-                                                     ComputationNetworkPtr refNet,
-                                                     const ComputationNodeBasePtr& refNode,
-                                                     const int epochNumber,
-                                                     const size_t numFramesToUseInSearch,
-                                                     IDataReader<ElemType>* trainSetDataReader,
-                                                     const double learnRatePerSample,
+size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
+                                                 ComputationNetworkPtr refNet,
+                                                 const ComputationNodeBasePtr& refNode,
+                                                 const int epochNumber,
+                                                 const size_t numFramesToUseInSearch,
+                                                 IDataReader<ElemType>* trainSetDataReader,
+                                                 const double learnRatePerSample,
                                                  const std::vector<ComputationNodeBasePtr>& featureNodes,
                                                  const std::vector<ComputationNodeBasePtr>& labelNodes,
                                                  const std::vector<ComputationNodeBasePtr>& criterionNodes,
                                                  const std::vector<ComputationNodeBasePtr>& evaluationNodes,
-                                                     std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
+                                                 std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
                                                  const std::list<ComputationNodeBasePtr>& learnableNodes,
-                                                     std::list<Matrix<ElemType>>& smoothedGradients,
-                                                     const size_t minMinibatchSize, const size_t maxMinibatchSize)
+                                                 std::list<Matrix<ElemType>>& smoothedGradients,
+                                                 const size_t minMinibatchSize, const size_t maxMinibatchSize)
+{
+    // may happen for automatically reduced learning rates
+    if (minMinibatchSize > maxMinibatchSize)
     {
-        // may happen for automatically reduced learning rates
-        if (minMinibatchSize > maxMinibatchSize)
-        {
-            return maxMinibatchSize;
-        }
+        return maxMinibatchSize;
+    }
 
-        size_t trialMinibatchSize = 0;
-        bool isFirstIteration = true;
-        double baseCriterion = 0;
+    size_t trialMinibatchSize = 0;
+    bool isFirstIteration = true;
+    double baseCriterion = 0;
 
-        // increase the minibatch size by a factor of sqrt(2) in each step.
-        const float minibatchSizeTuningFactor = sqrtf(2.0f);
+    // increase the minibatch size by a factor of sqrt(2) in each step.
+    const float minibatchSizeTuningFactor = sqrtf(2.0f);
 
-        size_t lastTriedTrialMinibatchSize = 0;
-        double lastTriedTrialEpochCriterion = 0;
+    size_t lastTriedTrialMinibatchSize = 0;
+    double lastTriedTrialEpochCriterion = 0;
     for (float trialMinibatchSizeFloat = (float) minMinibatchSize;
-             trialMinibatchSizeFloat <= maxMinibatchSize;
-             trialMinibatchSizeFloat *= minibatchSizeTuningFactor)
-        {
-            // round mbsize to something meaningful
-            trialMinibatchSize = RoundToMultipleOf64(trialMinibatchSizeFloat);
-
-            fprintf(stderr, "\nAdaptiveMinibatchSearch: Evaluating trial minibatchSize=%zd out of range %zd..%zd ...\n\n",
-                    trialMinibatchSize, RoundToMultipleOf64(minMinibatchSize), RoundToMultipleOf64(maxMinibatchSize));
-
-            size_t totalSamplesSeen;
-            std::vector<double> epochEvalErrors(evaluationNodes.size(), std::numeric_limits<double>::infinity());
-            double epochCriterion = std::numeric_limits<double>::infinity();
-
-            // Train on a few minibatches and so we can observe the epochCriterion as we try increasing
-            // minibatches with iteration of this loop.
-            TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
-                                            numFramesToUseInSearch, trainSetDataReader,
-                                            learnRatePerSample, trialMinibatchSize, featureNodes,
-                                            labelNodes, criterionNodes,
-                                            evaluationNodes, inputMatrices,
-                                            learnableNodes, smoothedGradients,
-                                            /*out*/ epochCriterion, /*out*/ epochEvalErrors,
-                                            /*out*/ totalSamplesSeen,
+         trialMinibatchSizeFloat <= maxMinibatchSize;
+         trialMinibatchSizeFloat *= minibatchSizeTuningFactor)
+    {
+        // round mbsize to something meaningful
+        trialMinibatchSize = RoundToMultipleOf64(trialMinibatchSizeFloat);
+
+        fprintf(stderr, "\nAdaptiveMinibatchSearch: Evaluating trial minibatchSize=%zd out of range %zd..%zd ...\n\n",
+                trialMinibatchSize, RoundToMultipleOf64(minMinibatchSize), RoundToMultipleOf64(maxMinibatchSize));
+
+        size_t totalSamplesSeen;
+        std::vector<double> epochEvalErrors(evaluationNodes.size(), std::numeric_limits<double>::infinity());
+        double epochCriterion = std::numeric_limits<double>::infinity();
+
+        // Train on a few minibatches so that we can observe the epochCriterion as we try
+        // increasing minibatch sizes with each iteration of this loop.
+        TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
+                                        numFramesToUseInSearch, trainSetDataReader,
+                                        learnRatePerSample, trialMinibatchSize, featureNodes,
+                                        labelNodes, criterionNodes,
+                                        evaluationNodes, inputMatrices,
+                                        learnableNodes, smoothedGradients,
+                                        /*out*/ epochCriterion, /*out*/ epochEvalErrors,
+                                        /*out*/ totalSamplesSeen,
                                         isFirstIteration ? "BaseAdaptiveMinibatchSearch:" : "AdaptiveMinibatchSearch:");
 
-            if (isFirstIteration)
-            {
-                // for the first iteration of the loop only, set baseCriterion
-                // to the result we got from TrainOneMiniEpochAndReloadModel().
-                baseCriterion = epochCriterion;
-                lastTriedTrialMinibatchSize = trialMinibatchSize;
-                lastTriedTrialEpochCriterion = baseCriterion;
-                isFirstIteration = false;
-
-                fprintf(stderr, "AdaptiveMinibatchSearch: Computed BaseCriterion %.10g\n", baseCriterion);
-            }
-            else if (!std::isnan(epochCriterion) &&
+        if (isFirstIteration)
+        {
+            // for the first iteration of the loop only, set baseCriterion
+            // to the result we got from TrainOneMiniEpochAndReloadModel().
+            baseCriterion = epochCriterion;
+            lastTriedTrialMinibatchSize = trialMinibatchSize;
+            lastTriedTrialEpochCriterion = baseCriterion;
+            isFirstIteration = false;
+
+            fprintf(stderr, "AdaptiveMinibatchSearch: Computed BaseCriterion %.10g\n", baseCriterion);
+        }
+        else if (!std::isnan(epochCriterion) &&
                  (epochCriterion > (baseCriterion * (1.0 + (m_minibatchSearchCriterionErrorMargin / 100.0)))))
+        {
+            // As soon as we see the Criterion (a measure of error) start to get larger than the
+            // Criterion we started with, we stop.
+            // TODO: if this is too sensitive, we can add a margin on the basis of a percentage of
+            // baseCriterion.
+            break;
+        }
+        else
+        {
+            lastTriedTrialMinibatchSize = trialMinibatchSize;
+            lastTriedTrialEpochCriterion = epochCriterion;
+            if (trialMinibatchSizeFloat * minibatchSizeTuningFactor <= maxMinibatchSize)
             {
-                // As soon as we see the Criterion (a measure of error) start to get larger than the
-                // Criterion we started with, we stop.
-                // TODO: if this is too sensitive, we can add a margin on the bases of percentage of
-                // baseCriterion.
-                break;
-            }
-            else
-            {
-                lastTriedTrialMinibatchSize = trialMinibatchSize;
-                lastTriedTrialEpochCriterion = epochCriterion;
-                if (trialMinibatchSizeFloat * minibatchSizeTuningFactor <= maxMinibatchSize)
-                {
-                   fprintf(stderr, "AdaptiveMinibatchSearch: Keep searching... "
-                           "EpochCriterion = %.10g vs BaseCriterion = %.10g\n",
-                           epochCriterion, baseCriterion);
-                }
+                fprintf(stderr, "AdaptiveMinibatchSearch: Keep searching... "
+                                "EpochCriterion = %.10g vs BaseCriterion = %.10g\n",
+                        epochCriterion, baseCriterion);
             }
         }
-        fprintf(stderr, "AdaptiveMinibatchSearch: Search successful!!! Chose new minibatchSize of %d. "
-                "EpochCriterion = %.10g vs BaseCriterion = %.10g\n\n",
-                (int) lastTriedTrialMinibatchSize, lastTriedTrialEpochCriterion, baseCriterion);
-
-        return lastTriedTrialMinibatchSize;
     }
+    fprintf(stderr, "AdaptiveMinibatchSearch: Search successful!!! Chose new minibatchSize of %d. "
+                    "EpochCriterion = %.10g vs BaseCriterion = %.10g\n\n",
+            (int) lastTriedTrialMinibatchSize, lastTriedTrialEpochCriterion, baseCriterion);
 
-    // Attemps to compute the error signal for the whole utterance, which will
-    // be fed to the neural network as features. Currently it is a workaround
-    // for the two-forward-pass sequence and ctc training, which allows
-    // processing more utterances at the same time. Only used in Kaldi2Reader.
-    // TODO: move the two-forward-pass support out of the reader.
+    return lastTriedTrialMinibatchSize;
+}
+
+// Attempts to compute the error signal for the whole utterance, which will
+// be fed to the neural network as features. Currently it is a workaround
+// for the two-forward-pass sequence and ctc training, which allows
+// processing more utterances at the same time. Only used in Kaldi2Reader.
+// TODO: move the two-forward-pass support out of the reader.
 template <class ElemType>
-    void SGD<ElemType>::AttemptUtteranceDerivativeFeatures(ComputationNetworkPtr net,
-                                                           IDataReader<ElemType>* trainSetDataReader,
+void SGD<ElemType>::AttemptUtteranceDerivativeFeatures(ComputationNetworkPtr net,
+                                                       IDataReader<ElemType>* trainSetDataReader,
                                                        const std::vector<ComputationNodeBasePtr>& featureNodes,
-                                                           std::map<std::wstring, Matrix<ElemType>*>* inputMatrices)
+                                                       std::map<std::wstring, Matrix<ElemType>*>* inputMatrices)
+{
+    assert(trainSetDataReader != NULL);
+    std::vector<std::vector<std::pair<wstring, size_t>>> uttInfo;
+    auto pMBLayout = make_shared<MBLayout>();
+    // TODO: use GetMinibatchIntoNetwork().
+    while (trainSetDataReader->GetMinibatchCopy(uttInfo, *inputMatrices, pMBLayout))
     {
-        assert(trainSetDataReader != NULL);
-        std::vector<std::vector<std::pair<wstring, size_t>>> uttInfo;
-        auto pMBLayout = make_shared<MBLayout>();
-        // TODO: use GetMinibatchIntoNetwork().
-        while (trainSetDataReader->GetMinibatchCopy(uttInfo, *inputMatrices, pMBLayout))
-        {
-            ComputationNetwork::BumpEvalTimeStamp(featureNodes);
+        ComputationNetwork::BumpEvalTimeStamp(featureNodes);
 
         auto& outputNodes = net->OutputNodes();
-            if (outputNodes.empty())
-                LogicError("no output node was found.");
+        if (outputNodes.empty())
+            LogicError("no output node was found.");
 
-            //net->SetActualMiniBatchSizeFromFeatures();
-            trainSetDataReader->CopyMBLayoutTo(net->GetMBLayoutPtr());
-            net->VerifyActualNumParallelSequences(trainSetDataReader->GetNumParallelSequences());
+        //net->SetActualMiniBatchSizeFromFeatures();
+        trainSetDataReader->CopyMBLayoutTo(net->GetMBLayoutPtr());
+        net->VerifyActualNumParallelSequences(trainSetDataReader->GetNumParallelSequences());
         net->ForwardProp(outputNodes[0]); // Only evaluate the first output
-            trainSetDataReader->SetNetOutput(uttInfo,
-                                             dynamic_pointer_cast<ComputationNode<ElemType>>(outputNodes[0])->Value(),
-                                             pMBLayout);
-        }
+        trainSetDataReader->SetNetOutput(uttInfo,
+                                         dynamic_pointer_cast<ComputationNode<ElemType>>(outputNodes[0])->Value(),
+                                         pMBLayout);
     }
+}
 
-    static string GeneratePaddedFloatOrExpFormat(int padSize, int precision, double value)
-    {
-        char format[16];
-        char buffer[512];
+static string GeneratePaddedFloatOrExpFormat(int padSize, int precision, double value)
+{
+    char format[16];
+    char buffer[512];
 
-        sprintf(format, "%%.%dg", precision);
-        sprintf(buffer, format, value);
+    sprintf(format, "%%.%dg", precision);
+    sprintf(buffer, format, value);
 
-        for (int i = 0; i < strlen(buffer); i++)
+    for (int i = 0; i < strlen(buffer); i++)
+    {
+        if (buffer[i] == 'e' || buffer[i] == 'E')
         {
-            if (buffer[i] == 'e' || buffer[i] == 'E')
-            {
-                sprintf(format, "%%%d.%de", padSize, precision);
-                return format;
-            }
+            sprintf(format, "%%%d.%de", padSize, precision);
+            return format;
         }
-        sprintf(format, "%%%d.%df", padSize, precision);
-        return format;
     }
+    sprintf(format, "%%%d.%df", padSize, precision);
+    return format;
+}
 
 template <class ElemType>
 int SGD<ElemType>::SGDTrace(FILE* __restrict __stream, const char* __restrict __format, ...)
+{
+    int result = 0;
+    if (m_traceLevel > 0)
     {
-        int result = 0;
-        if (m_traceLevel > 0)
-        {
-            va_list args;
-            va_start(args, __format);
-            result = vfprintf(__stream, __format, args);
-            va_end(args);
-        }
-        return result;
+        va_list args;
+        va_start(args, __format);
+        result = vfprintf(__stream, __format, args);
+        va_end(args);
     }
+    return result;
+}
 
 template <class ElemType>
-    void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int traceLevel)
+void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int traceLevel)
+{
+    if (m_parallelizationMethod == ParallelizationMethod::DataParallelSGD)
     {
-        if (m_parallelizationMethod == ParallelizationMethod::DataParallelSGD)
+        if (m_distGradAgg == nullptr)
         {
-            if (m_distGradAgg == nullptr)
-            {
 #ifdef QUANTIZED_GRADIENT_AGGREGATION
-                m_distGradAgg = new AllReduceDistGradAggregator<ElemType>(g_mpi, m_numGradientBits, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
+            m_distGradAgg = new AllReduceDistGradAggregator<ElemType>(g_mpi, m_numGradientBits, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
 #else
-                if (m_numGradientBits != (8 * sizeof(ElemType)))
-                {
-                    RuntimeError("Gradient quantization is unsupported in CNTK binaries built without quantized gradient aggregation support!");
-                }
+            if (m_numGradientBits != (8 * sizeof(ElemType)))
+            {
+                RuntimeError("Gradient quantization is unsupported in CNTK binaries built without quantized gradient aggregation support!");
+            }
 
-                m_distGradAgg = new SimpleDistGradAggregator<ElemType>(g_mpi, m_bufferedAsyncGradientAggregation, m_syncStatsTrace);
+            m_distGradAgg = new SimpleDistGradAggregator<ElemType>(g_mpi, m_bufferedAsyncGradientAggregation, m_syncStatsTrace);
 #endif // !QUANTIZED_GRADIENT_AGGREGATION
-            }
+        }
 
-            if (m_gradHeader == nullptr)
-            {
-                m_gradHeader = DistGradHeader::Create(numEvalNodes);
-            }
+        if (m_gradHeader == nullptr)
+        {
+            m_gradHeader = DistGradHeader::Create(numEvalNodes);
         }
     }
+}
 
 template <class ElemType>
-    bool SGD<ElemType>::ModelAveragingProcessing(size_t nSamplesSinceLastSync, const std::list<ComputationNodeBasePtr>& learnableNodes, size_t& nProcessedFrames,
-                                  float& SecondsSinceLastSyncFinished, float& SecondsSpentOnSync)
-    {
-        //////////////////////////////////////////////////////////////////////////
-        // the current strategy is that after each minibatch, we will sync between processors 
-        // to decide whether a sync need to be performed. This is definitely not optimal, 
-        // which we will fix it later. 
+bool SGD<ElemType>::ModelAveragingProcessing(size_t nSamplesSinceLastSync, const std::list<ComputationNodeBasePtr>& learnableNodes, size_t& nProcessedFrames,
+                                             float& SecondsSinceLastSyncFinished, float& SecondsSpentOnSync)
+{
+    //////////////////////////////////////////////////////////////////////////
+    // The current strategy is that after each minibatch, we communicate between processors
+    // to decide whether a sync needs to be performed. This is definitely not optimal,
+    // and we will fix it later.
 
-        // TODO: the way we handle timer is not very good 
-        //////////////////////////////////////////////////////////////////////////
+    // TODO: the way we handle timer is not very good
+    //////////////////////////////////////////////////////////////////////////
     static bool first = true;
-        static Timer MAtimer;
-        if (first)
-        {
-            MAtimer.Start(); 
-            first = false; 
-        }
-       
+    static Timer MAtimer;
+    if (first)
+    {
+        MAtimer.Start();
+        first = false;
+    }
+
     char bNeedToSync = (char) 0; // use char for bool
-        if (g_mpi->IsMainNode() && nSamplesSinceLastSync >= m_nFramesBetweenMASync)
-        {
-            // only the main node can decide whether a sync need to be performed 
+    if (g_mpi->IsMainNode() && nSamplesSinceLastSync >= m_nFramesBetweenMASync)
+    {
+        // only the main node can decide whether a sync needs to be performed
         bNeedToSync = (char) 1;
-        }
-        g_mpi->Bcast(&bNeedToSync, 1, g_mpi->MainNodeRank());
-        if (bNeedToSync)
-        {
-            MAtimer.Stop();
-            double elapsedsec = MAtimer.ElapsedSeconds(); 
-        SecondsSinceLastSyncFinished = first ? 0 : (float) elapsedsec;
-            MAtimer.Start();
-        nProcessedFrames = ModelAveragingSync((int) nSamplesSinceLastSync, learnableNodes);
-            MAtimer.Stop();
-        SecondsSpentOnSync = (float) MAtimer.ElapsedSeconds();
-            
-            MAtimer.Start();
-        }
-        else
-        {
-            nProcessedFrames = 0; 
-            return false;
-        }
-        return true; 
     }
+    g_mpi->Bcast(&bNeedToSync, 1, g_mpi->MainNodeRank());
+    if (bNeedToSync)
+    {
+        MAtimer.Stop();
+        double elapsedsec = MAtimer.ElapsedSeconds();
+        SecondsSinceLastSyncFinished = first ? 0 : (float) elapsedsec;
+        MAtimer.Start();
+        nProcessedFrames = ModelAveragingSync((int) nSamplesSinceLastSync, learnableNodes);
+        MAtimer.Stop();
+        SecondsSpentOnSync = (float) MAtimer.ElapsedSeconds();
+
+        MAtimer.Start();
+    }
+    else
+    {
+        nProcessedFrames = 0;
+        return false;
+    }
+    return true;
+}
 
 template <class ElemType>
-    size_t SGD<ElemType>::ModelAveragingSync(int nSamplesSinceLastSync, const std::list<ComputationNodeBasePtr>& learnableNodes)
+size_t SGD<ElemType>::ModelAveragingSync(int nSamplesSinceLastSync, const std::list<ComputationNodeBasePtr>& learnableNodes)
+{
+    if (g_mpi->NumNodesInUse() <= 1)
     {
-        if (g_mpi->NumNodesInUse() <= 1)
-        {
-            return nSamplesSinceLastSync; 
-        }
+        return nSamplesSinceLastSync;
+    }
 
-        //========================================
-        // Sec. 1 calculate factor
-        //========================================
-        float factor = 0; 
+    //========================================
+    // Sec. 1 calculate factor
+    //========================================
+    float factor = 0;
     int nTotalSamples = nSamplesSinceLastSync;
-        g_mpi->AllReduce(&nTotalSamples, 1);
-        if (nTotalSamples <= 0)
-        {
-            // prepare for overflow 
-            factor = 1.0f / g_mpi->NumNodesInUse(); 
-        }
-        else
-        {
-            factor = (nSamplesSinceLastSync + 0.0f) / nTotalSamples; 
-        }
-
-        //========================================
-        // Sec. 2 sync models based on factor 
-        // Note: this is suboptimal at the moment: 
-        //       we do the averaging for each node in a sequence manner, i.e., 
-        //          (node1) GPU->CPU->MPI_AllReduce -> (node2)GPU->CPU->MPI_AllReduce
-        //       we can improve it by using a pipeline 
-        //          (node1) GPU ->  CPU  ->  MPI_AllReduce
-        //          (node2)         GPU  ->  CPU            -> MPI_AllReduce
-        //          (node3)                  GPU            -> CPU              -> MPI_AllReduce
-        //========================================
-        for (auto iter = learnableNodes.begin(); iter != learnableNodes.end(); iter++)
-        {
-            ComputationNodeBasePtr pNode = *iter; 
-            if (!pNode->IsParameterUpdateRequired())
-                continue;
-
-            Matrix<ElemType>& mat = dynamic_pointer_cast<ComputationNode<ElemType>>(pNode)->Value();
-            // 1. normalize the weight matrix 
-            Matrix<ElemType>::Scale(factor, mat);
-            // 2. send weight matrix over MPI nodes; 
-            ElemType* px = mat.CopyToArray(); 
+    g_mpi->AllReduce(&nTotalSamples, 1);
+    if (nTotalSamples <= 0)
+    {
+        // guard against overflow of the summed sample count: weight all nodes equally
+        factor = 1.0f / g_mpi->NumNodesInUse();
+    }
+    else
+    {
+        factor = (nSamplesSinceLastSync + 0.0f) / nTotalSamples;
+    }
+
+    //========================================
+    // Sec. 2 sync models based on factor
+    // Note: this is suboptimal at the moment:
+    //       we do the averaging for each node sequentially, i.e.,
+    //          (node1) GPU->CPU->MPI_AllReduce -> (node2)GPU->CPU->MPI_AllReduce
+    //       we can improve it by using a pipeline
+    //          (node1) GPU ->  CPU  ->  MPI_AllReduce
+    //          (node2)         GPU  ->  CPU            -> MPI_AllReduce
+    //          (node3)                  GPU            -> CPU              -> MPI_AllReduce
+    //========================================
+    for (auto iter = learnableNodes.begin(); iter != learnableNodes.end(); iter++)
+    {
+        ComputationNodeBasePtr pNode = *iter;
+        if (!pNode->IsParameterUpdateRequired())
+            continue;
+
+        Matrix<ElemType>& mat = dynamic_pointer_cast<ComputationNode<ElemType>>(pNode)->Value();
+        // 1. normalize the weight matrix
+        Matrix<ElemType>::Scale(factor, mat);
+        // 2. send weight matrix over MPI nodes;
+        ElemType* px = mat.CopyToArray();
         size_t nx = mat.GetNumElements();
 
-            // 3. inplace sum 
-            g_mpi->AllReduce(px, nx);
-            mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), mat.GetDeviceId(), px);
-            // 4. clean up 
+        // 3. inplace sum
+        g_mpi->AllReduce(px, nx);
+        mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), mat.GetDeviceId(), px);
+        // 4. clean up
         delete[] px;
-        }
-
-        return nTotalSamples; 
     }
-    
+
+    return nTotalSamples;
+}
+
 // public:
-    // UpdateWeightsS - static version of UpdateWeights()
-    // not static since it wants to access protected methods on the SGD object
+// UpdateWeightsS - static version of UpdateWeights()
+// not static since it wants to access protected methods on the SGD object
 template <class ElemType>
-    /*static*/ void SGD<ElemType>::UpdateWeightsS(const SGD<ElemType>* sgd, Matrix<ElemType>& functionValues,
-                               Matrix<ElemType>& gradientValues,
-                               Matrix<ElemType>& smoothedGradient,
-                               const double learnRatePerSample,
-                               const double momentumPerSample,
-                               size_t actualMBSize,
-                               const double L2RegWeight,
-                               const double L1RegWeight,
-                               const bool needAveMultiplier, 
+/*static*/ void SGD<ElemType>::UpdateWeightsS(const SGD<ElemType>* sgd, Matrix<ElemType>& functionValues,
+                                              Matrix<ElemType>& gradientValues,
+                                              Matrix<ElemType>& smoothedGradient,
+                                              const double learnRatePerSample,
+                                              const double momentumPerSample,
+                                              size_t actualMBSize,
+                                              const double L2RegWeight,
+                                              const double L1RegWeight,
+                                              const bool needAveMultiplier,
                                               const bool useNesterovMomentum)
-    {
-        // we use simple linear (instead of log linear) scaling here
-        const double momentum = MomentumPerMB(momentumPerSample, actualMBSize);
+{
+    // we use simple linear (instead of log linear) scaling here
+    const double momentum = MomentumPerMB(momentumPerSample, actualMBSize);
 #if DUMPOUTPUT
-        fprintf(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
-                learnRatePerSample, momentum, actualMBSize);
-        fprintf(stderr, "sgd->GradUpdateType()=%d, sgd->GradientUpdateNoiseStd()=%0.8f\n",
-                sgd->GradUpdateType(), sgd->GradientUpdateNoiseStd());
-        gradientValues.Print("Gradient Input");
-        smoothedGradient.Print("Smoothed Gradient Input");
+    fprintf(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
+            learnRatePerSample, momentum, actualMBSize);
+    fprintf(stderr, "sgd->GradUpdateType()=%d, sgd->GradientUpdateNoiseStd()=%0.8f\n",
+            sgd->GradUpdateType(), sgd->GradientUpdateNoiseStd());
+    gradientValues.Print("Gradient Input");
+    smoothedGradient.Print("Smoothed Gradient Input");
 #endif
 
-        // make actualMBSize is a valid value
-        assert(actualMBSize > 0);
+    // make sure actualMBSize is a valid value
+    assert(actualMBSize > 0);
 
-        //clipping gradients to prevent outliers
-        sgd->ClipGradient(gradientValues, actualMBSize);
+    //clipping gradients to prevent outliers
+    sgd->ClipGradient(gradientValues, actualMBSize);
 
-        GradientsUpdateType adpType = sgd->GradUpdateType();
-        double noiseStd = sgd->GradientUpdateNoiseStd();
+    GradientsUpdateType adpType = sgd->GradUpdateType();
+    double noiseStd = sgd->GradientUpdateNoiseStd();
     Matrix<ElemType> sgdUpdateNoise((DEVICEID_TYPE) functionValues.GetDeviceId());
-        if (noiseStd > 0)
-        {
-            // get the gradient structure since gradient is sparse
-            sgdUpdateNoise.SetValue(gradientValues);
+    if (noiseStd > 0)
+    {
+        // get the gradient structure since gradient is sparse
+        sgdUpdateNoise.SetValue(gradientValues);
 
-            // reset its value to random
+        // reset its value to random
         sgdUpdateNoise.SetGaussianRandomValue(0, (ElemType) noiseStd);
-        }
+    }
 
-        // L2 regularizer
-        if (L2RegWeight > 0)
-        {
-            // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
-            Matrix<ElemType>::ScaleAndAdd((ElemType)(L2RegWeight * actualMBSize), functionValues, gradientValues);
-        }
+    // L2 regularizer
+    if (L2RegWeight > 0)
+    {
+        // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
+        Matrix<ElemType>::ScaleAndAdd((ElemType)(L2RegWeight * actualMBSize), functionValues, gradientValues);
+    }
 
-        if (adpType == GradientsUpdateType::None)
-        {
-            smoothedGradient.NormalGrad(gradientValues, functionValues,
+    if (adpType == GradientsUpdateType::None)
+    {
+        smoothedGradient.NormalGrad(gradientValues, functionValues,
                                     (ElemType) learnRatePerSample, (ElemType) momentum, useNesterovMomentum);
-        }
-        else if (adpType == GradientsUpdateType::AdaGrad ||
-                (adpType == GradientsUpdateType::RmsProp && gradientValues.GetMatrixType() == MatrixType::SPARSE) ||
-                (adpType == GradientsUpdateType::FSAdaGrad && gradientValues.GetMatrixType() == MatrixType::SPARSE))
-        {
-            //rmsprop for sparse is not implemented yet, delegate it with adagrad
+    }
+    else if (adpType == GradientsUpdateType::AdaGrad ||
+             (adpType == GradientsUpdateType::RmsProp && gradientValues.GetMatrixType() == MatrixType::SPARSE) ||
+             (adpType == GradientsUpdateType::FSAdaGrad && gradientValues.GetMatrixType() == MatrixType::SPARSE))
+    {
+        // RmsProp and FSAdaGrad are not implemented for sparse gradients yet; delegate to AdaGrad
 
-            double aveMultiplier = smoothedGradient.Adagrad(gradientValues, needAveMultiplier);
-            Matrix<ElemType>::ScaleAndAdd((ElemType)(-learnRatePerSample / aveMultiplier), gradientValues, functionValues);
-        }
-        else if (adpType == GradientsUpdateType::FSAdaGrad)
-        {
+        double aveMultiplier = smoothedGradient.Adagrad(gradientValues, needAveMultiplier);
+        Matrix<ElemType>::ScaleAndAdd((ElemType)(-learnRatePerSample / aveMultiplier), gradientValues, functionValues);
+    }
+    else if (adpType == GradientsUpdateType::FSAdaGrad)
+    {
         smoothedGradient.FSAdagrad(actualMBSize, gradientValues, functionValues, (ElemType) learnRatePerSample, (ElemType) momentum);
-        }
-        else if (adpType == GradientsUpdateType::RmsProp)
-        {
+    }
+    else if (adpType == GradientsUpdateType::RmsProp)
+    {
         double aveMultiplier = smoothedGradient.RmsProp(gradientValues, (ElemType) sgd->m_rpi.gamma,
                                                         (ElemType) sgd->m_rpi.inc, (ElemType) sgd->m_rpi.max,
                                                         (ElemType) sgd->m_rpi.dec, (ElemType) sgd->m_rpi.min, needAveMultiplier);
-            Matrix<ElemType>::ScaleAndAdd((ElemType)(-learnRatePerSample / aveMultiplier), gradientValues, functionValues);
-        }
+        Matrix<ElemType>::ScaleAndAdd((ElemType)(-learnRatePerSample / aveMultiplier), gradientValues, functionValues);
+    }
 
-        if (noiseStd > 0)
-        {
-            Matrix<ElemType>::ScaleAndAdd(1.0, sgdUpdateNoise, functionValues);
-        }
+    if (noiseStd > 0)
+    {
+        Matrix<ElemType>::ScaleAndAdd(1.0, sgdUpdateNoise, functionValues);
+    }
 
-        // L1 regularizer with proximal gradient descent method
-        if (L1RegWeight > 0)
-        {
-            // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
-            functionValues.InplaceSoftThreshold((ElemType)(learnRatePerSample * L1RegWeight * actualMBSize));
-        }
+    // L1 regularizer with proximal gradient descent method
+    if (L1RegWeight > 0)
+    {
+        // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
+        functionValues.InplaceSoftThreshold((ElemType)(learnRatePerSample * L1RegWeight * actualMBSize));
+    }
 
 #if DUMPOUTPUT
-        functionValues.Print("Parameter Update");
+    functionValues.Print("Parameter Update");
 #endif
-    }
+}
 
 // protected:
 
-    // UpdateWeights - update the weights in
+// UpdateWeights - update the weights in
 template <class ElemType>
-    void SGD<ElemType>::UpdateWeights(const ComputationNodeBasePtr& node,
-                       Matrix<ElemType>& smoothedGradient,
-                       const double learnRatePerSample,
-                       const double momentumPerSample,
-                       const size_t actualMBSize,
-                       const double L2RegWeight, const double L1RegWeight,
-                       const bool needAveMultiplier, 
+void SGD<ElemType>::UpdateWeights(const ComputationNodeBasePtr& node,
+                                  Matrix<ElemType>& smoothedGradient,
+                                  const double learnRatePerSample,
+                                  const double momentumPerSample,
+                                  const size_t actualMBSize,
+                                  const double L2RegWeight, const double L1RegWeight,
+                                  const bool needAveMultiplier,
                                   const bool useNesterovMomentum) const
-    {
+{
 #if DUMPOUTPUT
-        fprintf(stderr, "Update_%ls\n", node->NodeName().c_str());
+    fprintf(stderr, "Update_%ls\n", node->NodeName().c_str());
 #endif
-        if (!node->IsParameterUpdateRequired())
-            LogicError("UpdateWeights() called for a learnable ComputationNode which has m_parameterUpdateRequired == false!");
-
-        UpdateWeightsS(this, dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Value(), dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Gradient(),
-                       smoothedGradient, learnRatePerSample, momentumPerSample,
-                       actualMBSize, L2RegWeight, L1RegWeight,
-                       needAveMultiplier, m_useNesterovMomentum);
-        node->BumpEvalTimeStamp();
-    }
+    if (!node->IsParameterUpdateRequired())
+        LogicError("UpdateWeights() called for a learnable ComputationNode which has m_parameterUpdateRequired == false!");
+
+    UpdateWeightsS(this, dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Value(), dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Gradient(),
+                   smoothedGradient, learnRatePerSample, momentumPerSample,
+                   actualMBSize, L2RegWeight, L1RegWeight,
+                   needAveMultiplier, m_useNesterovMomentum);
+    node->BumpEvalTimeStamp();
+}
 
 template <class ElemType>
-    void SGD<ElemType>::ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const
+void SGD<ElemType>::ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const
+{
+    if (m_clippingThresholdPerSample != std::numeric_limits<double>::infinity())
     {
-        if (m_clippingThresholdPerSample != std::numeric_limits<double>::infinity())
+        double maxGradientPerMB = m_clippingThresholdPerSample * actualMBSize;
+        if (m_gradientClippingWithTruncation)
+            gradient.InplaceTruncate((ElemType)(maxGradientPerMB));
+        else
         {
-            double maxGradientPerMB = m_clippingThresholdPerSample * actualMBSize;
-            if (m_gradientClippingWithTruncation)
-                gradient.InplaceTruncate((ElemType)(maxGradientPerMB));
-            else
+            // norm2 normalized
+            double gradientNorm = gradient.FrobeniusNorm();
+            if (gradientNorm > maxGradientPerMB)
             {
-                // norm2 normalized
-                double gradientNorm = gradient.FrobeniusNorm();
-                if (gradientNorm > maxGradientPerMB)
-                {
-                    double normFactor = maxGradientPerMB / gradientNorm;
+                double normFactor = maxGradientPerMB / gradientNorm;
                 gradient *= (ElemType) normFactor;
-                }
             }
         }
     }
+}
 
 template <class ElemType>
-    void SGD<ElemType>::SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen,
-                            const double learnRatePerSample,
-                            const std::list<Matrix<ElemType>>& smoothedGradients,
-                            const double prevCriterion,
-                            const size_t minibatchSize)
-    {
-        // In case of parallel training only the main node should we saving the checkpoint to prevent
-        // the parallel training nodes from colliding to write the same file
-        if ((g_mpi == nullptr) || g_mpi->IsMainNode())
-        {
-            wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epoch));
-            // Saving into temporary file and then renaming it to the checkPointFileName
-            // This is a standard trick to avoid havign corrupted checkpoints files if process dies during writing
-            wstring tempFileName = checkPointFileName + L".tmp";
-
-            {
-                File fstream(tempFileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsWrite);
-                fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BCKP");
+void SGD<ElemType>::SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen,
+                                       const double learnRatePerSample,
+                                       const std::list<Matrix<ElemType>>& smoothedGradients,
+                                       const double prevCriterion,
+                                       const size_t minibatchSize)
+{
+    // In case of parallel training only the main node should we saving the checkpoint to prevent
+    // the parallel training nodes from colliding to write the same file
+    if ((g_mpi == nullptr) || g_mpi->IsMainNode())
+    {
+        wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epoch));
+        // Saving into temporary file and then renaming it to the checkPointFileName
+        // This is a standard trick to avoid havign corrupted checkpoints files if process dies during writing
+        wstring tempFileName = checkPointFileName + L".tmp";
 
-                fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate");
-                fstream << totalSamplesSeen << learnRatePerSample << prevCriterion;
-                fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ELearnRate");
+        {
+            File fstream(tempFileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsWrite);
+            fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BCKP");
 
-                fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize");
-                fstream << minibatchSize;
-                fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EMinibatchSize");
+            fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate");
+            fstream << totalSamplesSeen << learnRatePerSample << prevCriterion;
+            fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ELearnRate");
 
-                fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BGradient");
+            fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize");
+            fstream << minibatchSize;
+            fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EMinibatchSize");
 
-                for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
-                {
-                    const Matrix<ElemType>& smoothedGradient = *smoothedGradientIter;
-                    fstream << smoothedGradient;
-                }
+            fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BGradient");
 
-                fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EGradient");
+            for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
+            {
+                const Matrix<ElemType>& smoothedGradient = *smoothedGradientIter;
+                fstream << smoothedGradient;
+            }
 
-                fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ECKP");
+            fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EGradient");
 
-                // Ensuring that data is written
-                fstream.Flush();
-            }
+            fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ECKP");
 
-            renameOrDie(tempFileName, checkPointFileName);
+            // Ensuring that data is written
+            fstream.Flush();
         }
+
+        renameOrDie(tempFileName, checkPointFileName);
     }
+}
 
 template <class ElemType>
-    bool SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,
-                            /*out*/ size_t& totalSamplesSeen,
-                            /*out*/ double& learnRatePerSample,
-                            std::list<Matrix<ElemType>>& smoothedGradients,
-                            /*out*/ double& prevCriterion,
-                            /*out*/ size_t& minibatchSize)
+bool SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,
+                                       /*out*/ size_t& totalSamplesSeen,
+                                       /*out*/ double& learnRatePerSample,
+                                       std::list<Matrix<ElemType>>& smoothedGradients,
+                                       /*out*/ double& prevCriterion,
+                                       /*out*/ size_t& minibatchSize)
+{
+    wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epochNumber));
+    if (!fexists(checkPointFileName.c_str()))
     {
-        wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epochNumber));
-        if (!fexists(checkPointFileName.c_str()))
-        {
-            fprintf(stderr, "Warning: checkpoint file is missing. learning parameters will be initialized from 0\n");
-            return false;
-        }
+        fprintf(stderr, "Warning: checkpoint file is missing. learning parameters will be initialized from 0\n");
+        return false;
+    }
 
-        File fstream(checkPointFileName,
-                     FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead);
-        fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCKP");
+    File fstream(checkPointFileName,
+                 FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead);
+    fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCKP");
 
-        fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate");
-        fstream >> totalSamplesSeen >> learnRatePerSample >> prevCriterion;
-        fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ELearnRate");
+    fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate");
+    fstream >> totalSamplesSeen >> learnRatePerSample >> prevCriterion;
+    fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ELearnRate");
 
-        if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize"))
-        {
-            fstream >> minibatchSize;
-            fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EMinibatchSize");
-        }
-        else
-        {
-            minibatchSize = m_mbSize[epochNumber];
-        }
+    if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize"))
+    {
+        fstream >> minibatchSize;
+        fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EMinibatchSize");
+    }
+    else
+    {
+        minibatchSize = m_mbSize[epochNumber];
+    }
 
-        fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BGradient");
+    fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BGradient");
 
-        for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
-        {
-            Matrix<ElemType>& smoothedGradient = *smoothedGradientIter;
-            fstream >> smoothedGradient;
-        }
-        fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EGradient");
+    for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
+    {
+        Matrix<ElemType>& smoothedGradient = *smoothedGradientIter;
+        fstream >> smoothedGradient;
+    }
+    fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EGradient");
 
-        fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ECKP");
+    fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ECKP");
 
-        return true;
-    }
+    return true;
+}
 
 template <class ElemType>
-    wstring SGD<ElemType>::GetCheckPointFileNameForEpoch(const int epoch)
-    {
-        return GetModelNameForEpoch(epoch) + L".ckp";
-    }
+wstring SGD<ElemType>::GetCheckPointFileNameForEpoch(const int epoch)
+{
+    return GetModelNameForEpoch(epoch) + L".ckp";
+}
 
 template <class ElemType>
-    wstring SGD<ElemType>::GetModelNameForEpoch(const int epoch, bool bLastModel)
+wstring SGD<ElemType>::GetModelNameForEpoch(const int epoch, bool bLastModel)
+{
+    int epoch1Base = epoch + 1;
+    if (epoch1Base == m_maxEpochs || bLastModel)
+    {
+        return m_modelPath;
+    }
+    else
     {
-        int epoch1Base = epoch + 1;
-        if (epoch1Base == m_maxEpochs || bLastModel)
-        {
-            return m_modelPath;
-        }
-        else
-        {
         wstring w = msra::strfun::wstrprintf(L"%ls.%d", m_modelPath.c_str(), (int) epoch1Base);
-            return w;
-        }
+        return w;
     }
+}
 
-    // return -1 if nothing exists
+// return -1 if nothing exists
 template <class ElemType> // TODO: needed?
-    int SGD<ElemType>::DetermineStartEpoch(const bool makeMode)
+int SGD<ElemType>::DetermineStartEpoch(const bool makeMode)
+{
+    if (!makeMode)
     {
-        if (!makeMode)
-        {
-            // always start from scratch
-            return -1;
-        }
+        // always start from scratch
+        return -1;
+    }
 
-        int firstEpoch = -1;
+    int firstEpoch = -1;
 
-        wstring curEpochFile = GetModelNameForEpoch(int(m_maxEpochs) - 1);
-        for (int e = int(m_maxEpochs) - 1; e >= -1; e--)
-        {
-            const wstring prevEpochFile = GetModelNameForEpoch(e - 1);
+    wstring curEpochFile = GetModelNameForEpoch(int(m_maxEpochs) - 1);
+    for (int e = int(m_maxEpochs) - 1; e >= -1; e--)
+    {
+        const wstring prevEpochFile = GetModelNameForEpoch(e - 1);
 
-            if (msra::files::fuptodate(curEpochFile, prevEpochFile, false))
-            {
-                firstEpoch = e + 1;
-                break;
-            }
-            else
-            {
-                curEpochFile = prevEpochFile;
-            }
+        if (msra::files::fuptodate(curEpochFile, prevEpochFile, false))
+        {
+            firstEpoch = e + 1;
+            break;
+        }
+        else
+        {
+            curEpochFile = prevEpochFile;
         }
-        if (firstEpoch == m_maxEpochs)
-            fprintf(stderr, "Final model exists: %ls\n", GetModelNameForEpoch(firstEpoch - 1).c_str());
-
-        return firstEpoch;
     }
+    if (firstEpoch == m_maxEpochs)
+        fprintf(stderr, "Final model exists: %ls\n", GetModelNameForEpoch(firstEpoch - 1).c_str());
+
+    return firstEpoch;
+}
 
 #define EPSILON 1e-5
 
-    // this probes the automatic gradient computation with random inputs
+// this probes the automatic gradient computation with random inputs
 template <class ElemType>
-    bool SGD<ElemType>::GradientCheck(ComputationNetworkPtr net,
+bool SGD<ElemType>::GradientCheck(ComputationNetworkPtr net,
                                   const std::vector<ComputationNodeBasePtr>& criterionNodes,
                                   const std::list<ComputationNodeBasePtr>& learnableNodes,
-                                      int npos)
-    {
-        vector<string> errMsgs;
+                                  int npos)
+{
+    vector<string> errMsgs;
 
-        net->StartEvaluateMinibatchLoop(criterionNodes[npos]);
+    net->StartEvaluateMinibatchLoop(criterionNodes[npos]);
 
-        // gradient checking
-        for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
-        {
-            ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
-            char wstrtmp[2048];
+    // gradient checking
+    for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
+    {
+        ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
+        char wstrtmp[2048];
 
         for (size_t itry = 0; itry < min((size_t) 50, node->Value().GetNumElements()); itry++)
-            {
-                /// no support to sparse matrix yet
-                int irow = (int) fmod(rand(), node->Gradient().GetNumRows() - 1);
-                int icol = (int) fmod(rand(), node->Gradient().GetNumCols() - 1);
-                irow = max(0, irow);
-                icol = max(0, icol);
+        {
+            /// no support to sparse matrix yet
+            int irow = (int) fmod(rand(), node->Gradient().GetNumRows() - 1);
+            int icol = (int) fmod(rand(), node->Gradient().GetNumCols() - 1);
+            irow = max(0, irow);
+            icol = max(0, icol);
 
-                fprintf(stderr, "\n###### d%ls######\n", node->NodeName().c_str());
+            fprintf(stderr, "\n###### d%ls######\n", node->NodeName().c_str());
 
-                double eOrg = node->Value()(irow, icol);
-                node->Value().TransferToDeviceIfNotThere(net->GetDeviceId(), true);
+            double eOrg = node->Value()(irow, icol);
+            node->Value().TransferToDeviceIfNotThere(net->GetDeviceId(), true);
 
-                node->BumpEvalTimeStamp();
+            node->BumpEvalTimeStamp();
 
-                net->ForwardProp(criterionNodes[npos]);
-                net->Backprop(criterionNodes[npos]);
+            net->ForwardProp(criterionNodes[npos]);
+            net->Backprop(criterionNodes[npos]);
 
-                if (node->Gradient().GetMatrixType() == MatrixType::SPARSE)
-                {
-                    break;
-                }
+            if (node->Gradient().GetMatrixType() == MatrixType::SPARSE)
+            {
+                break;
+            }
 
-                //double mbEvalCri =
-                //criterionNode should be a scalar
-                // TODO: why is this value not used?
-                criterionNodes[npos]->Get00Element();
-                double eGradErr = node->Gradient()(irow, icol);
-                node->Gradient().TransferToDeviceIfNotThere(net->GetDeviceId(), true);
+            //double mbEvalCri =
+            //criterionNode should be a scalar
+            // TODO: why is this value not used?
+            criterionNodes[npos]->Get00Element();
+            double eGradErr = node->Gradient()(irow, icol);
+            node->Gradient().TransferToDeviceIfNotThere(net->GetDeviceId(), true);
 
-                double ePos = eOrg + EPSILON;
-                double eNeg = eOrg - EPSILON;
+            double ePos = eOrg + EPSILON;
+            double eNeg = eOrg - EPSILON;
 
             node->Value()(irow, icol) = (ElemType) ePos;
-                node->Value().TransferToDeviceIfNotThere(net->GetDeviceId(), true);
+            node->Value().TransferToDeviceIfNotThere(net->GetDeviceId(), true);
 
-                node->BumpEvalTimeStamp();
-                net->ForwardProp(criterionNodes[npos]);
-                //criterionNode should be a scalar
+            node->BumpEvalTimeStamp();
+            net->ForwardProp(criterionNodes[npos]);
+            //criterionNode should be a scalar
 
-                double mbEvalCriPos = criterionNodes[npos]->Get00Element(); // TODO: make Get00Element() a function of ComputationNodeBase
+            double mbEvalCriPos = criterionNodes[npos]->Get00Element(); // TODO: make Get00Element() a function of ComputationNodeBase
 
             node->Value()(irow, icol) = (ElemType) eNeg;
-                node->Value().TransferToDeviceIfNotThere(net->GetDeviceId(), true);
+            node->Value().TransferToDeviceIfNotThere(net->GetDeviceId(), true);
 
-                node->BumpEvalTimeStamp();
-                net->ForwardProp(criterionNodes[npos]);
+            node->BumpEvalTimeStamp();
+            net->ForwardProp(criterionNodes[npos]);
 
-                // criterionNode should be a scalar
-                double mbEvalCriNeg = criterionNodes[npos]->Get00Element();
+            // criterionNode should be a scalar
+            double mbEvalCriNeg = criterionNodes[npos]->Get00Element();
 
-                // back to its original parameter value
+            // back to its original parameter value
             node->Value()(irow, icol) = (ElemType) eOrg;
-                node->Value().TransferToDeviceIfNotThere(net->GetDeviceId(), true);
+            node->Value().TransferToDeviceIfNotThere(net->GetDeviceId(), true);
 
-                // check if they are consistent
-                double eGradNum = ((mbEvalCriPos - mbEvalCriNeg) / (ePos - eNeg));
-                double threshold = pow(10.0,
-                                       max(0.0,
-                                           ceil(log10(min(fabs(eGradErr),
+            // check if they are consistent
+            double eGradNum = ((mbEvalCriPos - mbEvalCriNeg) / (ePos - eNeg));
+            double threshold = pow(10.0,
+                                   max(0.0,
+                                       ceil(log10(min(fabs(eGradErr),
                                                       fabs(eGradNum))))) -
                                        (int) m_gradientCheckSigDigit);
-                double diff = fabs(eGradErr - eGradNum);
-                bool wrong = (std::isnan(diff) || diff > threshold);
-                if (wrong)
-                {
-                    fprintf(stderr, "\nd%ls Numeric gradient = %e, Error BP gradient = %e\n",
-                            node->NodeName().c_str(), eGradNum, eGradErr);
-                    sprintf(wstrtmp, "\nd%ls Numeric gradient = %e, Error BP gradient = %e\n",
-                            node->NodeName().c_str(), eGradNum, eGradErr);
-                    errMsgs.push_back(wstrtmp);
-                }
+            double diff = fabs(eGradErr - eGradNum);
+            bool wrong = (std::isnan(diff) || diff > threshold);
+            if (wrong)
+            {
+                fprintf(stderr, "\nd%ls Numeric gradient = %e, Error BP gradient = %e\n",
+                        node->NodeName().c_str(), eGradNum, eGradErr);
+                sprintf(wstrtmp, "\nd%ls Numeric gradient = %e, Error BP gradient = %e\n",
+                        node->NodeName().c_str(), eGradNum, eGradErr);
+                errMsgs.push_back(wstrtmp);
             }
         }
-
-        return errMsgs.size() == 0;
     }
 
-    template class SGD<float>;
-    template class SGD<double>;
+    return errMsgs.size() == 0;
+}
+
+template class SGD<float>;
+template class SGD<double>;
 
-    // =======================================================================
-    // class SGDParams
-    // =======================================================================
+// =======================================================================
+// class SGDParams
+// =======================================================================
 
 static AdaptationRegType ParseAdaptationRegType(const wstring& s)
-    {
-        if (!_wcsicmp(s.c_str(), L"") || !_wcsicmp(s.c_str(), L"none"))
-            return AdaptationRegType::None;
-        else if (!_wcsicmp(s.c_str(), L"kl") || !_wcsicmp(s.c_str(), L"klReg"))
-            return AdaptationRegType::KL;
-        else
-            InvalidArgument("ParseAdaptationRegType: Invalid Adaptation Regularization Type. Valid values are (none | kl)");
-        }
+{
+    if (!_wcsicmp(s.c_str(), L"") || !_wcsicmp(s.c_str(), L"none"))
+        return AdaptationRegType::None;
+    else if (!_wcsicmp(s.c_str(), L"kl") || !_wcsicmp(s.c_str(), L"klReg"))
+        return AdaptationRegType::KL;
+    else
+        InvalidArgument("ParseAdaptationRegType: Invalid Adaptation Regularization Type. Valid values are (none | kl)");
+}
 
 static GradientsUpdateType ParseGradUpdateType(const wstring& s)
-    {
-        if (!_wcsicmp(s.c_str(), L"") || !_wcsicmp(s.c_str(), L"none") || !_wcsicmp(s.c_str(), L"normal") || !_wcsicmp(s.c_str(), L"simple"))
-            return GradientsUpdateType::None;
-        else if (!_wcsicmp(s.c_str(), L"adagrad"))
-            return GradientsUpdateType::AdaGrad;
-        else if (!_wcsicmp(s.c_str(), L"rmsProp"))
-            return GradientsUpdateType::RmsProp;
-        else if (!_wcsicmp(s.c_str(), L"fsAdagrad"))
-            return GradientsUpdateType::FSAdaGrad;
-        else
-            InvalidArgument("ParseGradUpdateType: Invalid Gradient Updating Type. Valid values are (none | adagrad | rmsProp | fsAdagrad )");
-    }
+{
+    if (!_wcsicmp(s.c_str(), L"") || !_wcsicmp(s.c_str(), L"none") || !_wcsicmp(s.c_str(), L"normal") || !_wcsicmp(s.c_str(), L"simple"))
+        return GradientsUpdateType::None;
+    else if (!_wcsicmp(s.c_str(), L"adagrad"))
+        return GradientsUpdateType::AdaGrad;
+    else if (!_wcsicmp(s.c_str(), L"rmsProp"))
+        return GradientsUpdateType::RmsProp;
+    else if (!_wcsicmp(s.c_str(), L"fsAdagrad"))
+        return GradientsUpdateType::FSAdaGrad;
+    else
+        InvalidArgument("ParseGradUpdateType: Invalid Gradient Updating Type. Valid values are (none | adagrad | rmsProp | fsAdagrad )");
+}
 
 static ParallelizationMethod ParseParallelizationMethod(const wstring& s)
-    {
-        if (!_wcsicmp(s.c_str(), L"") || !_wcsicmp(s.c_str(), L"none"))
-            return ParallelizationMethod::None;
-        else if (!_wcsicmp(s.c_str(), L"DataParallelSGD"))
-            return ParallelizationMethod::DataParallelSGD;
-        else if (!_wcsicmp(s.c_str(), L"ModelAveragingSGD"))
-            return ParallelizationMethod::ModelAveragingSGD;
-        else
-            InvalidArgument("ParseParallelizationMethod: Invalid Parallelization Method. Valid values are (none | dataParallelSGD | modelAveragingSGD)");
-    }
+{
+    if (!_wcsicmp(s.c_str(), L"") || !_wcsicmp(s.c_str(), L"none"))
+        return ParallelizationMethod::None;
+    else if (!_wcsicmp(s.c_str(), L"DataParallelSGD"))
+        return ParallelizationMethod::DataParallelSGD;
+    else if (!_wcsicmp(s.c_str(), L"ModelAveragingSGD"))
+        return ParallelizationMethod::ModelAveragingSGD;
+    else
+        InvalidArgument("ParseParallelizationMethod: Invalid Parallelization Method. Valid values are (none | dataParallelSGD | modelAveragingSGD)");
+}
 
 static LearningRateSearchAlgorithm ParseLearningRateSearchType(const wstring& s)
-    {
-        // TODO: why allow so many variants?
-        if (!_wcsicmp(s.c_str(), L"false") || !_wcsicmp(s.c_str(), L"none"))
-            return LearningRateSearchAlgorithm::None;
+{
+    // TODO: why allow so many variants?
+    if (!_wcsicmp(s.c_str(), L"false") || !_wcsicmp(s.c_str(), L"none"))
+        return LearningRateSearchAlgorithm::None;
     else if (!_wcsicmp(s.c_str(), L"searchBeforeEpoch") || !_wcsicmp(s.c_str(), L"beforeEpoch" /*legacy, deprecated*/) || !_wcsicmp(s.c_str(), L"before" /*legacy, deprecated*/))
-            return LearningRateSearchAlgorithm::SearchBeforeEpoch;
+        return LearningRateSearchAlgorithm::SearchBeforeEpoch;
     else if (!_wcsicmp(s.c_str(), L"adjustAfterEpoch") || !_wcsicmp(s.c_str(), L"afterEpoch" /*legacy, deprecated*/) || !_wcsicmp(s.c_str(), L"after" /*legacy, deprecated*/))
-            return LearningRateSearchAlgorithm::AdjustAfterEpoch;
-        else
-            InvalidArgument("autoAdjustLR: Invalid learning rate search type. Valid values are (none | searchBeforeEpoch | adjustAfterEpoch)");
-    }
+        return LearningRateSearchAlgorithm::AdjustAfterEpoch;
+    else
+        InvalidArgument("autoAdjustLR: Invalid learning rate search type. Valid values are (none | searchBeforeEpoch | adjustAfterEpoch)");
+}
 
 template <class ConfigRecordType>
-    SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
-    {
-        floatargvector learningRatesPerMB = configSGD(L"learningRatesPerMB", ConfigRecordType::Array(floatargvector()));
+SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
+{
+    floatargvector learningRatesPerMB = configSGD(L"learningRatesPerMB", ConfigRecordType::Array(floatargvector()));
 
-        floatargvector learningRatesPerSample = configSGD(L"learningRatesPerSample", ConfigRecordType::Array(floatargvector()));
+    floatargvector learningRatesPerSample = configSGD(L"learningRatesPerSample", ConfigRecordType::Array(floatargvector()));
 
-        string executionEngineValue = configSGD(L"executionEngine", "synchronous");
+    string executionEngineValue = configSGD(L"executionEngine", "synchronous");
 
-        // AutoAdjust Parameters
+    // AutoAdjust Parameters
     const ConfigRecordType& configAALR(configSGD(L"AutoAdjust", ConfigRecordType::Record()));
-        m_autoLearnRateSearchType = ParseLearningRateSearchType(configAALR(L"autoAdjustLR", L"None"));
+    m_autoLearnRateSearchType = ParseLearningRateSearchType(configAALR(L"autoAdjustLR", L"None"));
     m_reduceLearnRateIfImproveLessThan = configAALR(L"reduceLearnRateIfImproveLessThan", 0.0);
     m_continueReduce = configAALR(L"continueReduce", false);
     m_learnRateAdjustInterval = configAALR(L"learnRateAdjustInterval", (size_t) 1);
     m_learnRateAdjustInterval = max((size_t) 1, m_learnRateAdjustInterval); //minimum interval is 1 epoch
-        m_learnRateDecreaseFactor = configAALR(L"learnRateDecreaseFactor", 0.618);
-        m_increaseLearnRateIfImproveMoreThan = configAALR(L"increaseLearnRateIfImproveMoreThan", numeric_limits<double>::infinity());
+    m_learnRateDecreaseFactor = configAALR(L"learnRateDecreaseFactor", 0.618);
+    m_increaseLearnRateIfImproveMoreThan = configAALR(L"increaseLearnRateIfImproveMoreThan", numeric_limits<double>::infinity());
     m_learnRateIncreaseFactor = configAALR(L"learnRateIncreaseFactor", 1.382);
 
-        // AutoAdjust Auto Adjust Minibatch Parameters
+    // AutoAdjust Auto Adjust Minibatch Parameters
     m_autoAdjustMinibatch = configAALR(L"autoAdjustMinibatch", false);
     m_minibatchSizeTuningFrequency = configAALR(L"minibatchSizeTuningFrequency", (size_t) 1);
     m_minibatchSizeTuningMax = configAALR(L"minibatchSizeTuningMax", (size_t) 1048576);
     m_minibatchSearchCriterionErrorMargin = configAALR(L"minibatchSearchCriterionErrorMargin", (size_t) 1);
 
-        // the number of minibatches used to search
-        // the learning rate. Its typically set to 10-20% of
-        // the total minibatches in an epoch.
+    // the number of minibatches used to search for
+    // the learning rate. It's typically set to 10-20% of
+    // the total minibatches in an epoch.
     m_numMiniBatch4LRSearch = configAALR(L"numMiniBatch4LRSearch", ConfigRecordType::Array(intargvector(vector<int>{500})));
 
     m_numPrevLearnRates = configAALR(L"numPrevLearnRates", (size_t) 5);
     m_numBestSearchEpoch = configAALR(L"numBestSearchEpoch", (size_t) 1);
     m_loadBestModel = configAALR(L"loadBestModel", true);
-        m_useCVSetControlLRIfCVExists = configAALR(L"UseCVSetControlLRIfCVExists", true);
+    m_useCVSetControlLRIfCVExists = configAALR(L"UseCVSetControlLRIfCVExists", true);
     m_useEvalCriterionControlLR = configAALR(L"UseEvalCriterionControlLR", false);
 
-        // TODO: mbSize and truncated should be specified differently for truncated BPTT:
-        //       mbSize = total number of samples after which a model update should happen
-        //       truncated = truncation length
+    // TODO: mbSize and truncated should be specified differently for truncated BPTT:
+    //       mbSize = total number of samples after which a model update should happen
+    //       truncated = truncation length
     m_mbSize = configSGD(L"minibatchSize", ConfigRecordType::Array(intargvector(vector<int>{256})));
-        m_truncated = configSGD(L"truncated", false);
+    m_truncated = configSGD(L"truncated", false);
     m_maxSamplesInRAM = configSGD(L"maxSamplesInRAM", (size_t) SIZE_MAX);
     m_numSubminiBatches = configSGD(L"numSubminibatches", (size_t) 1);
 
-        // the number of samples in each epoch (0 means, use all the samples in each epoch).
+    // the number of samples in each epoch (0 means, use all the samples in each epoch).
     m_epochSize = configSGD(L"epochSize", (size_t) 0);
-        // the number of samples in each epoch (0 means, use all the samples in each epoch).
-        if (m_epochSize == 0)
-            m_epochSize = requestDataSize;
-        m_maxComputedEpochSize = m_epochSize;
-
-        // the total number of epochs to run.
-        m_maxEpochs = configSGD(L"maxEpochs");
-
-        // Note: Momentum is best specified as a MB-size agnostic fashion.
-        // Because momentum per sample is a number very close to 1, it is more handy to use a logarithmic specification.
-        // We use 'momentumAsTimeConstant' to specify the time constant of the low-pass filter that momentum really is.
-        // To convert a typical per-MB momentum value of 'm' used with a MB size of 'N', use momentumAsTimeConstant = -N/ln(m).
-        // For the common configuration of momentum 0.9 at MB size of 256, that is momentumAsTimeConstant = 2429.8.
+    // an epoch size of 0 means: use all the samples in each epoch (requestDataSize).
+    if (m_epochSize == 0)
+        m_epochSize = requestDataSize;
+    m_maxComputedEpochSize = m_epochSize;
+
+    // the total number of epochs to run.
+    m_maxEpochs = configSGD(L"maxEpochs");
+
+    // Note: Momentum is best specified in a MB-size agnostic fashion.
+    // Because momentum per sample is a number very close to 1, it is more handy to use a logarithmic specification.
+    // We use 'momentumAsTimeConstant' to specify the time constant of the low-pass filter that momentum really is.
+    // To convert a typical per-MB momentum value of 'm' used with a MB size of 'N', use momentumAsTimeConstant = -N/ln(m).
+    // For the common configuration of momentum 0.9 at MB size of 256, that is momentumAsTimeConstant = 2429.8.
     floatargvector momentumPerMB = configSGD(L"momentumPerMB", ConfigRecordType::Array(floatargvector()));
     floatargvector momentumPerSample = configSGD(L"momentumPerSample", ConfigRecordType::Array(floatargvector()));
-        floatargvector momentumAsTimeConstant = configSGD(L"momentumAsTimeConstant", ConfigRecordType::Array(floatargvector()));
+    floatargvector momentumAsTimeConstant = configSGD(L"momentumAsTimeConstant", ConfigRecordType::Array(floatargvector()));
     bool useNesterovMomentum = configSGD(L"useNAG", false);
 
     m_maxTempMemSizeInSamplesForCNN = configSGD(L"maxTempMemSizeInSamplesForCNN", (size_t) 0);
@@ -2553,210 +2553,210 @@ template <class ConfigRecordType>
     m_numMBsToShowResult = configSGD(L"numMBsToShowResult", (size_t) 10);
     m_numMBsToCUDAProfile = configSGD(L"numMBsToCUDAProfile", (size_t) 0);
 
-        m_gradientClippingWithTruncation = configSGD(L"gradientClippingWithTruncation", true);
+    m_gradientClippingWithTruncation = configSGD(L"gradientClippingWithTruncation", true);
     m_clippingThresholdPerSample = configSGD(L"clippingThresholdPerSample", numeric_limits<double>::infinity());
 
-        // sequence-training parameters
-        m_hSmoothingWeight = configSGD(L"hSmoothingWeight", 0.95);
+    // sequence-training parameters
+    m_hSmoothingWeight = configSGD(L"hSmoothingWeight", 0.95);
     m_frameDropThresh = configSGD(L"frameDropThresh", 1e-10);
-        m_doReferenceAlign = configSGD(L"doReferenceAlign", false);
-        m_seqGammarCalcUsesMBR = configSGD(L"seqGammarUsesMBR", false); 
-        m_seqGammarCalcAMF = configSGD(L"seqGammarAMF", 14.0);
-        m_seqGammarCalcLMF = configSGD(L"seqGammarLMF", 14.0);
-        m_seqGammarCalcbMMIFactor = configSGD(L"seqGammarBMMIFactor", 0.0); 
-        m_seqGammarCalcWP = configSGD(L"seqGammarWordPen", 0.0);
+    m_doReferenceAlign = configSGD(L"doReferenceAlign", false);
+    m_seqGammarCalcUsesMBR = configSGD(L"seqGammarUsesMBR", false);
+    m_seqGammarCalcAMF = configSGD(L"seqGammarAMF", 14.0);
+    m_seqGammarCalcLMF = configSGD(L"seqGammarLMF", 14.0);
+    m_seqGammarCalcbMMIFactor = configSGD(L"seqGammarBMMIFactor", 0.0);
+    m_seqGammarCalcWP = configSGD(L"seqGammarWordPen", 0.0);
 
     m_dropoutRates = configSGD(L"dropoutRate", ConfigRecordType::Array(floatargvector(vector<float>{0.0f})));
 
-        GradientsUpdateType gradUpdateType = ParseGradUpdateType(configSGD(L"gradUpdateType", L"None"));
-        double gaussianNoiseInjecStd = configSGD(L"gaussianNoiseInjectStd", 0.0);
-        m_gradType.mType = gradUpdateType;
-        m_gradType.mGaussianNoiseInjectStd = (float) gaussianNoiseInjecStd;
+    GradientsUpdateType gradUpdateType = ParseGradUpdateType(configSGD(L"gradUpdateType", L"None"));
+    double gaussianNoiseInjecStd = configSGD(L"gaussianNoiseInjectStd", 0.0);
+    m_gradType.mType = gradUpdateType;
+    m_gradType.mGaussianNoiseInjectStd = (float) gaussianNoiseInjecStd;
 
-        // extract RMSProp parameters from config, if they exist. Default to reasonable values.
+    // extract RMSProp parameters from config, if they exist. Default to reasonable values.
     m_rpi.dec = configSGD(L"rms_wgt_dec", 0.75);
     m_rpi.inc = configSGD(L"rms_wgt_inc", 1.2);
     m_rpi.min = configSGD(L"rms_wgt_min", 0.1);
     m_rpi.max = configSGD(L"rms_wgt_max", 10.0);
     m_rpi.gamma = configSGD(L"rms_gamma", 0.99);
 
-        m_needAveMultiplier = configSGD(L"normWithAveMultiplier", true);
+    m_needAveMultiplier = configSGD(L"normWithAveMultiplier", true);
     m_L2RegWeight = configSGD(L"L2RegWeight", 0.0);
     m_L1RegWeight = configSGD(L"L1RegWeight", 0.0);
 
-        /// for backward support. future setup should use gradUpdateType=AdaGrad, instead of
-        /// useAdagrad=true
-        bool useAdagrad = configSGD(L"useAdagrad", false);
-        if (useAdagrad)
-        {
-            gradUpdateType = GradientsUpdateType::AdaGrad;
-            m_gradType.mType = gradUpdateType;
-        }
+    /// Kept for backward compatibility: new setups should use gradUpdateType=AdaGrad instead of
+    /// useAdagrad=true.
+    bool useAdagrad = configSGD(L"useAdagrad", false);
+    if (useAdagrad)
+    {
+        gradUpdateType = GradientsUpdateType::AdaGrad;
+        m_gradType.mType = gradUpdateType;
+    }
 
-        m_adaptationRegType = ParseAdaptationRegType(configSGD(L"adaptationRegType", L"None"));
-        m_adaptationRegWeight = configSGD(L"adaptationRegWeight", 0.0);
+    m_adaptationRegType = ParseAdaptationRegType(configSGD(L"adaptationRegType", L"None"));
+    m_adaptationRegWeight = configSGD(L"adaptationRegWeight", 0.0);
 
-        /// gradient check setup
+    /// gradient check setup
     m_doGradientCheck = configSGD(L"gradientcheck", false);
     m_gradientCheckSigDigit = configSGD(L"sigFigs", 6.0); // TODO: why is this a double?
 
-        if (m_doGradientCheck && sizeofElemType != sizeof(double))
-        {
-            LogicError("Gradient check needs to use precision = 'double'.");
-        }
+    if (m_doGradientCheck && sizeofElemType != sizeof(double))
+    {
+        LogicError("Gradient check needs to use precision = 'double'.");
+    }
 
-        m_useAllDataForPreComputedNode = configSGD(L"UseAllDataForPreComputedNode", true);
+    m_useAllDataForPreComputedNode = configSGD(L"UseAllDataForPreComputedNode", true);
 
-        // consistency checks
-        for (size_t i = 0; i < m_mbSize.size(); i++)
+    // consistency checks
+    for (size_t i = 0; i < m_mbSize.size(); i++)
+    {
+        if (m_epochSize != requestDataSize && m_epochSize < m_mbSize[i])
         {
-            if (m_epochSize != requestDataSize && m_epochSize < m_mbSize[i])
-            {
-                InvalidArgument("epoch size must be larger than mbsize.");
-            }
+            InvalidArgument("epoch size must be larger than mbsize.");
         }
+    }
 
-        if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None &&
-            (learningRatesPerSample.size() == 0 && learningRatesPerMB.size() == 0))
-        {
-            InvalidArgument("If autoLearnRateSearchType is false you must specify the learningRatesPerSample or learningRatesPerMB parameter.");
-        }
+    if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None &&
+        (learningRatesPerSample.size() == 0 && learningRatesPerMB.size() == 0))
+    {
+        InvalidArgument("If autoLearnRateSearchType is false you must specify the learningRatesPerSample or learningRatesPerMB parameter.");
+    }
 
-        if (learningRatesPerSample.size() > 0 && learningRatesPerMB.size() > 0)
-        {
-            InvalidArgument("You specified both learningRatesPerSample and learningRatesPerMB. Please comment out one of them.");
-        }
+    if (learningRatesPerSample.size() > 0 && learningRatesPerMB.size() > 0)
+    {
+        InvalidArgument("You specified both learningRatesPerSample and learningRatesPerMB. Please comment out one of them.");
+    }
 
-        if (learningRatesPerSample.size() > 0)
-        {
-            m_learningRatesParam = learningRatesPerSample;
-            m_learningRatesSpecifiedForMBSize = intargvector(L"1");
-        }
+    if (learningRatesPerSample.size() > 0)
+    {
+        m_learningRatesParam = learningRatesPerSample;
+        m_learningRatesSpecifiedForMBSize = intargvector(L"1");
+    }
     else if (learningRatesPerMB.size() > 0) // this actually means per specified minibatch size
-        {
-            m_learningRatesParam = learningRatesPerMB;
-            m_learningRatesSpecifiedForMBSize = m_mbSize;
-        }
+    {
+        m_learningRatesParam = learningRatesPerMB;
+        m_learningRatesSpecifiedForMBSize = m_mbSize;
+    }
 
     if ((int) (momentumPerSample.size() > 0) + (int) (momentumPerMB.size() > 0) + (int) (momentumAsTimeConstant.size() > 0) > 1)
-        {
-            InvalidArgument("You specified more than one of momentumPerSample, momentumPerMB, and momentumAsTimeConstant. Please only specify one.");
-        }
+    {
+        InvalidArgument("You specified more than one of momentumPerSample, momentumPerMB, and momentumAsTimeConstant. Please only specify one.");
+    }
 
     if (momentumPerSample.size() > 0) // note: noone would ever use this; use momentumAsTimeConstant instead
+    {
+        m_momentumParam = momentumPerSample;
+        m_momentumSpecifiedForMBSize = intargvector(L"1");
+    }
+    else if (momentumAsTimeConstant.size() > 0)
+    {
+        vector<float> momentumPerSampleVec;
+        for (int i = 0; i < momentumAsTimeConstant.size(); i++)
         {
-            m_momentumParam = momentumPerSample;
-            m_momentumSpecifiedForMBSize = intargvector(L"1");
-        }
-        else if (momentumAsTimeConstant.size() > 0)
-        {
-            vector<float> momentumPerSampleVec;
-            for (int i = 0; i < momentumAsTimeConstant.size(); i++)
-            {
-                double momTC = momentumAsTimeConstant[i];
-                double momPS = momTC == 0.0 ? 0 : exp(-1.0 / momTC);
+            double momTC = momentumAsTimeConstant[i];
+            double momPS = momTC == 0.0 ? 0 : exp(-1.0 / momTC);
             momentumPerSampleVec.push_back((float) momPS);
-            }
-            m_momentumParam = momentumPerSampleVec;
-            m_momentumSpecifiedForMBSize = intargvector(L"1");
-        }
-        else if (momentumPerMB.size() > 0)
-        {
-            m_momentumParam = momentumPerMB;
-            m_momentumSpecifiedForMBSize = m_mbSize;
         }
+        m_momentumParam = momentumPerSampleVec;
+        m_momentumSpecifiedForMBSize = intargvector(L"1");
+    }
+    else if (momentumPerMB.size() > 0)
+    {
+        m_momentumParam = momentumPerMB;
+        m_momentumSpecifiedForMBSize = m_mbSize;
+    }
     else // default: momentumPerMB = 0.9 per MB
-        {
-            m_momentumParam = floatargvector(L"0.9");
-            m_momentumSpecifiedForMBSize = m_mbSize;
-        }
-        m_useNesterovMomentum = useNesterovMomentum; 
+    {
+        m_momentumParam = floatargvector(L"0.9");
+        m_momentumSpecifiedForMBSize = m_mbSize;
+    }
+    m_useNesterovMomentum = useNesterovMomentum;
 
-        for (int i = 0; i < m_momentumParam.size(); i++)
+    for (int i = 0; i < m_momentumParam.size(); i++)
+    {
+        if (m_momentumParam[i] >= 1.0 || m_momentumParam[i] < 0.0)
         {
-            if (m_momentumParam[i] >= 1.0 || m_momentumParam[i] < 0.0)
-            {
-                InvalidArgument("Momentum parameter must be in [0, 1).");
-        }
+            InvalidArgument("Momentum parameter must be in [0, 1).");
         }
+    }
 
-        if (m_learnRateDecreaseFactor > 1 || m_learnRateIncreaseFactor < 1)
-        {
-            InvalidArgument("learnRateIncreaseFactor must be >= 1 and learnRateDecreaseFactor must be <= 1.");
-        }
+    if (m_learnRateDecreaseFactor > 1 || m_learnRateIncreaseFactor < 1)
+    {
+        InvalidArgument("learnRateIncreaseFactor must be >= 1 and learnRateDecreaseFactor must be <= 1.");
+    }
 
-        for (size_t i = 0; i < m_dropoutRates.size(); i++)
+    for (size_t i = 0; i < m_dropoutRates.size(); i++)
+    {
+        if (m_dropoutRates[i] >= 1 || m_dropoutRates[i] < 0)
         {
-            if (m_dropoutRates[i] >= 1 || m_dropoutRates[i] < 0)
-            {
-                InvalidArgument("dropoutRate must be >= 0 and < 1.");
-            }
+            InvalidArgument("dropoutRate must be >= 0 and < 1.");
         }
+    }
 
-        if (m_adaptationRegWeight > 1 || m_adaptationRegWeight < 0)
-            InvalidArgument("adaptationRegWeight must be in [0 1]");
+    if (m_adaptationRegWeight > 1 || m_adaptationRegWeight < 0)
+        InvalidArgument("adaptationRegWeight must be in [0 1]");
 
-        m_minLearnRate = 1e-9f;
+    m_minLearnRate = 1e-9f;
 
-        m_needAdaptRegularization = false;
+    m_needAdaptRegularization = false;
 
-        // BUGBUG: these are not passed to Init()
-        m_doUnitTest = configSGD(L"unitTest", false);
+    // BUGBUG: these are not passed to Init()
+    m_doUnitTest = configSGD(L"unitTest", false);
 
-        // parallel training
-        m_parallelizationMethod = ParallelizationMethod::None;
-        m_numGradientBits = 32;
-        m_zeroThresholdFor1Bit = true;
-        m_bufferedAsyncGradientAggregation = false;
-        m_enableDistributedMBReading = false;
-        m_parallelizationStartEpochNum = 0;
-        m_nFramesBetweenMASync = 40000; // default 40k frames 
+    // parallel training
+    m_parallelizationMethod = ParallelizationMethod::None;
+    m_numGradientBits = 32;
+    m_zeroThresholdFor1Bit = true;
+    m_bufferedAsyncGradientAggregation = false;
+    m_enableDistributedMBReading = false;
+    m_parallelizationStartEpochNum = 0;
+    m_nFramesBetweenMASync = 40000; // default 40k frames
 
-        if ((g_mpi != nullptr) && configSGD.Exists(L"ParallelTrain"))
-        {
+    if ((g_mpi != nullptr) && configSGD.Exists(L"ParallelTrain"))
+    {
         const ConfigRecordType& configParallelTrain(configSGD(L"ParallelTrain", ConfigRecordType::Record()));
-            m_parallelizationMethod = ParseParallelizationMethod(configParallelTrain(L"parallelizationMethod", L"none"));
+        m_parallelizationMethod = ParseParallelizationMethod(configParallelTrain(L"parallelizationMethod", L"none"));
         m_parallelizationStartEpochNum = configParallelTrain(L"parallelizationStartEpoch", (int) 1) - 1; // Epoch numbers internally are 0 based
-            m_enableDistributedMBReading = configParallelTrain(L"distributedMBReading", false);
+        m_enableDistributedMBReading = configParallelTrain(L"distributedMBReading", false);
         m_syncStatsTrace = configParallelTrain(L"syncPerfStats", (int) 0);
 
-            if (configParallelTrain.Exists(L"DataParallelSGD"))
-            {
+        if (configParallelTrain.Exists(L"DataParallelSGD"))
+        {
             const ConfigRecordType& configDataParallelSGD(configParallelTrain(L"DataParallelSGD", ConfigRecordType::Record()));
-                size_t defaultGradientBits = 8 * sizeofElemType;
-                m_numGradientBits = configDataParallelSGD(L"gradientBits", defaultGradientBits);
-                m_zeroThresholdFor1Bit = configDataParallelSGD(L"useZeroThresholdFor1BitQuantization", true);
-                m_bufferedAsyncGradientAggregation = configDataParallelSGD(L"useBufferedAsyncGradientAggregation", false);
-                if ((m_numGradientBits < 1) || (m_numGradientBits > (8 * sizeofElemType)))
-                {
-                    InvalidArgument("gradientBits must be in the range [1, 32] when using precision=float and in range [1, 64] when using precision=double!");
-                }
+            size_t defaultGradientBits = 8 * sizeofElemType;
+            m_numGradientBits = configDataParallelSGD(L"gradientBits", defaultGradientBits);
+            m_zeroThresholdFor1Bit = configDataParallelSGD(L"useZeroThresholdFor1BitQuantization", true);
+            m_bufferedAsyncGradientAggregation = configDataParallelSGD(L"useBufferedAsyncGradientAggregation", false);
+            if ((m_numGradientBits < 1) || (m_numGradientBits > (8 * sizeofElemType)))
+            {
+                InvalidArgument("gradientBits must be in the range [1, 32] when using precision=float and in range [1, 64] when using precision=double!");
             }
+        }
 
         if (configParallelTrain.Exists(L"ModelAveragingSGD"))
-            {
+        {
             const ConfigRecordType& configMASGD(configParallelTrain(L"ModelAveragingSGD", ConfigRecordType::Record()));
             m_nFramesBetweenMASync = configMASGD(L"syncFrequencyInFrames", (size_t) 40000);
-            }
         }
     }
+}
 
-    static size_t GetSizeOfPrecision(const ScriptableObjects::IConfigRecordPtr configp)
-    {
-        wstring precision = configp->Get(L"precision");
-        if (precision == L"float")
-            return sizeof(float);
-        else if (precision == L"double")
-            return sizeof(double);
-        else
-            RuntimeError("invalid value '%ls' for 'precision', must be 'float' or 'double'", precision.c_str());
-    }
+static size_t GetSizeOfPrecision(const ScriptableObjects::IConfigRecordPtr configp)
+{
+    wstring precision = configp->Get(L"precision");
+    if (precision == L"float")
+        return sizeof(float);
+    else if (precision == L"double")
+        return sizeof(double);
+    else
+        RuntimeError("invalid value '%ls' for 'precision', must be 'float' or 'double'", precision.c_str());
+}
 
 SGDParams::SGDParams(const ScriptableObjects::IConfigRecordPtr configp)
     : SGDParams(*configp, GetSizeOfPrecision(configp))
 {
 }
 
-    // register SGD<> with the ScriptableObject system
-    ScriptableObjects::ConfigurableRuntimeTypeRegister::AddFloatDouble<SGD<float>, SGD<double>> registerSGDOptimizer(L"SGDOptimizer");
+// register SGD<> with the ScriptableObject system
+ScriptableObjects::ConfigurableRuntimeTypeRegister::AddFloatDouble<SGD<float>, SGD<double>> registerSGDOptimizer(L"SGDOptimizer");
 } } }
diff --git a/Tests/UnitTests/FileTest/FileTest.cpp b/Tests/UnitTests/FileTest/FileTest.cpp
index a4bff8545d68..79539c0e5a6f 100644
--- a/Tests/UnitTests/FileTest/FileTest.cpp
+++ b/Tests/UnitTests/FileTest/FileTest.cpp
@@ -236,6 +236,4 @@ File& operator>>(File& stream, FileTest& test)
     stream.GetMarker(fileMarkerEndSection, string("endFileTest"));
     return stream;
 }
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Tests/UnitTests/FileTest/FileTest.h b/Tests/UnitTests/FileTest/FileTest.h
index 155e9044f2b5..065d2e9a1871 100644
--- a/Tests/UnitTests/FileTest/FileTest.h
+++ b/Tests/UnitTests/FileTest/FileTest.h
@@ -39,6 +39,4 @@ class FileTest
 // operator overloading
 File& operator>>(File& stream, FileTest& test);
 File& operator<<(File& stream, FileTest& test);
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Tests/UnitTests/MathTests/DebugUtil.cpp b/Tests/UnitTests/MathTests/DebugUtil.cpp
index 722e0475a783..b8dd82ecf7f4 100644
--- a/Tests/UnitTests/MathTests/DebugUtil.cpp
+++ b/Tests/UnitTests/MathTests/DebugUtil.cpp
@@ -66,6 +66,4 @@ void DebugUtil::PrintCallStack()
     free(symbolInfo);
 #endif // _WIN32
 }
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file
diff --git a/Tests/UnitTests/ReaderTests/Common/ReaderTestHelper.h b/Tests/UnitTests/ReaderTests/Common/ReaderTestHelper.h
index 3a8c0dc61a4e..e9b5848966fe 100644
--- a/Tests/UnitTests/ReaderTests/Common/ReaderTestHelper.h
+++ b/Tests/UnitTests/ReaderTests/Common/ReaderTestHelper.h
@@ -266,6 +266,4 @@ struct ReaderFixture
     }
 };
 }
-}
-}
-}
\ No newline at end of file
+} } }
\ No newline at end of file