From 19f6653a267a7e191448c520e4b877d166f7601c Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 18 Sep 2015 08:08:23 -0700 Subject: [PATCH 01/44] (fixed another #include order in MatrixQuantizerGPU.h) --- CNTK.sln | 26 +++++++++----------------- Math/Math/MatrixQuantizerGPU.h | 2 +- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/CNTK.sln b/CNTK.sln index 8c6f069f492e..4e364b16dab1 100644 --- a/CNTK.sln +++ b/CNTK.sln @@ -1,7 +1,7 @@  Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 2013 -VisualStudioVersion = 12.0.30324.0 +VisualStudioVersion = 12.0.21005.1 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKMathDll", "Math\Math\Math.vcxproj", "{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}" ProjectSection(ProjectDependencies) = postProject @@ -171,9 +171,6 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CheckInSuites", "CheckInSui EndProjectSection EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SparsePCReader", "DataReader\SparsePCReader\SparsePCReader.vcxproj", "{CE429AA2-3778-4619-8FD1-49BA3B81197B}" - ProjectSection(ProjectDependencies) = postProject - {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} - EndProjectSection EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Speech", "Speech", "{C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8}" ProjectSection(SolutionItems) = preProject @@ -335,10 +332,13 @@ Global HideSolutionNode = FALSE EndGlobalSection GlobalSection(NestedProjects) = preSolution - {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} {E6F26F9A-FF64-4F0A-B749-CD309EE357EE} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {B3DD765E-694E-4494-BAD7-37BBF2942517} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {DE3C54E5-D7D0-47AF-A783-DFDCE59E7937} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} {6CEE834A-8104-46A8-8902-64C81BD7928F} = {D45DF403-6781-444E-B654-A96868C5BE68} - {33D2FD22-DEF2-4507-A58A-368F641AEBE5} = {33EBFE78-A1A8-4961-8938-92A271941F94} {668BEED5-AC07-4F35-B3AE-EE65A7F9C976} = {D45DF403-6781-444E-B654-A96868C5BE68} {0F30EBCF-09F3-4EED-BF54-4214BCE53FEC} = {D45DF403-6781-444E-B654-A96868C5BE68} {DBB3C106-B0B4-4059-8477-C89528CEC1B0} = {D45DF403-6781-444E-B654-A96868C5BE68} @@ -348,30 +348,22 @@ Global {E6646FFE-3588-4276-8A15-8D65C22711C1} = {33EBFE78-A1A8-4961-8938-92A271941F94} {1D5787D4-52E4-45DB-951B-82F220EE0C6A} = {33EBFE78-A1A8-4961-8938-92A271941F94} {62836DC1-DF77-4B98-BF2D-45C943B7DDC6} = {33EBFE78-A1A8-4961-8938-92A271941F94} - {482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} - {0F30EBCF-09F3-4EED-BF54-4214BCE53FEC} = {D45DF403-6781-444E-B654-A96868C5BE68} - {B3DD765E-694E-4494-BAD7-37BBF2942517} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {33D2FD22-DEF2-4507-A58A-368F641AEBE5} = {33EBFE78-A1A8-4961-8938-92A271941F94} {9A2F2441-5972-4EA8-9215-4119FCE0FB68} = {33EBFE78-A1A8-4961-8938-92A271941F94} {014DA766-B37B-4581-BC26-963EA5507931} = {33EBFE78-A1A8-4961-8938-92A271941F94} {D667AF32-028A-4A5D-BE19-F46776F0F6B2} = {33EBFE78-A1A8-4961-8938-92A271941F94} - {3ED0465D-23E7-4855-9694-F788717B6533} = {39E42C4B-A078-4CA4-9D92-B883D8129601} + {CE429AA2-3778-4619-8FD1-49BA3B81197B} = {33EBFE78-A1A8-4961-8938-92A271941F94} 
{065AF55D-AF02-448B-BFCD-52619FDA4BD0} = {39E42C4B-A078-4CA4-9D92-B883D8129601} + {3ED0465D-23E7-4855-9694-F788717B6533} = {39E42C4B-A078-4CA4-9D92-B883D8129601} {98D2C32B-0C1F-4E19-A626-65F7BA4600CF} = {065AF55D-AF02-448B-BFCD-52619FDA4BD0} {EA67F51F-1FE8-462D-9F3E-01161685AD59} = {065AF55D-AF02-448B-BFCD-52619FDA4BD0} {DE1A06BA-EC5C-4E0D-BCA8-3EA555310C58} = {065AF55D-AF02-448B-BFCD-52619FDA4BD0} {63024704-A2D7-497E-AD4B-5C10C6AA1374} = {065AF55D-AF02-448B-BFCD-52619FDA4BD0} {F9BEB27E-8AF5-464E-8D45-0000D5AFA2D3} = {EA67F51F-1FE8-462D-9F3E-01161685AD59} {889C1CCF-92B3-450B-B00D-FC9A9D5BE464} = {EA67F51F-1FE8-462D-9F3E-01161685AD59} - {DBB3C106-B0B4-4059-8477-C89528CEC1B0} = {D45DF403-6781-444E-B654-A96868C5BE68} - {CE429AA2-3778-4619-8FD1-49BA3B81197B} = {33EBFE78-A1A8-4961-8938-92A271941F94} - {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8} = {D45DF403-6781-444E-B654-A96868C5BE68} {4BBF2950-3DBD-469A-AD57-6CACBEBAF541} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8} {5F733BBA-FE83-4668-8F83-8B0E78A36619} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8} {19EE975B-232D-49F0-94C7-6F1C6424FB53} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8} - {7C4E77C9-6B17-4B02-82C1-DB62EEE2635B} = {D45DF403-6781-444E-B654-A96868C5BE68} - {928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} - {DE3C54E5-D7D0-47AF-A783-DFDCE59E7937} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} - {5E666C53-2D82-49C9-9127-3FDDC321C741} = {D45DF403-6781-444E-B654-A96868C5BE68} {6D1353D6-F196-466F-B886-F16D48759B20} = {5E666C53-2D82-49C9-9127-3FDDC321C741} {B6725C9F-A6D2-4269-9B74-7888A90F7884} = {5E666C53-2D82-49C9-9127-3FDDC321C741} {B27DD434-EECD-4EE0-A03B-1150EB87258E} = {B6725C9F-A6D2-4269-9B74-7888A90F7884} diff --git a/Math/Math/MatrixQuantizerGPU.h b/Math/Math/MatrixQuantizerGPU.h index 058b75eaf132..9247bcbf1ce5 100644 --- a/Math/Math/MatrixQuantizerGPU.h +++ b/Math/Math/MatrixQuantizerGPU.h @@ -1,7 +1,7 @@ #pragma once -#include "QuantizedMatrix.h" #include "MatrixQuantizer.h" +#include "QuantizedMatrix.h" #include "ColumnQuantizer.h" #include "GPUMatrix.h" #ifndef CPUONLY From 3451b660b652d6c432d511ff5bfd286663965edc Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 18 Sep 2015 09:48:25 -0700 Subject: [PATCH 02/44] renamed MBLayout::IsEmpty() to IsAllNone() (a unique name), to be able to identify easily where it is used --- .../CNTKComputationNetworkLib/ComputationNetwork.h | 3 ++- .../CNTKComputationNetworkLib/ComputationNode.h | 2 +- .../CNTKComputationNetworkLib/TrainingCriterionNodes.h | 3 ++- Math/Math/Matrix.h | 7 ++++--- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h index 41dd769b2089..dfb819948d5e 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h @@ -238,9 +238,10 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // TODO: describe what this function does //this is a temp solution since some nodes such as plus can be just aggregate of two scalar values //in which case the packing info is not available (and not meaningful) for them + // TODO: Does this belong into MBLayout? 
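+        // Note: I believe the loop below subtracts from numAllSamples every position whose
+        // MBLayout packing flags say NoLabel, i.e. the criterion gets averaged only over
+        // columns that actually carry a label.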
size_t GetNumSamplesWithLabel(const size_t numAllSamples) { - if (!m_pMBLayout->IsEmpty()) + if (!m_pMBLayout->IsAllNone()) { size_t numTimeSteps = m_pMBLayout->GetNumFrames(); size_t numSequences = m_pMBLayout->GetNumStreams(); diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index 56f35641ab62..61e3441d1553 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -900,7 +900,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { bool processedExistsNoLabelorFeatureMissing = false; /// set to true if either nolabel or feature missing is processed - if (m_pMBLayout && !m_pMBLayout->IsEmpty()) + if (m_pMBLayout && !m_pMBLayout->IsAllNone()) { size_t nT = matrixToBeMasked.GetNumCols(); size_t nS = m_pMBLayout->GetNumStreams(); diff --git a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h index 84b325617eb9..1e99bfc168c0 100644 --- a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h @@ -1090,11 +1090,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { /** reset to error signals to 0 for any elements without labels */ + // TODO: This has overlap with ComputationNode::MaskToZeroWhenLabelAndFeatureMissing(), should call that instead. bool MaskToZeroWhenLabelAndFeatureMissing(Matrix& matrixToBeMasked, const size_t t) { bool processedExistsNoLabelorFeatureMissing = false; /// set to true if either nolabel or feature missing is processed - if (m_pMBLayout && !m_pMBLayout->IsEmpty()) + if (m_pMBLayout && !m_pMBLayout->IsAllNone()) { // 't' is not a time but rather a column index that encodes (time stamp, stream) size_t nS = m_pMBLayout->GetNumStreams(); diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h index eb5733b3f1a7..cef4d3143489 100644 --- a/Math/Math/Matrix.h +++ b/Math/Math/Matrix.h @@ -213,7 +213,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { void NormalGrad(Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum); ElemType Adagrad(Matrix& gradients, const bool needAveMultiplier); ElemType RmsProp(Matrix& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier); - + + // TODO: should Reshape() return a new Matrix object that contains a reference to the original? void Reshape(const size_t numRows, const size_t numCols); void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve = 10000, bool growOnly = true); //by default we only reallocate if need to grow /// similarly to the repmat operation in matlab or octave @@ -597,10 +598,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { // these accessors were for now just collected from actual usage; need to be cleaned up once this compiles again size_t GetNumFrames() const { validate(); return m_sentenceBoundaryFlags.GetNumCols(); } - size_t GetNumStreams() const { return IsEmpty() ? 1 : m_sentenceBoundaryFlags.GetNumRows(); } // 1 stream if no matrix + size_t GetNumStreams() const { return IsAllNone() ? 
1 : m_sentenceBoundaryFlags.GetNumRows(); } // 1 stream if no matrix size_t GetSize() const { validate(); return m_minibatchPackingFlags.size(); } // ^^ TODO: add a check whether Size() == GetNumFrames(); it really should, unless I misunderstood - bool IsEmpty() const { validate(); return m_minibatchPackingFlags.empty(); } + bool IsAllNone() const { validate(); return m_minibatchPackingFlags.empty(); } #if 0 // we have this pattern often: // TODO: mbSize and #slices must also move into MBLayout evalnet->SetActualMiniBatchSize(mbSize); From 4fe273dc3dc5fb7b514c3d9f67032e3a4d7205fd Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 18 Sep 2015 10:01:06 -0700 Subject: [PATCH 03/44] (changed some more ComputationNode:: to Base::) --- .../CNTKComputationNetworkLib/ComputationNode.h | 6 ++---- .../CNTKComputationNetworkLib/RecurrentNodes.h | 4 ++-- .../CNTKComputationNetworkLib/TrainingCriterionNodes.h | 10 +++++----- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index 61e3441d1553..45abce8277c6 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -896,9 +896,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { /** reset to error signals to 0 for any elements without labele */ - bool MaskToZeroWhenLabelAndFeatureMissing(Matrix& matrixToBeMasked, const size_t timeIdxInSeq=(size_t)-1) + bool MaskToZeroWhenLabelAndFeatureMissing(Matrix& matrixToBeMasked, const size_t timeIdxInSeq=(size_t)-1) const { - bool processedExistsNoLabelorFeatureMissing = false; /// set to true if either nolabel or feature missing is processed + bool processedExistsNoLabelorFeatureMissing = false; /// set to true if either nolabel or feature missing is processed if (m_pMBLayout && !m_pMBLayout->IsAllNone()) { @@ -908,8 +908,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (m_pMBLayout->GetSize() != nT / nS) LogicError("MaskToZeroWhenLabelAndFeatureMissing: m_pMBLayout->m_minibatchPackingFlags should have one element for each timestep of all streams. Check feature reader. "); - //Matrix colSeg(m_pMBLayout->m_sentenceBoundaryFlags.GetDeviceId()); - size_t startT = (timeIdxInSeq == (size_t)-1) ? 0 : timeIdxInSeq * nS; // TODO: misnomer; startT, endT, and utt_t are not times but columns in the packed matrix size_t endT = (timeIdxInSeq == (size_t)-1) ? 
nT : timeIdxInSeq * nS + nS; for (size_t utt_t = startT; utt_t < endT; utt_t += nS) diff --git a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h index ed1fdfc28d15..c038ecc2e31b 100644 --- a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h @@ -1105,7 +1105,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix::Multiply(state.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() - nsamples, nsamples), false, colSeg, false, newPrevState); } - ComputationNode::SetToInitStateValueForResetSeg(sentenceBegin->ColumnSlice(utt_t, 1), nStream, initStateValue, newPrevState); + Base::SetToInitStateValueForResetSeg(sentenceBegin->ColumnSlice(utt_t, 1), nStream, initStateValue, newPrevState); slicePrevOutput.ColumnSlice(0, nsamples).SetValue(newPrevOutput); slicePrevState.ColumnSlice(0, nsamples).SetValue(newPrevState); @@ -1354,7 +1354,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { //boundary.ColumnSlice(0, 1).SetValue(((int) MinibatchPackingFlags::SequenceStart)); //minibatchPackingFlags[1] = MinibatchPackingFlags::SequenceStart; pMBLayout->Set(0, 1, MinibatchPackingFlags::SequenceStart); // TODO: strange--start at frame[1] instead of [0]? - ComputationNode::ResetBound(pMBLayout); + Base::ResetBound(pMBLayout); f0 = Inputs(0)->FunctionValues(); f1 = Inputs(1)->FunctionValues(); diff --git a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h index 1e99bfc168c0..67e0613a7dc9 100644 --- a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h @@ -174,7 +174,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { else { ComputeInputPartialRight(m_softmaxOfRight, Inputs(0)->FunctionValues(), Inputs(inputIndex)->GradientValues(), GradientValues()); - ComputationNode::MaskToZeroWhenLabelAndFeatureMissing(Inputs(inputIndex)->GradientValues()); + Base::MaskToZeroWhenLabelAndFeatureMissing(Inputs(inputIndex)->GradientValues()); } } @@ -503,7 +503,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void EvaluateThisNode() { - ComputationNode::MaskToZeroWhenLabelAndFeatureMissing(Inputs(0)->FunctionValues()); + Base::MaskToZeroWhenLabelAndFeatureMissing(Inputs(0)->FunctionValues()); EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues()); } @@ -599,7 +599,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void EvaluateThisNode() { - ComputationNode::MaskToZeroWhenLabelAndFeatureMissing(Inputs(0)->FunctionValues()); + Base::MaskToZeroWhenLabelAndFeatureMissing(Inputs(0)->FunctionValues()); EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues()); } @@ -1090,8 +1090,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { /** reset to error signals to 0 for any elements without labels */ - // TODO: This has overlap with ComputationNode::MaskToZeroWhenLabelAndFeatureMissing(), should call that instead. - bool MaskToZeroWhenLabelAndFeatureMissing(Matrix& matrixToBeMasked, const size_t t) + // TODO: This has overlap with ComputationNode::MaskToZeroWhenLabelAndFeatureMissing(), should call that instead. Note: This one does only one stream, while Base:: one does all streams. 
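+        // (Assuming the packing convention noted inside the function: with nS parallel streams,
+        // column index 't' encodes time step t / nS and stream t % nS, so each call masks a
+        // single column rather than all nS columns of a time step.)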
+ bool MaskToZeroWhenLabelAndFeatureMissing(Matrix& matrixToBeMasked, const size_t t) const { bool processedExistsNoLabelorFeatureMissing = false; /// set to true if either nolabel or feature missing is processed From 960e0fe64411d0880f88347d43476f1be79958a8 Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Fri, 18 Sep 2015 12:15:19 -0700 Subject: [PATCH 04/44] Add Kaldi sources to the VS solution --- CNTK.sln | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 87 insertions(+), 1 deletion(-) diff --git a/CNTK.sln b/CNTK.sln index 8c6f069f492e..20b96006ee31 100644 --- a/CNTK.sln +++ b/CNTK.sln @@ -1,7 +1,7 @@  Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 2013 -VisualStudioVersion = 12.0.30324.0 +VisualStudioVersion = 12.0.31101.0 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKMathDll", "Math\Math\Math.vcxproj", "{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}" ProjectSection(ProjectDependencies) = postProject @@ -255,6 +255,90 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "DoublePrecision", "DoublePr Tests\ParallelTraining\NoQuantization\DoublePrecision\testcases.yml = Tests\ParallelTraining\NoQuantization\DoublePrecision\testcases.yml EndProjectSection EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "KaldiReader", "KaldiReader", "{3E9C89B1-C045-4F42-92B2-F9FFFFC2DBD4}" + ProjectSection(SolutionItems) = preProject + DataReader\KaldiReader\basetypes.h = DataReader\KaldiReader\basetypes.h + DataReader\KaldiReader\basetypes.old.h = DataReader\KaldiReader\basetypes.old.h + DataReader\KaldiReader\biggrowablevectors.h = DataReader\KaldiReader\biggrowablevectors.h + DataReader\KaldiReader\chunkevalsource.h = DataReader\KaldiReader\chunkevalsource.h + DataReader\KaldiReader\DataReader.cpp = DataReader\KaldiReader\DataReader.cpp + DataReader\KaldiReader\DataWriter.cpp = DataReader\KaldiReader\DataWriter.cpp + DataReader\KaldiReader\dllmain.cpp = DataReader\KaldiReader\dllmain.cpp + DataReader\KaldiReader\fileutil.cpp = DataReader\KaldiReader\fileutil.cpp + DataReader\KaldiReader\fileutil.h = DataReader\KaldiReader\fileutil.h + DataReader\KaldiReader\fileutil.old.h = DataReader\KaldiReader\fileutil.old.h + DataReader\KaldiReader\htkfeatio.h = DataReader\KaldiReader\htkfeatio.h + DataReader\KaldiReader\HTKMLFReader.cpp = DataReader\KaldiReader\HTKMLFReader.cpp + DataReader\KaldiReader\HTKMLFReader.h = DataReader\KaldiReader\HTKMLFReader.h + DataReader\KaldiReader\HTKMLFWriter.cpp = DataReader\KaldiReader\HTKMLFWriter.cpp + DataReader\KaldiReader\HTKMLFWriter.h = DataReader\KaldiReader\HTKMLFWriter.h + DataReader\KaldiReader\latticearchive.cpp = DataReader\KaldiReader\latticearchive.cpp + DataReader\KaldiReader\latticearchive.h = DataReader\KaldiReader\latticearchive.h + DataReader\KaldiReader\latticestorage.h = DataReader\KaldiReader\latticestorage.h + DataReader\KaldiReader\minibatchiterator.h = DataReader\KaldiReader\minibatchiterator.h + DataReader\KaldiReader\minibatchsourcehelpers.h = DataReader\KaldiReader\minibatchsourcehelpers.h + DataReader\KaldiReader\msra_mgram.h = DataReader\KaldiReader\msra_mgram.h + DataReader\KaldiReader\numahelpers.h = DataReader\KaldiReader\numahelpers.h + DataReader\KaldiReader\pplhelpers.h = DataReader\KaldiReader\pplhelpers.h + DataReader\KaldiReader\readaheadsource.h = DataReader\KaldiReader\readaheadsource.h + DataReader\KaldiReader\rollingwindowsource.h = DataReader\KaldiReader\rollingwindowsource.h + 
DataReader\KaldiReader\simple_checked_arrays.h = DataReader\KaldiReader\simple_checked_arrays.h + DataReader\KaldiReader\simplesenonehmm.h = DataReader\KaldiReader\simplesenonehmm.h + DataReader\KaldiReader\simplethread.h = DataReader\KaldiReader\simplethread.h + DataReader\KaldiReader\ssefloat4.h = DataReader\KaldiReader\ssefloat4.h + DataReader\KaldiReader\ssematrix.h = DataReader\KaldiReader\ssematrix.h + DataReader\KaldiReader\stdafx.cpp = DataReader\KaldiReader\stdafx.cpp + DataReader\KaldiReader\stdafx.h = DataReader\KaldiReader\stdafx.h + DataReader\KaldiReader\targetver.h = DataReader\KaldiReader\targetver.h + DataReader\KaldiReader\utterancesource.h = DataReader\KaldiReader\utterancesource.h + DataReader\KaldiReader\utterancesourcemulti.h = DataReader\KaldiReader\utterancesourcemulti.h + EndProjectSection +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Kaldi2Reader", "Kaldi2Reader", "{C70E1572-20FF-496C-A0A9-10AA6755A07C}" + ProjectSection(SolutionItems) = preProject + DataReader\Kaldi2Reader\basetypes.h = DataReader\Kaldi2Reader\basetypes.h + DataReader\Kaldi2Reader\biggrowablevectors.h = DataReader\Kaldi2Reader\biggrowablevectors.h + DataReader\Kaldi2Reader\chunkevalsource.h = DataReader\Kaldi2Reader\chunkevalsource.h + DataReader\Kaldi2Reader\DataReader.cpp = DataReader\Kaldi2Reader\DataReader.cpp + DataReader\Kaldi2Reader\DataWriter.cpp = DataReader\Kaldi2Reader\DataWriter.cpp + DataReader\Kaldi2Reader\dllmain.cpp = DataReader\Kaldi2Reader\dllmain.cpp + DataReader\Kaldi2Reader\DOCUMENTATION.txt = DataReader\Kaldi2Reader\DOCUMENTATION.txt + DataReader\Kaldi2Reader\fileutil.cpp = DataReader\Kaldi2Reader\fileutil.cpp + DataReader\Kaldi2Reader\fileutil.h = DataReader\Kaldi2Reader\fileutil.h + DataReader\Kaldi2Reader\htkfeatio.h = DataReader\Kaldi2Reader\htkfeatio.h + DataReader\Kaldi2Reader\htkfeatio_utils.h = DataReader\Kaldi2Reader\htkfeatio_utils.h + DataReader\Kaldi2Reader\HTKMLFReader.cpp = DataReader\Kaldi2Reader\HTKMLFReader.cpp + DataReader\Kaldi2Reader\HTKMLFReader.h = DataReader\Kaldi2Reader\HTKMLFReader.h + DataReader\Kaldi2Reader\HTKMLFWriter.cpp = DataReader\Kaldi2Reader\HTKMLFWriter.cpp + DataReader\Kaldi2Reader\HTKMLFWriter.h = DataReader\Kaldi2Reader\HTKMLFWriter.h + DataReader\Kaldi2Reader\kaldi.h = DataReader\Kaldi2Reader\kaldi.h + DataReader\Kaldi2Reader\KaldiSequenceTrainingDerivative.cpp = DataReader\Kaldi2Reader\KaldiSequenceTrainingDerivative.cpp + DataReader\Kaldi2Reader\KaldiSequenceTrainingDerivative.h = DataReader\Kaldi2Reader\KaldiSequenceTrainingDerivative.h + DataReader\Kaldi2Reader\latticearchive.cpp = DataReader\Kaldi2Reader\latticearchive.cpp + DataReader\Kaldi2Reader\latticearchive.h = DataReader\Kaldi2Reader\latticearchive.h + DataReader\Kaldi2Reader\latticestorage.h = DataReader\Kaldi2Reader\latticestorage.h + DataReader\Kaldi2Reader\minibatchiterator.h = DataReader\Kaldi2Reader\minibatchiterator.h + DataReader\Kaldi2Reader\minibatchsourcehelpers.h = DataReader\Kaldi2Reader\minibatchsourcehelpers.h + DataReader\Kaldi2Reader\msra_mgram.h = DataReader\Kaldi2Reader\msra_mgram.h + DataReader\Kaldi2Reader\notes.txt = DataReader\Kaldi2Reader\notes.txt + DataReader\Kaldi2Reader\numahelpers.h = DataReader\Kaldi2Reader\numahelpers.h + DataReader\Kaldi2Reader\pplhelpers.h = DataReader\Kaldi2Reader\pplhelpers.h + DataReader\Kaldi2Reader\readaheadsource.h = DataReader\Kaldi2Reader\readaheadsource.h + DataReader\Kaldi2Reader\rollingwindowsource.h = DataReader\Kaldi2Reader\rollingwindowsource.h + 
DataReader\Kaldi2Reader\simple_checked_arrays.h = DataReader\Kaldi2Reader\simple_checked_arrays.h
+		DataReader\Kaldi2Reader\simplesenonehmm.h = DataReader\Kaldi2Reader\simplesenonehmm.h
+		DataReader\Kaldi2Reader\simplethread.h = DataReader\Kaldi2Reader\simplethread.h
+		DataReader\Kaldi2Reader\ssefloat4.h = DataReader\Kaldi2Reader\ssefloat4.h
+		DataReader\Kaldi2Reader\ssematrix.h = DataReader\Kaldi2Reader\ssematrix.h
+		DataReader\Kaldi2Reader\stdafx.cpp = DataReader\Kaldi2Reader\stdafx.cpp
+		DataReader\Kaldi2Reader\stdafx.h = DataReader\Kaldi2Reader\stdafx.h
+		DataReader\Kaldi2Reader\targetver.h = DataReader\Kaldi2Reader\targetver.h
+		DataReader\Kaldi2Reader\UtteranceDerivativeBuffer.cpp = DataReader\Kaldi2Reader\UtteranceDerivativeBuffer.cpp
+		DataReader\Kaldi2Reader\UtteranceDerivativeBuffer.h = DataReader\Kaldi2Reader\UtteranceDerivativeBuffer.h
+		DataReader\Kaldi2Reader\UtteranceDerivativeComputationInterface.h = DataReader\Kaldi2Reader\UtteranceDerivativeComputationInterface.h
+		DataReader\Kaldi2Reader\utterancesourcemulti.h = DataReader\Kaldi2Reader\utterancesourcemulti.h
+	EndProjectSection
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|x64 = Debug|x64
@@ -376,5 +460,7 @@ Global
 		{B6725C9F-A6D2-4269-9B74-7888A90F7884} = {5E666C53-2D82-49C9-9127-3FDDC321C741}
 		{B27DD434-EECD-4EE0-A03B-1150EB87258E} = {B6725C9F-A6D2-4269-9B74-7888A90F7884}
 		{A4884465-CFBB-4A64-A9DE-690E1A63EF7E} = {B6725C9F-A6D2-4269-9B74-7888A90F7884}
+		{3E9C89B1-C045-4F42-92B2-F9FFFFC2DBD4} = {39E42C4B-A078-4CA4-9D92-B883D8129601}
+		{C70E1572-20FF-496C-A0A9-10AA6755A07C} = {39E42C4B-A078-4CA4-9D92-B883D8129601}
 	EndGlobalSection
 EndGlobal

From af7409309cea8b2a065de4eb7fc23df9c87ed89e Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Fri, 18 Sep 2015 16:04:20 -0700
Subject: [PATCH 05/44] temporarily renamed m_nbrSlicesInEachRecurrentIteration
 to the distinct m_nbrSlicesInEachRecurrentIterationx, as this is supposed to
 disappear soon; GetActualMBSize() is now const, and renamed to
 DetermineActualMBSizeFromFeatures()

---
 .../ComputationNetwork.cpp | 2 +-
 .../ComputationNetwork.h | 58 ++++++++++---------
 MachineLearning/CNTKSGDLib/MultiNetworksSGD.h | 9 ++-
 MachineLearning/CNTKSGDLib/SGD.cpp | 7 ++-
 MachineLearning/CNTKSGDLib/SimpleEvaluator.h | 29 ++++++----
 .../CNTKSGDLib/SimpleOutputWriter.h | 4 +-
 Math/Math/Matrix.h | 8 ++-
 7 files changed, 66 insertions(+), 51 deletions(-)

diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp
index c4492746c1d8..d5b08827e64e 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp
@@ -193,7 +193,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ENodeList");
 
-        size_t actualMBSize = GetActualMBSize();
+        size_t actualMBSize = DetermineActualMBSizeFromFeatures();
         SetActualMiniBatchSize(actualMBSize);
 
         if (requireValidation)
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
index dfb819948d5e..4454f83499fc 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
@@ -80,7 +80,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
         m_randomSeedOffset = 0;
         m_actMiniBSize = 0;
         SetDeviceId(deviceId);
-
m_nbrSlicesInEachRecurrentIteration = 1; + m_nbrSlicesInEachRecurrentIterationx = 1; } virtual ~ComputationNetwork() @@ -199,7 +199,10 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // evaluation // ----------------------------------------------------------------------- - size_t GetActualMBSize() + // determine the actual MB size from the feature nodes + // This returns max number of columns over the feature nodes. + // Note that if we have multiple slices, MB size != #frames. + size_t DetermineActualMBSizeFromFeatures() const { size_t actualMBSize = 0; @@ -555,7 +558,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb for (auto nodeIter = recurrentNodes.begin(); nodeIter != recurrentNodes.end(); nodeIter++) (*nodeIter)->SetFunctionAndGradientSize(m_actMiniBSize); - int iMBSize = m_actMiniBSize / m_nbrSlicesInEachRecurrentIteration; + int iMBSize = m_actMiniBSize / m_nbrSlicesInEachRecurrentIterationx; if (m_recurrentInfo[iLoopId].m_isForwardLoop) { @@ -595,9 +598,9 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb { // checks that will disappear once we complete the refactoring. If this passes for a while, we will eliminate one // If this fails, comment this out (it is safe) and tell fseide@microsoft.com. - if (m_pMBLayout && m_nbrSlicesInEachRecurrentIteration != m_pMBLayout->GetNumStreams()) + if (m_nbrSlicesInEachRecurrentIterationx != m_pMBLayout->GetNumStreams()) LogicError("Evaluate: detected that m_nbrSlicesInEachRecurrentIteration != m_pMBLayout->GetNumStreams()"); - if (m_pMBLayout && m_pMBLayout->GetNumFrames() != m_pMBLayout->GetSize()) + if (m_pMBLayout->GetNumFrames() != m_pMBLayout->GetSize()) LogicError("Evaluate: detected that m_pMBLayout->GetNumFrames() != m_pMBLayout->GetSize()"); // prepare to compute with the subnetwork that this rootNode depends on, including @@ -622,7 +625,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb for (auto nodeIter = allNodes.begin(); nodeIter != allNodes.end(); nodeIter++) { // TODO: nbrSlices set once to the same value for all nodes each evaluation--is it ever changed later? - (*nodeIter)->SetNbrSlicesInEachRecurrentIteration(m_nbrSlicesInEachRecurrentIteration); + (*nodeIter)->SetNbrSlicesInEachRecurrentIteration(m_nbrSlicesInEachRecurrentIterationx); if ((*nodeIter)->ReqMultiSeqHandling()) (*nodeIter)->ResetBound(m_pMBLayout); } @@ -650,7 +653,9 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb } } - void SetActualMiniBatchSize(const size_t aSize, vector* featNodes = nullptr) + // resize entire network to handle a given MB size + // TODO: Is this always called with the result of DetermineActualMBSizeFromFeatures()? Why would it ever not? + void SetActualMiniBatchSize(const size_t aSize) { m_actMiniBSize = (int) aSize; @@ -664,24 +669,22 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb for (int i = 0; i < m_recurrentInfo.size(); i++) for (auto nodeIter = m_recurrentInfo[i].m_recurrentNodes.begin(); nodeIter != m_recurrentInfo[i].m_recurrentNodes.end(); nodeIter++) (*nodeIter)->SetFunctionAndGradientSize(m_actMiniBSize); - - if (featNodes) - { - for (auto ptr = featNodes->begin(); ptr != featNodes->end(); ptr++) - { - size_t nr = (*ptr)->GetNumRows(); - (*ptr)->Resize(nr, aSize); - } - } } // GetMaxMBSize - Get the maximum minibatch size that will be seen in a training run - // returns the result from SetActualMiniBatchSize(). 
Note GetActualMBSize() also exists but returns a value derived from the inputs dimensions + // returns the result from SetActualMiniBatchSize(). Note DetermineActualMBSizeFromFeatures() also exists but returns a value derived from the inputs dimensions size_t GetMaxMBSize() { return m_actMiniBSize; } + // always called in this pattern: +#if 0 + evalnet->SetActualMiniBatchSize(mbSize); + evalnet->SetActualNbrSlicesInEachRecurentIteration(dataReader->NumberSlicesInEachRecurrentIter()); + dataReader->CopyMBLayoutTo(evalnet->GetMBLayoutPtr()); + // well... most of the time. Not in TrainOneEpoch(). +#endif void SetActualNbrSlicesInEachRecurentIteration(const size_t aSize) { - m_nbrSlicesInEachRecurrentIteration = aSize; + m_nbrSlicesInEachRecurrentIterationx = aSize; } void ComputeGradientLoop(std::list& /*allNodes*/, const ComputationNodeBasePtr startNode) @@ -692,14 +695,14 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb { if (m_recurrentInfo[iLoopId].m_completedGradient == false) { - int mbSize = m_actMiniBSize / m_nbrSlicesInEachRecurrentIteration; + int mbSize = m_actMiniBSize / m_nbrSlicesInEachRecurrentIterationx; if (m_recurrentInfo[iLoopId].m_isForwardLoop) { for (int timeIndex = mbSize - 1; timeIndex >= 0; timeIndex--) { for (auto nodeIter = recurrentNodes.rbegin(); nodeIter != recurrentNodes.rend(); ++nodeIter) { - (*nodeIter)->SetNbrSlicesInEachRecurrentIteration(m_nbrSlicesInEachRecurrentIteration); // TODO: move to FrameRange object + (*nodeIter)->SetNbrSlicesInEachRecurrentIteration(m_nbrSlicesInEachRecurrentIterationx); // TODO: move to FrameRange object (*nodeIter)->ComputeGradientForChildren(timeIndex); } } @@ -710,7 +713,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb { for (auto nodeIter = recurrentNodes.rbegin(); nodeIter != recurrentNodes.rend(); ++nodeIter) { - (*nodeIter)->SetNbrSlicesInEachRecurrentIteration(m_nbrSlicesInEachRecurrentIteration); + (*nodeIter)->SetNbrSlicesInEachRecurrentIteration(m_nbrSlicesInEachRecurrentIterationx); (*nodeIter)->ComputeGradientForChildren(timeIndex); } } @@ -856,9 +859,10 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb return m_learnableParameters[rootNode]; } - inline std::vector & FeatureNodes() { return m_features; } - inline std::vector & LabelNodes() { return m_labels; } - inline std::vector & FinalCriterionNodes() { return m_finalCriteria; } + inline std::vector & FeatureNodes() { return m_features; } + inline const std::vector & FeatureNodes() const { return m_features; } + inline std::vector & LabelNodes() { return m_labels; } + inline std::vector & FinalCriterionNodes() { return m_finalCriteria; } inline std::vector CriterionNodesFrom(const wstring & criterionNodeName) { @@ -1101,7 +1105,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb if (!allowFragment) FormRecurrentLoops(node); PrintComputationTree(node, false); - size_t actualMBSize = this->GetActualMBSize(); + size_t actualMBSize = this->DetermineActualMBSizeFromFeatures(); this->SetActualMiniBatchSize(actualMBSize); ValidateSubNetwork(node); } @@ -1273,7 +1277,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb { if (!allowFragment) FormRecurrentLoops(node); - size_t actualMBSize = this->GetActualMBSize(); + size_t actualMBSize = this->DetermineActualMBSizeFromFeatures(); this->SetActualMiniBatchSize(actualMBSize); if (!UnitTest(node)) vErrors.push_back(node->NodeName().c_str()); @@ -1559,7 
+1563,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb MBLayoutPtr m_pMBLayout; int m_actMiniBSize; - size_t m_nbrSlicesInEachRecurrentIteration; + size_t m_nbrSlicesInEachRecurrentIterationx; // main node holder std::map m_nameToNodeMap; // [name] -> node; this is the main container that holds this networks' nodes diff --git a/MachineLearning/CNTKSGDLib/MultiNetworksSGD.h b/MachineLearning/CNTKSGDLib/MultiNetworksSGD.h index 9199c0a0254a..499230767b45 100644 --- a/MachineLearning/CNTKSGDLib/MultiNetworksSGD.h +++ b/MachineLearning/CNTKSGDLib/MultiNetworksSGD.h @@ -880,7 +880,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (!bContinueDecoding) break; - size_t actualMBSize = decoderNet->GetActualMBSize(); + size_t actualMBSize = decoderNet->DetermineActualMBSizeFromFeatures(); if (actualMBSize == 0) LogicError("decoderTrainSetDataReader read data but decoderNet reports no data read"); @@ -1157,7 +1157,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix& localEpochEvalErrors ) { - size_t actualMBSize = encoderNet->GetActualMBSize(); + size_t actualMBSize = encoderNet->DetermineActualMBSizeFromFeatures(); encoderNet->SetActualMiniBatchSize(actualMBSize); encoderNet->SetActualNbrSlicesInEachRecurentIteration(encoderTrainSetDataReader->NumberSlicesInEachRecurrentIter()); @@ -1165,13 +1165,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { encoderNet->Evaluate(encoderEvaluationNodes[0]); - actualMBSize = decoderNet->GetActualMBSize(); + actualMBSize = decoderNet->DetermineActualMBSizeFromFeatures(); decoderNet->SetActualMiniBatchSize(actualMBSize); decoderNet->SetActualNbrSlicesInEachRecurentIteration(decoderTrainSetDataReader->NumberSlicesInEachRecurrentIter()); - - /// not the sentence begining, because the initial hidden layer activity is from the encoder network decoderTrainSetDataReader->CopyMBLayoutTo(decoderNet->GetMBLayoutPtr()); + /// not the sentence begining, because the initial hidden layer activity is from the encoder network if (decoderCriterionNodes.size() == 0 && decoderEvaluationNodes.size() == 0) { diff --git a/MachineLearning/CNTKSGDLib/SGD.cpp b/MachineLearning/CNTKSGDLib/SGD.cpp index c555eae333d5..9775718f0de4 100644 --- a/MachineLearning/CNTKSGDLib/SGD.cpp +++ b/MachineLearning/CNTKSGDLib/SGD.cpp @@ -1306,7 +1306,7 @@ template ComputationNetwork::UpdateEvalTimeStamps(featureNodes); ComputationNetwork::UpdateEvalTimeStamps(labelNodes); - size_t actualMBSize = net.GetActualMBSize(); + size_t actualMBSize = net.DetermineActualMBSizeFromFeatures(); net.SetActualMiniBatchSize(actualMBSize); net.SetActualNbrSlicesInEachRecurentIteration(trainSetDataReader->NumberSlicesInEachRecurrentIter()); trainSetDataReader->CopyMBLayoutTo(net.GetMBLayoutPtr()); @@ -1766,7 +1766,7 @@ template if (outputNodes.empty()) LogicError("no output node was found."); - size_t actualMBSize = net.GetActualMBSize(); + size_t actualMBSize = net.DetermineActualMBSizeFromFeatures(); net.SetActualMiniBatchSize(actualMBSize); net.SetActualNbrSlicesInEachRecurentIteration(trainSetDataReader->NumberSlicesInEachRecurrentIter()); trainSetDataReader->CopyMBLayoutTo(net.GetMBLayoutPtr()); @@ -1943,7 +1943,7 @@ template } } - actualMBSize = net.GetActualMBSize(); + actualMBSize = net.DetermineActualMBSizeFromFeatures(); if (actualMBSize != 0) { nSamplesSinceLastModelSync += actualMBSize; @@ -1972,6 +1972,7 @@ template { refNet.SetActualMiniBatchSize(actualMBSize); 
refNet.SetActualNbrSlicesInEachRecurentIteration(trainSetDataReader->NumberSlicesInEachRecurrentIter()); + // TODO: not setting MBLayout? refNet.Evaluate(refNode); Matrix::ScaleAndAdd((ElemType)m_adaptationRegWeight, dynamic_pointer_cast>(refNode)->FunctionValues(), diff --git a/MachineLearning/CNTKSGDLib/SimpleEvaluator.h b/MachineLearning/CNTKSGDLib/SimpleEvaluator.h index afad9252e16a..3f2b1445806c 100644 --- a/MachineLearning/CNTKSGDLib/SimpleEvaluator.h +++ b/MachineLearning/CNTKSGDLib/SimpleEvaluator.h @@ -127,7 +127,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputationNetwork::UpdateEvalTimeStamps(featureNodes); ComputationNetwork::UpdateEvalTimeStamps(labelNodes); - actualMBSize = m_net.GetActualMBSize(); + actualMBSize = m_net.DetermineActualMBSizeFromFeatures(); m_net.SetActualMiniBatchSize(actualMBSize); m_net.SetActualNbrSlicesInEachRecurentIteration(dataReader->NumberSlicesInEachRecurrentIter()); dataReader->CopyMBLayoutTo(m_net.GetMBLayoutPtr()); @@ -445,7 +445,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { auto preader = dataReaders.begin(); for (auto ptr = nets.begin(); ptr != nets.end(); ptr++, preader++) { - actualMBSize = (*ptr)->GetActualMBSize(); + actualMBSize = (*ptr)->DetermineActualMBSizeFromFeatures(); if (actualMBSize == 0) LogicError("decoderTrainSetDataReader read data but encoderNet reports no data read"); @@ -460,10 +460,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { decoderNet = nets[iNumNets - 1]; /// not the sentence begining, because the initial hidden layer activity is from the encoder network - actualMBSize = decoderNet->GetActualMBSize(); - decoderNet->SetActualMiniBatchSize(actualMBSize); + actualMBSize = decoderNet->DetermineActualMBSizeFromFeatures(); if (actualMBSize == 0) LogicError("decoderTrainSetDataReader read data but decoderNet reports no data read"); + decoderNet->SetActualMiniBatchSize(actualMBSize); decoderNet->SetActualNbrSlicesInEachRecurentIteration(decoderDataReader->NumberSlicesInEachRecurrentIter()); decoderDataReader->CopyMBLayoutTo(decoderNet->GetMBLayoutPtr()); @@ -663,10 +663,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (auto ptr = nets.begin(); ptr != nets.end() - 1; ptr++, ptrreader++) { /// evaluate on the encoder networks - actualMBSize = (*ptr)->GetActualMBSize(); + actualMBSize = (*ptr)->DetermineActualMBSizeFromFeatures(); - (*ptr)->SetActualMiniBatchSize(actualMBSize); mNutt = (*ptrreader)->NumberSlicesInEachRecurrentIter(); + (*ptr)->SetActualMiniBatchSize(actualMBSize); (*ptr)->SetActualNbrSlicesInEachRecurentIteration(mNutt); (*ptrreader)->CopyMBLayoutTo((*ptr)->GetMBLayoutPtr()); @@ -771,7 +771,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputationNetwork::UpdateEvalTimeStamps(featureNodes); - size_t actualMBSize = net.GetActualMBSize(); + size_t actualMBSize = net.DetermineActualMBSizeFromFeatures(); net.SetActualMiniBatchSize(actualMBSize); for (auto nodeIter = batchComputeNodes.begin(); nodeIter != batchComputeNodes.end(); nodeIter++) { @@ -846,7 +846,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { ComputationNetwork::UpdateEvalTimeStamps(featureNodes); - actualMBSize = m_net.GetActualMBSize(); + actualMBSize = m_net.DetermineActualMBSizeFromFeatures(); m_net.SetActualMiniBatchSize(actualMBSize); vector best_path; @@ -904,7 +904,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { vector evalResults; size_t mbSize; - mbSize = evalnet->GetActualMBSize(); + mbSize = evalnet->DetermineActualMBSizeFromFeatures(); 
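+        // (My reading: the factor of 2 below is a heuristic cap that lets decoding run up to
+        // twice the input length; I have not found the choice documented anywhere.)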
size_t maxMbSize = 2 * mbSize; /// use reader to initialize evalnet's sentence start information to let it know that this @@ -930,7 +930,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { /// need to set the minibatch size to 1, and initialize evalnet's sentence start information to let it know that this /// is the begining of sentence - evalnet->SetActualMiniBatchSize(1, &featureNodes); + evalnet->SetActualMiniBatchSize(1/*, &featureNodes*/); + for (auto ptr = featureNodes.begin(); ptr != featureNodes.end(); ptr++) + { + size_t nr = (*ptr)->GetNumRows(); + (*ptr)->Resize(nr, 1); + } + dataReader->CopyMBLayoutTo(evalnet->GetMBLayoutPtr()); /// need to set the sentence begining segmentation info evalnet->GetMBLayoutPtr()->GetM().SetValue(((int) MinibatchPackingFlags::SequenceStart)); @@ -1067,9 +1073,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { maxMbSize = 2; #endif /// use reader to initialize evalnet's sentence start information to let it know that this - /// is the begining of sentence + /// is the beginning of sentence evalnet->SetActualMiniBatchSize(mbSize); evalnet->SetActualNbrSlicesInEachRecurentIteration(dataReader->NumberSlicesInEachRecurrentIter()); + // TODO: not setting MBLayout? clock_t start, now; start = clock(); diff --git a/MachineLearning/CNTKSGDLib/SimpleOutputWriter.h b/MachineLearning/CNTKSGDLib/SimpleOutputWriter.h index fcce99d34e17..1fc6cc8bd9ea 100644 --- a/MachineLearning/CNTKSGDLib/SimpleOutputWriter.h +++ b/MachineLearning/CNTKSGDLib/SimpleOutputWriter.h @@ -72,7 +72,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputationNetwork::UpdateEvalTimeStamps(featureNodes); ComputationNetwork::UpdateEvalTimeStamps(labelNodes); - size_t actualMBSize = m_net.GetActualMBSize(); + size_t actualMBSize = m_net.DetermineActualMBSizeFromFeatures(); // TODO: should this be dataReader.DetermineActualMBSizeFromFeatures()? m_net.SetActualMiniBatchSize(actualMBSize); m_net.SetActualNbrSlicesInEachRecurentIteration(dataReader.NumberSlicesInEachRecurrentIter()); dataReader.CopyMBLayoutTo(m_net.GetMBLayoutPtr()); @@ -154,7 +154,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { ComputationNetwork::UpdateEvalTimeStamps(featureNodes); - size_t actualMBSize = m_net.GetActualMBSize(); + size_t actualMBSize = m_net.DetermineActualMBSizeFromFeatures(); m_net.SetActualMiniBatchSize(actualMBSize); dataReader.CopyMBLayoutTo(m_net.GetMBLayoutPtr()); diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h index cef4d3143489..317e7387e6bf 100644 --- a/Math/Math/Matrix.h +++ b/Math/Math/Matrix.h @@ -515,11 +515,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { typedef Matrix SingleMatrix; typedef Matrix DoubleMatrix; - // TODO: move this to an appropriate place and name it properly // MBLayout -- layout information of minibatch // Currently this is to bind the two somewhat inconsistent boundary flags and packing flags. - // Once that is unified, we can clean it up further. For now, it's just moving the data members. + // Once that is unified, we can clean it up further. For now, it's just moving the data members and encapsulating access to them where possible. // This should probably also contain m_actualNbrSlicesInEachRecIter (which should be node-dependent). + // TODO: move this to an appropriate place and name it properly + // NOTE: This class represents an abstraction of an originally distributed/code-duped way of defining and accessing the MB layout. + // The code below represents the actual use cases I encountered. 
Not all are, I believe, needed to be as they are; this class could be simplified/streamlined much further. + // Some wackiness below is explained by this. + // TODO: frame-randoized MBs are now represented as one stream of many frames. This is wrong; they should be one-frame utterances with many streams. Once we fully abstract out Data access, this can be changed easily. struct MBLayout { MBLayout() : m_sentenceBoundaryFlags(CPUDEVICE) { } From 37f39ac1efd0b9236e893d5797696e852e10890d Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 18 Sep 2015 16:32:49 -0700 Subject: [PATCH 06/44] streamlined pattern { actualMBSize = DetermineActualMBSizeFromFeatures(); SetActualMiniBatchSize(actualMBSize); } to SetActualMiniBatchSizeFromFeatures() (keeps things a little more encapsulated. Unfortunately, there are still a few exceptions) --- .../ComputationNetwork.cpp | 3 +-- .../ComputationNetwork.h | 24 ++++++++++++------ MachineLearning/CNTKSGDLib/MultiNetworksSGD.h | 8 ++---- MachineLearning/CNTKSGDLib/SGD.cpp | 16 ++++++------ MachineLearning/CNTKSGDLib/SimpleEvaluator.h | 25 ++++++------------- .../CNTKSGDLib/SimpleOutputWriter.h | 6 ++--- 6 files changed, 36 insertions(+), 46 deletions(-) diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp index d5b08827e64e..960cf8c45d54 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp @@ -193,8 +193,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ENodeList"); - size_t actualMBSize = DetermineActualMBSizeFromFeatures(); - SetActualMiniBatchSize(actualMBSize); + SetActualMiniBatchSizeFromFeatures(); if (requireValidation) ValidateNetwork(); diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h index 4454f83499fc..5bd1ba82bf11 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h @@ -654,6 +654,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb } // resize entire network to handle a given MB size + // TODO: actually it only updates nodes in m_recurrentInfo. Why? Because without recurrence, size never changes? // TODO: Is this always called with the result of DetermineActualMBSizeFromFeatures()? Why would it ever not? void SetActualMiniBatchSize(const size_t aSize) { @@ -666,18 +667,27 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb m_recurrentInfo[i].m_completedGradient = false; } + // resize function values and gradients of everything in m_recurrentInfo for (int i = 0; i < m_recurrentInfo.size(); i++) - for (auto nodeIter = m_recurrentInfo[i].m_recurrentNodes.begin(); nodeIter != m_recurrentInfo[i].m_recurrentNodes.end(); nodeIter++) - (*nodeIter)->SetFunctionAndGradientSize(m_actMiniBSize); + for (auto nodeIter : m_recurrentInfo[i].m_recurrentNodes) + nodeIter->SetFunctionAndGradientSize(m_actMiniBSize); + } + + // it is used this way most of the time + size_t SetActualMiniBatchSizeFromFeatures() + { + size_t aSize = DetermineActualMBSizeFromFeatures(); + SetActualMiniBatchSize(aSize); + return aSize; } // GetMaxMBSize - Get the maximum minibatch size that will be seen in a training run // returns the result from SetActualMiniBatchSize(). 
Note DetermineActualMBSizeFromFeatures() also exists but returns a value derived from the inputs dimensions size_t GetMaxMBSize() { return m_actMiniBSize; } - // always called in this pattern: #if 0 - evalnet->SetActualMiniBatchSize(mbSize); + // always called in this pattern: + evalnet->SetActualMiniBatchSizeFromFeatures(); evalnet->SetActualNbrSlicesInEachRecurentIteration(dataReader->NumberSlicesInEachRecurrentIter()); dataReader->CopyMBLayoutTo(evalnet->GetMBLayoutPtr()); // well... most of the time. Not in TrainOneEpoch(). @@ -1105,8 +1115,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb if (!allowFragment) FormRecurrentLoops(node); PrintComputationTree(node, false); - size_t actualMBSize = this->DetermineActualMBSizeFromFeatures(); - this->SetActualMiniBatchSize(actualMBSize); + SetActualMiniBatchSizeFromFeatures(); ValidateSubNetwork(node); } } @@ -1277,8 +1286,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb { if (!allowFragment) FormRecurrentLoops(node); - size_t actualMBSize = this->DetermineActualMBSizeFromFeatures(); - this->SetActualMiniBatchSize(actualMBSize); + this->SetActualMiniBatchSizeFromFeatures(); if (!UnitTest(node)) vErrors.push_back(node->NodeName().c_str()); } diff --git a/MachineLearning/CNTKSGDLib/MultiNetworksSGD.h b/MachineLearning/CNTKSGDLib/MultiNetworksSGD.h index 499230767b45..2763b02a20f1 100644 --- a/MachineLearning/CNTKSGDLib/MultiNetworksSGD.h +++ b/MachineLearning/CNTKSGDLib/MultiNetworksSGD.h @@ -1157,17 +1157,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix& localEpochEvalErrors ) { - size_t actualMBSize = encoderNet->DetermineActualMBSizeFromFeatures(); - - encoderNet->SetActualMiniBatchSize(actualMBSize); + encoderNet->SetActualMiniBatchSizeFromFeatures(); encoderNet->SetActualNbrSlicesInEachRecurentIteration(encoderTrainSetDataReader->NumberSlicesInEachRecurrentIter()); encoderTrainSetDataReader->CopyMBLayoutTo(encoderNet->GetMBLayoutPtr()); encoderNet->Evaluate(encoderEvaluationNodes[0]); - actualMBSize = decoderNet->DetermineActualMBSizeFromFeatures(); - - decoderNet->SetActualMiniBatchSize(actualMBSize); + decoderNet->SetActualMiniBatchSizeFromFeatures(); decoderNet->SetActualNbrSlicesInEachRecurentIteration(decoderTrainSetDataReader->NumberSlicesInEachRecurrentIter()); decoderTrainSetDataReader->CopyMBLayoutTo(decoderNet->GetMBLayoutPtr()); /// not the sentence begining, because the initial hidden layer activity is from the encoder network diff --git a/MachineLearning/CNTKSGDLib/SGD.cpp b/MachineLearning/CNTKSGDLib/SGD.cpp index 9775718f0de4..4adcf874fbaf 100644 --- a/MachineLearning/CNTKSGDLib/SGD.cpp +++ b/MachineLearning/CNTKSGDLib/SGD.cpp @@ -1306,8 +1306,7 @@ template ComputationNetwork::UpdateEvalTimeStamps(featureNodes); ComputationNetwork::UpdateEvalTimeStamps(labelNodes); - size_t actualMBSize = net.DetermineActualMBSizeFromFeatures(); - net.SetActualMiniBatchSize(actualMBSize); + net.SetActualMiniBatchSizeFromFeatures(); net.SetActualNbrSlicesInEachRecurentIteration(trainSetDataReader->NumberSlicesInEachRecurrentIter()); trainSetDataReader->CopyMBLayoutTo(net.GetMBLayoutPtr()); @@ -1766,8 +1765,7 @@ template if (outputNodes.empty()) LogicError("no output node was found."); - size_t actualMBSize = net.DetermineActualMBSizeFromFeatures(); - net.SetActualMiniBatchSize(actualMBSize); + net.SetActualMiniBatchSizeFromFeatures(); net.SetActualNbrSlicesInEachRecurentIteration(trainSetDataReader->NumberSlicesInEachRecurrentIter()); 
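+        // (Same three-step per-minibatch setup that ComputationNetwork.h documents as "always
+        // called in this pattern": MB size from features, then #slices, then the reader's MBLayout.)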
trainSetDataReader->CopyMBLayoutTo(net.GetMBLayoutPtr()); net.Evaluate(outputNodes[0]); // Only evaluate the first output @@ -1943,23 +1941,23 @@ template } } - actualMBSize = net.DetermineActualMBSizeFromFeatures(); + actualMBSize = net.SetActualMiniBatchSizeFromFeatures(); if (actualMBSize != 0) { - nSamplesSinceLastModelSync += actualMBSize; - net.SetActualMiniBatchSize(actualMBSize); - net.SetActualNbrSlicesInEachRecurentIteration(nSlices); - if (!useDistributedMBReading && useParallelTrain && trainSetDataReader->RequireSentenceSeg()) { + net.SetActualNbrSlicesInEachRecurentIteration(nSlices); *net.GetMBLayoutPtr() = *pMBLayout; // TODO: ^^ we should just pass pointers; this current code is semantically identical to before the change to MBLayout } else { + net.SetActualNbrSlicesInEachRecurentIteration(nSlices); trainSetDataReader->CopyMBLayoutTo(net.GetMBLayoutPtr()); } + nSamplesSinceLastModelSync += actualMBSize; + ComputationNetwork::UpdateEvalTimeStamps(featureNodes); ComputationNetwork::UpdateEvalTimeStamps(labelNodes); diff --git a/MachineLearning/CNTKSGDLib/SimpleEvaluator.h b/MachineLearning/CNTKSGDLib/SimpleEvaluator.h index 3f2b1445806c..6306903b36b1 100644 --- a/MachineLearning/CNTKSGDLib/SimpleEvaluator.h +++ b/MachineLearning/CNTKSGDLib/SimpleEvaluator.h @@ -127,8 +127,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputationNetwork::UpdateEvalTimeStamps(featureNodes); ComputationNetwork::UpdateEvalTimeStamps(labelNodes); - actualMBSize = m_net.DetermineActualMBSizeFromFeatures(); - m_net.SetActualMiniBatchSize(actualMBSize); + actualMBSize = m_net.SetActualMiniBatchSizeFromFeatures(); m_net.SetActualNbrSlicesInEachRecurentIteration(dataReader->NumberSlicesInEachRecurrentIter()); dataReader->CopyMBLayoutTo(m_net.GetMBLayoutPtr()); @@ -445,11 +444,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { auto preader = dataReaders.begin(); for (auto ptr = nets.begin(); ptr != nets.end(); ptr++, preader++) { - actualMBSize = (*ptr)->DetermineActualMBSizeFromFeatures(); + actualMBSize = (*ptr)->SetActualMiniBatchSizeFromFeatures(); if (actualMBSize == 0) LogicError("decoderTrainSetDataReader read data but encoderNet reports no data read"); - - (*ptr)->SetActualMiniBatchSize(actualMBSize); (*ptr)->SetActualNbrSlicesInEachRecurentIteration((*preader)->NumberSlicesInEachRecurrentIter()); (*preader)->CopyMBLayoutTo((*ptr)->GetMBLayoutPtr()); @@ -460,10 +457,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { decoderNet = nets[iNumNets - 1]; /// not the sentence begining, because the initial hidden layer activity is from the encoder network - actualMBSize = decoderNet->DetermineActualMBSizeFromFeatures(); + actualMBSize = decoderNet->SetActualMiniBatchSizeFromFeatures(); if (actualMBSize == 0) LogicError("decoderTrainSetDataReader read data but decoderNet reports no data read"); - decoderNet->SetActualMiniBatchSize(actualMBSize); decoderNet->SetActualNbrSlicesInEachRecurentIteration(decoderDataReader->NumberSlicesInEachRecurrentIter()); decoderDataReader->CopyMBLayoutTo(decoderNet->GetMBLayoutPtr()); @@ -657,16 +653,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputationNetwork::UpdateEvalTimeStamps(featNodes); } - auto ptrreader = readers.begin(); size_t mNutt = 0; for (auto ptr = nets.begin(); ptr != nets.end() - 1; ptr++, ptrreader++) { /// evaluate on the encoder networks - actualMBSize = (*ptr)->DetermineActualMBSizeFromFeatures(); + actualMBSize = (*ptr)->SetActualMiniBatchSizeFromFeatures(); mNutt = 
(*ptrreader)->NumberSlicesInEachRecurrentIter(); - (*ptr)->SetActualMiniBatchSize(actualMBSize); (*ptr)->SetActualNbrSlicesInEachRecurentIteration(mNutt); (*ptrreader)->CopyMBLayoutTo((*ptr)->GetMBLayoutPtr()); @@ -771,12 +765,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputationNetwork::UpdateEvalTimeStamps(featureNodes); - size_t actualMBSize = net.DetermineActualMBSizeFromFeatures(); - net.SetActualMiniBatchSize(actualMBSize); + net.SetActualMiniBatchSizeFromFeatures(); for (auto nodeIter = batchComputeNodes.begin(); nodeIter != batchComputeNodes.end(); nodeIter++) - { net.Evaluate(*nodeIter); - } //mark done for (auto nodeIter = batchComputeNodes.begin(); nodeIter != batchComputeNodes.end(); nodeIter++) @@ -846,8 +837,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { ComputationNetwork::UpdateEvalTimeStamps(featureNodes); - actualMBSize = m_net.DetermineActualMBSizeFromFeatures(); - m_net.SetActualMiniBatchSize(actualMBSize); + actualMBSize = m_net.SetActualMiniBatchSizeFromFeatures(); vector best_path; @@ -930,7 +920,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { /// need to set the minibatch size to 1, and initialize evalnet's sentence start information to let it know that this /// is the begining of sentence - evalnet->SetActualMiniBatchSize(1/*, &featureNodes*/); + evalnet->SetActualMiniBatchSize(1); for (auto ptr = featureNodes.begin(); ptr != featureNodes.end(); ptr++) { size_t nr = (*ptr)->GetNumRows(); @@ -1097,6 +1087,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { /// need to set the minibatch size to 1, and initialize evalnet's sentence start information to let it know that this /// is the begining of sentence + // BUGBUG: This is almost certainly wrong; slice != MB size evalnet->SetActualMiniBatchSize(dataReader->NumberSlicesInEachRecurrentIter()); double best_score = -numeric_limits::infinity(); diff --git a/MachineLearning/CNTKSGDLib/SimpleOutputWriter.h b/MachineLearning/CNTKSGDLib/SimpleOutputWriter.h index 1fc6cc8bd9ea..9adb3184c820 100644 --- a/MachineLearning/CNTKSGDLib/SimpleOutputWriter.h +++ b/MachineLearning/CNTKSGDLib/SimpleOutputWriter.h @@ -72,8 +72,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputationNetwork::UpdateEvalTimeStamps(featureNodes); ComputationNetwork::UpdateEvalTimeStamps(labelNodes); - size_t actualMBSize = m_net.DetermineActualMBSizeFromFeatures(); // TODO: should this be dataReader.DetermineActualMBSizeFromFeatures()? 
- m_net.SetActualMiniBatchSize(actualMBSize);
+ size_t actualMBSize = m_net.SetActualMiniBatchSizeFromFeatures();
 m_net.SetActualNbrSlicesInEachRecurentIteration(dataReader.NumberSlicesInEachRecurrentIter());
 dataReader.CopyMBLayoutTo(m_net.GetMBLayoutPtr());
@@ -154,8 +153,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
 ComputationNetwork::UpdateEvalTimeStamps(featureNodes);
- size_t actualMBSize = m_net.DetermineActualMBSizeFromFeatures();
- m_net.SetActualMiniBatchSize(actualMBSize);
+ size_t actualMBSize = m_net.SetActualMiniBatchSizeFromFeatures();
 dataReader.CopyMBLayoutTo(m_net.GetMBLayoutPtr());
 for (int i=0; i Date: Fri, 18 Sep 2015 17:35:07 -0700
Subject: [PATCH 07/44] changed { SetActualNbrSlicesInEachRecurentIteration(); CopyMBLayoutTo(); } to { CopyMBLayoutTo(); VerifyActualNumParallelSequences(); } and removed m_actualNbrSlicesInEachRecurentIteration; renamed ...NbrSlicesEachRecurrentIter to ...NumParallelSequences; also MBLayout::GetNumStreams() to GetNumParallelSequences() and GetNumFrames() to GetNumTimeSteps()
---
 Common/DataReader.cpp | 15 +++----
 Common/Include/DataReader.h | 6 +--
 DataReader/BinaryReader/BinaryReader.h | 5 +--
 DataReader/DSSMReader/DSSMReader.h | 4 +-
 DataReader/HTKMLFReader/HTKMLFReader.cpp | 1 +
 DataReader/HTKMLFReader/HTKMLFReader.h | 4 +-
 .../LMSequenceReader/SequenceReader.cpp | 2 +-
 DataReader/LMSequenceReader/SequenceReader.h | 2 +-
 .../LUSequenceReader/LUSequenceReader.cpp | 14 +++---
 .../LUSequenceReader/LUSequenceReader.h | 8 ++--
 .../LibSVMBinaryReader/LibSVMBinaryReader.h | 4 +-
 DataReader/SparsePCReader/SparsePCReader.h | 4 +-
 DataReader/UCIFastReader/UCIFastReader.cpp | 2 +-
 DataReader/UCIFastReader/UCIFastReader.h | 4 +-
 .../ComputationNetwork.h | 42 ++++++++++++-------
 .../ComputationNode.h | 8 ++--
 .../RecurrentNodes.h | 2 +-
 .../TrainingCriterionNodes.h | 2 +-
 MachineLearning/CNTKEval/EvalReader.h | 4 +-
 MachineLearning/CNTKSGDLib/MultiNetworksSGD.h | 4 +-
 MachineLearning/CNTKSGDLib/SGD.cpp | 29 ++++++-------
 MachineLearning/CNTKSGDLib/SimpleEvaluator.h | 27 ++++++------
 .../CNTKSGDLib/SimpleOutputWriter.h | 5 ++-
 Math/Math/Matrix.h | 12 +++---
 24 files changed, 112 insertions(+), 98 deletions(-)
diff --git a/Common/DataReader.cpp b/Common/DataReader.cpp
index 81f61c5e513a..764a7ae1ac9b 100644
--- a/Common/DataReader.cpp
+++ b/Common/DataReader.cpp
@@ -110,7 +110,7 @@ DataReader::DataReader(const ConfigParameters& config)
 for (size_t i = 0; i < m_ioNames.size(); i++)
 {
 m_dataReader[m_ioNames[i]]->Init(m_configure[m_ioNames[i]]);
- m_dataReader[m_ioNames[i]]->SetNbrSlicesEachRecurrentIter(mNbrUttPerMinibatch);
+ m_dataReader[m_ioNames[i]]->SetNumParallelSequences(mNbrUttPerMinibatch);
 }
 }
@@ -191,9 +191,9 @@ bool DataReader::GetMinibatch(std::map*
 for (size_t i = 0; i < m_ioNames.size(); i++)
 {
 if (nbr > 0)
- m_dataReader[m_ioNames[i]]->SetNbrSlicesEachRecurrentIter(nbr);
+ m_dataReader[m_ioNames[i]]->SetNumParallelSequences(nbr);
 bRet &= m_dataReader[m_ioNames[i]]->GetMinibatch(matrices);
- thisNbr = m_dataReader[m_ioNames[i]]->NumberSlicesInEachRecurrentIter();
+ thisNbr = m_dataReader[m_ioNames[i]]->GetNumParallelSequences();
 if (nbr > 0 && thisNbr != nbr)
 LogicError("DataReader::GetMinibatch: The specified number of utterances per minibatch is not consistent with the actual number of utterances per minibatch");
 nbr = thisNbr;
@@ -202,16 +202,16 @@ bool DataReader::GetMinibatch(std::map*
 }
 template
-size_t DataReader::NumberSlicesInEachRecurrentIter()
+size_t DataReader::GetNumParallelSequences()
{
 size_t nNbr = 0;
 for (size_t
i = 0; i < m_ioNames.size(); i++) { IDataReader * ptr = m_dataReader[m_ioNames[i]]; if (nNbr == 0) - nNbr = ptr->NumberSlicesInEachRecurrentIter(); - if (nNbr != ptr->NumberSlicesInEachRecurrentIter()) - LogicError("NumberSlicesInEachRecurrentIter: number of slices in each minibatch not consistent for these streams"); + nNbr = ptr->GetNumParallelSequences(); + if (nNbr != ptr->GetNumParallelSequences()) + LogicError("GetNumParallelSequences: number of slices in each minibatch not consistent for these streams"); } return nNbr; } @@ -244,6 +244,7 @@ bool DataReader::GetProposalObs(std::map void DataReader::CopyMBLayoutTo(MBLayoutPtr pMBLayout) { + // BUGBUG: This copies all data reader's layout info on top of each other, keeping only the last one; likely not what was intended. for (size_t i = 0; i < m_ioNames.size(); i++) m_dataReader[m_ioNames[i]]->CopyMBLayoutTo(pMBLayout); } diff --git a/Common/Include/DataReader.h b/Common/Include/DataReader.h index 0901826b0039..9e7007071549 100644 --- a/Common/Include/DataReader.h +++ b/Common/Include/DataReader.h @@ -78,9 +78,9 @@ class DATAREADER_API IDataReader } virtual bool GetMinibatch(std::map*>& matrices) = 0; - virtual size_t NumberSlicesInEachRecurrentIter() = 0; + virtual size_t GetNumParallelSequences() = 0; virtual int GetSentenceEndIdFromOutputLabel() { return -1; }; - virtual void SetNbrSlicesEachRecurrentIter(const size_t sz) { mBlgSize = sz; }; + virtual void SetNumParallelSequences(const size_t sz) { mBlgSize = sz; }; virtual bool RequireSentenceSeg() { return false; }; virtual const std::map& GetLabelMapping(const std::wstring&) { NOT_IMPLEMENTED; }; virtual void SetLabelMapping(const std::wstring&, const std::map&) { NOT_IMPLEMENTED; }; @@ -198,7 +198,7 @@ class DataReader: public IDataReader, protected Plugin // returns - true if there are more minibatches, false if no more minibatchs remain virtual bool GetMinibatch(std::map*>& matrices); - size_t NumberSlicesInEachRecurrentIter(); + size_t GetNumParallelSequences(); int GetSentenceEndIdFromOutputLabel(); // GetLabelMapping - Gets the label mapping from integer index to label type diff --git a/DataReader/BinaryReader/BinaryReader.h b/DataReader/BinaryReader/BinaryReader.h index 17061d5caf1a..1cb5c979394c 100644 --- a/DataReader/BinaryReader/BinaryReader.h +++ b/DataReader/BinaryReader/BinaryReader.h @@ -419,10 +419,9 @@ class BinaryReader : public IDataReader virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize); virtual bool GetMinibatch(std::map*>& matrices); - size_t NumberSlicesInEachRecurrentIter() { return 1 ;} - void SetNbrSlicesEachRecurrentIter(const size_t) { }; + size_t GetNumParallelSequences() { return 1 ;} + void SetNumParallelSequences(const size_t) { }; void CopyMBLayoutTo(MBLayoutPtr) {}; - virtual const std::map& GetLabelMapping(const std::wstring& sectionName); virtual void SetLabelMapping(const std::wstring& sectionName, const std::map::LabelIdType, typename BinaryReader::LabelType>& labelMapping); virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0); diff --git a/DataReader/DSSMReader/DSSMReader.h b/DataReader/DSSMReader/DSSMReader.h index b6bf9059e7c2..679fb3012331 100644 --- a/DataReader/DSSMReader/DSSMReader.h +++ b/DataReader/DSSMReader/DSSMReader.h @@ -141,8 +141,8 @@ class DSSMReader : public IDataReader virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize); virtual bool 
GetMinibatch(std::map*>& matrices); - size_t NumberSlicesInEachRecurrentIter() { return 1 ;} - void SetNbrSlicesEachRecurrentIter(const size_t) { }; + size_t GetNumParallelSequences() { return 1 ;} + void SetNumParallelSequences(const size_t) { }; void CopyMBLayoutTo(MBLayoutPtr) {}; virtual const std::map& GetLabelMapping(const std::wstring& sectionName); diff --git a/DataReader/HTKMLFReader/HTKMLFReader.cpp b/DataReader/HTKMLFReader/HTKMLFReader.cpp index 13b2131fe7ab..ec015a65e51d 100644 --- a/DataReader/HTKMLFReader/HTKMLFReader.cpp +++ b/DataReader/HTKMLFReader/HTKMLFReader.cpp @@ -1607,6 +1607,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (!m_framemode) *pMBLayout = *m_pMBLayout; + // TODO: what about frame mode? Should we create a dummy one? Or Clear() it? Reader should not know what ComputationNetworks' defaults are. } diff --git a/DataReader/HTKMLFReader/HTKMLFReader.h b/DataReader/HTKMLFReader/HTKMLFReader.h index a6bc99aa4784..6091572e4e3b 100644 --- a/DataReader/HTKMLFReader/HTKMLFReader.h +++ b/DataReader/HTKMLFReader/HTKMLFReader.h @@ -89,8 +89,8 @@ class HTKMLFReader : public IDataReader bool ReNewBufferForMultiIO(size_t i); - size_t NumberSlicesInEachRecurrentIter() { return m_numberOfuttsPerMinibatch ;} - void SetNbrSlicesEachRecurrentIter(const size_t) { }; + size_t GetNumParallelSequences() { return m_numberOfuttsPerMinibatch; } + void SetNumParallelSequences(const size_t) { }; void GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector& features, std::vector& labels); diff --git a/DataReader/LMSequenceReader/SequenceReader.cpp b/DataReader/LMSequenceReader/SequenceReader.cpp index 47c14b472b1a..854705ece8cd 100644 --- a/DataReader/LMSequenceReader/SequenceReader.cpp +++ b/DataReader/LMSequenceReader/SequenceReader.cpp @@ -1812,7 +1812,7 @@ bool BatchSequenceReader::EnsureDataAvailable(size_t /*mbStartSample*/ } template -size_t BatchSequenceReader::NumberSlicesInEachRecurrentIter() +size_t BatchSequenceReader::GetNumParallelSequences() { size_t sz = mToProcess.size(); if (sz == 0) diff --git a/DataReader/LMSequenceReader/SequenceReader.h b/DataReader/LMSequenceReader/SequenceReader.h index 4c774a2ddbc5..bb85f48a8f14 100644 --- a/DataReader/LMSequenceReader/SequenceReader.h +++ b/DataReader/LMSequenceReader/SequenceReader.h @@ -393,7 +393,7 @@ class BatchSequenceReader : public SequenceReader void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize); bool GetMinibatch(std::map*>& matrices); bool EnsureDataAvailable(size_t mbStartSample); - size_t NumberSlicesInEachRecurrentIter(); + size_t GetNumParallelSequences(); void SetSentenceSegBatch(std::vector &sentenceEnd); void CopyMBLayoutTo(MBLayoutPtr); diff --git a/DataReader/LUSequenceReader/LUSequenceReader.cpp b/DataReader/LUSequenceReader/LUSequenceReader.cpp index 8070ed17ef2f..e50eab054989 100644 --- a/DataReader/LUSequenceReader/LUSequenceReader.cpp +++ b/DataReader/LUSequenceReader/LUSequenceReader.cpp @@ -799,7 +799,7 @@ bool BatchLUSequenceReader::EnsureDataAvailable(size_t /*mbStartSample } template -size_t BatchLUSequenceReader::NumberSlicesInEachRecurrentIter() +size_t BatchLUSequenceReader::GetNumParallelSequences() { size_t sz = (mSentenceBeginAt.size() == 0)?mBlgSize : mSentenceBeginAt.size(); if (mSentenceBeginAt.size() == 0) @@ -814,7 +814,7 @@ size_t BatchLUSequenceReader::NumberSlicesInEachRecurrentIter() } template -void BatchLUSequenceReader::SetNbrSlicesEachRecurrentIter(const size_t mz) +void 
BatchLUSequenceReader::SetNumParallelSequences(const size_t mz) { mBlgSize = mz; } @@ -1276,19 +1276,19 @@ void MultiIOBatchLUSequenceReader::CopyMBLayoutTo(MBLayoutPtr pMBLayou { p.second->CopyMBLayoutTo(pMBLayout); if (rows == 0) - rows = pMBLayout->GetNumStreams(); - else if (rows != pMBLayout->GetNumStreams()) + rows = pMBLayout->GetNumParallelSequences(); + else if (rows != pMBLayout->GetNumParallelSequences()) LogicError("multiple streams for LU sequence reader must have the same number of rows for sentence begining"); - size_t this_col = pMBLayout->GetNumFrames(); + size_t this_col = pMBLayout->GetNumTimeSteps(); col.push_back(this_col); cols += this_col; } } template -size_t MultiIOBatchLUSequenceReader::NumberSlicesInEachRecurrentIter() +size_t MultiIOBatchLUSequenceReader::GetNumParallelSequences() { - return mReader.begin()->second->NumberSlicesInEachRecurrentIter(); + return mReader.begin()->second->GetNumParallelSequences(); } template diff --git a/DataReader/LUSequenceReader/LUSequenceReader.h b/DataReader/LUSequenceReader/LUSequenceReader.h index e6f8eb7641b6..affc627ff7f1 100644 --- a/DataReader/LUSequenceReader/LUSequenceReader.h +++ b/DataReader/LUSequenceReader/LUSequenceReader.h @@ -184,7 +184,7 @@ class LUSequenceReader : public IDataReader ~LUSequenceReader(){}; void StartMinibatchLoop(size_t , size_t , size_t = requestDataSize) {}; - void SetNbrSlicesEachRecurrentIter(const size_t /*mz*/) {}; + void SetNumParallelSequences(const size_t /*mz*/) {}; void SentenceEnd(std::vector &/*sentenceEnd*/) {}; virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart = 0); @@ -298,8 +298,8 @@ class BatchLUSequenceReader : public LUSequenceReader bool GetMinibatch(std::map*>& matrices); bool EnsureDataAvailable(size_t mbStartSample); - size_t NumberSlicesInEachRecurrentIter(); - void SetNbrSlicesEachRecurrentIter(const size_t mz); + size_t GetNumParallelSequences(); + void SetNumParallelSequences(const size_t mz); void CopyMBLayoutTo(MBLayoutPtr pMBLayout); @@ -386,7 +386,7 @@ class MultiIOBatchLUSequenceReader : public BatchLUSequenceReader void CopyMBLayoutTo(MBLayoutPtr pMBLayout); - size_t NumberSlicesInEachRecurrentIter(); + size_t GetNumParallelSequences(); void Init(const ConfigParameters& readerConfig); diff --git a/DataReader/LibSVMBinaryReader/LibSVMBinaryReader.h b/DataReader/LibSVMBinaryReader/LibSVMBinaryReader.h index 9ca9a6f0ba32..98f2d04a2780 100644 --- a/DataReader/LibSVMBinaryReader/LibSVMBinaryReader.h +++ b/DataReader/LibSVMBinaryReader/LibSVMBinaryReader.h @@ -143,8 +143,8 @@ class LibSVMBinaryReader : public IDataReader virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize); virtual bool GetMinibatch(std::map*>& matrices); - size_t NumberSlicesInEachRecurrentIter() { return 1 ;} - void SetNbrSlicesEachRecurrentIter(const size_t) { }; + size_t GetNumParallelSequences() { return 1; } + void SetNumParallelSequences(const size_t) { }; void CopyMBLayoutTo(MBLayoutPtr){}; virtual const std::map& GetLabelMapping(const std::wstring& sectionName); virtual void SetLabelMapping(const std::wstring& sectionName, const std::map& labelMapping); diff --git a/DataReader/SparsePCReader/SparsePCReader.h b/DataReader/SparsePCReader/SparsePCReader.h index ac5b6ea00006..2ae72bbccc82 100644 --- a/DataReader/SparsePCReader/SparsePCReader.h +++ b/DataReader/SparsePCReader/SparsePCReader.h @@ -56,8 +56,8 @@ class SparsePCReader : public IDataReader virtual void 
StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize); virtual bool GetMinibatch(std::map*>& matrices); - size_t NumberSlicesInEachRecurrentIter() { return 1 ;} - void SetNbrSlicesEachRecurrentIter(const size_t) { }; + size_t GetNumParallelSequences() { return 1 ;} + void SetNumParallelSequences(const size_t) { }; void CopyMBLayoutTo(MBLayoutPtr) {}; virtual const std::map& GetLabelMapping(const std::wstring& sectionName); virtual void SetLabelMapping(const std::wstring& sectionName, const std::map& labelMapping); diff --git a/DataReader/UCIFastReader/UCIFastReader.cpp b/DataReader/UCIFastReader/UCIFastReader.cpp index 26ad512a151c..e7aeea2a8871 100644 --- a/DataReader/UCIFastReader/UCIFastReader.cpp +++ b/DataReader/UCIFastReader/UCIFastReader.cpp @@ -630,7 +630,7 @@ size_t RoundUp(size_t value, size_t size) } template -void UCIFastReader::SetNbrSlicesEachRecurrentIter(const size_t sz) +void UCIFastReader::SetNumParallelSequences(const size_t sz) { mBlgSize = sz; if (mOneLinePerFile) diff --git a/DataReader/UCIFastReader/UCIFastReader.h b/DataReader/UCIFastReader/UCIFastReader.h index 29228e5b025d..c0b6ccb57b46 100644 --- a/DataReader/UCIFastReader/UCIFastReader.h +++ b/DataReader/UCIFastReader/UCIFastReader.h @@ -111,7 +111,7 @@ class UCIFastReader : public IDataReader virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize); virtual bool GetMinibatch(std::map*>& matrices); - size_t NumberSlicesInEachRecurrentIter() { return mBlgSize; } + size_t GetNumParallelSequences() { return mBlgSize; } void CopyMBLayoutTo(MBLayoutPtr){}; virtual const std::map& GetLabelMapping(const std::wstring& sectionName); virtual void SetLabelMapping(const std::wstring& sectionName, const std::map& labelMapping); @@ -120,7 +120,7 @@ class UCIFastReader : public IDataReader virtual bool DataEnd(EndDataType endDataType); void SetSentenceSegBatch(Matrix&, Matrix&) { }; - void SetNbrSlicesEachRecurrentIter(const size_t sz); + void SetNumParallelSequences(const size_t sz); void SetRandomSeed(int) { NOT_IMPLEMENTED; } }; diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h index 5bd1ba82bf11..adf9136b9ad5 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h @@ -80,7 +80,6 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb m_randomSeedOffset = 0; m_actMiniBSize = 0; SetDeviceId(deviceId); - m_nbrSlicesInEachRecurrentIterationx = 1; } virtual ~ComputationNetwork() @@ -246,8 +245,8 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb { if (!m_pMBLayout->IsAllNone()) { - size_t numTimeSteps = m_pMBLayout->GetNumFrames(); - size_t numSequences = m_pMBLayout->GetNumStreams(); + size_t numTimeSteps = m_pMBLayout->GetNumTimeSteps(); + size_t numSequences = m_pMBLayout->GetNumParallelSequences(); if (m_pMBLayout->GetSize() != numTimeSteps) LogicError("GetNumSamplesWithLabel(): m_pMBLayout->m_minibatchPackingFlags should have one element for each timestep of all streams.Check feature reader. 
"); @@ -558,7 +557,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb for (auto nodeIter = recurrentNodes.begin(); nodeIter != recurrentNodes.end(); nodeIter++) (*nodeIter)->SetFunctionAndGradientSize(m_actMiniBSize); - int iMBSize = m_actMiniBSize / m_nbrSlicesInEachRecurrentIterationx; + int iMBSize = m_actMiniBSize / GetNumParallelSequences(); if (m_recurrentInfo[iLoopId].m_isForwardLoop) { @@ -598,10 +597,10 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb { // checks that will disappear once we complete the refactoring. If this passes for a while, we will eliminate one // If this fails, comment this out (it is safe) and tell fseide@microsoft.com. - if (m_nbrSlicesInEachRecurrentIterationx != m_pMBLayout->GetNumStreams()) - LogicError("Evaluate: detected that m_nbrSlicesInEachRecurrentIteration != m_pMBLayout->GetNumStreams()"); - if (m_pMBLayout->GetNumFrames() != m_pMBLayout->GetSize()) - LogicError("Evaluate: detected that m_pMBLayout->GetNumFrames() != m_pMBLayout->GetSize()"); + if (GetNumParallelSequences() != m_pMBLayout->GetNumParallelSequences()) + LogicError("Evaluate: detected that m_nbrSlicesInEachRecurrentIteration != m_pMBLayout->GetNumParallelSequences()"); + if (m_pMBLayout->GetNumTimeSteps() != m_pMBLayout->GetSize()) + LogicError("Evaluate: detected that m_pMBLayout->GetNumTimeSteps() != m_pMBLayout->GetSize()"); // prepare to compute with the subnetwork that this rootNode depends on, including // - auto-detecting recurrent loops @@ -625,7 +624,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb for (auto nodeIter = allNodes.begin(); nodeIter != allNodes.end(); nodeIter++) { // TODO: nbrSlices set once to the same value for all nodes each evaluation--is it ever changed later? - (*nodeIter)->SetNbrSlicesInEachRecurrentIteration(m_nbrSlicesInEachRecurrentIterationx); + (*nodeIter)->SetNumParallelSequences(GetNumParallelSequences()); if ((*nodeIter)->ReqMultiSeqHandling()) (*nodeIter)->ResetBound(m_pMBLayout); } @@ -688,13 +687,25 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb #if 0 // always called in this pattern: evalnet->SetActualMiniBatchSizeFromFeatures(); - evalnet->SetActualNbrSlicesInEachRecurentIteration(dataReader->NumberSlicesInEachRecurrentIter()); dataReader->CopyMBLayoutTo(evalnet->GetMBLayoutPtr()); + evalnet->VerifyActualNumParallelSequences(dataReader->GetNumParallelSequences()); // well... most of the time. Not in TrainOneEpoch(). + void SetActualNumParallelSequencesInEachRecurentIteration(const size_t aSize) + { + m_nbrSlicesInEachRecurrentIteration() = aSize; // TODO: this has to go + } #endif - void SetActualNbrSlicesInEachRecurentIteration(const size_t aSize) + size_t GetNumParallelSequences() const + { + return m_pMBLayout->GetNumParallelSequences(); + } + // temporary function: Call this after CopyMBLayoutTo(evalnet->GetMBLayoutPtr()) to ensure everything is consistent as expected + // It is actually called after every CopyMBLayoutTo() in the entire system (except for multi-reader CopyMBLayoutTo() itself). + // Remove this function after a few weeks of not firing. 
+ void VerifyActualNumParallelSequences(const size_t aSize) { - m_nbrSlicesInEachRecurrentIterationx = aSize; + if (GetNumParallelSequences() != aSize) + LogicError("VerifyActualNumParallelSequences: mismatching MB size in MBLayout"); } void ComputeGradientLoop(std::list& /*allNodes*/, const ComputationNodeBasePtr startNode) @@ -705,14 +716,14 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb { if (m_recurrentInfo[iLoopId].m_completedGradient == false) { - int mbSize = m_actMiniBSize / m_nbrSlicesInEachRecurrentIterationx; + int mbSize = m_actMiniBSize / GetNumParallelSequences(); if (m_recurrentInfo[iLoopId].m_isForwardLoop) { for (int timeIndex = mbSize - 1; timeIndex >= 0; timeIndex--) { for (auto nodeIter = recurrentNodes.rbegin(); nodeIter != recurrentNodes.rend(); ++nodeIter) { - (*nodeIter)->SetNbrSlicesInEachRecurrentIteration(m_nbrSlicesInEachRecurrentIterationx); // TODO: move to FrameRange object + (*nodeIter)->SetNumParallelSequences(GetNumParallelSequences()); // TODO: move to FrameRange object (*nodeIter)->ComputeGradientForChildren(timeIndex); } } @@ -723,7 +734,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb { for (auto nodeIter = recurrentNodes.rbegin(); nodeIter != recurrentNodes.rend(); ++nodeIter) { - (*nodeIter)->SetNbrSlicesInEachRecurrentIteration(m_nbrSlicesInEachRecurrentIterationx); + (*nodeIter)->SetNumParallelSequences(GetNumParallelSequences()); (*nodeIter)->ComputeGradientForChildren(timeIndex); } } @@ -1571,7 +1582,6 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb MBLayoutPtr m_pMBLayout; int m_actMiniBSize; - size_t m_nbrSlicesInEachRecurrentIterationx; // main node holder std::map m_nameToNodeMap; // [name] -> node; this is the main container that holds this networks' nodes diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index 45abce8277c6..b3542c092fe5 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -168,7 +168,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void ResetBound(MBLayoutPtr pMBLayout) { - assert(pMBLayout->GetNumFrames() == pMBLayout->GetSize()); // TODO: move this check into MBLayout + assert(pMBLayout->GetNumTimeSteps() == pMBLayout->GetSize()); // TODO: move this check into MBLayout m_pMBLayout = pMBLayout; } @@ -248,7 +248,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // TODO: these two will disappear once the information is correctly held in a FrameRange record // This is called at 3 places; two are directly before ComputeGradientForChildren(). - void SetNbrSlicesInEachRecurrentIteration(size_t bsz) + void SetNumParallelSequences(size_t bsz) { m_samplesInRecurrentStep = bsz; } @@ -260,7 +260,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // This expression will be turned into a function call to right here, so that we compute this only at one place // and can also handle the full-minibatch case. // Let us try to get this member out of this class altogether; it belongs elsewhere. 
- size_t GetNbrSlicesInEachRecurrentIteration() const + size_t GetNumParallelSequences() const { return m_samplesInRecurrentStep; } @@ -903,7 +903,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (m_pMBLayout && !m_pMBLayout->IsAllNone()) { size_t nT = matrixToBeMasked.GetNumCols(); - size_t nS = m_pMBLayout->GetNumStreams(); + size_t nS = m_pMBLayout->GetNumParallelSequences(); if (m_pMBLayout->GetSize() != nT / nS) LogicError("MaskToZeroWhenLabelAndFeatureMissing: m_pMBLayout->m_minibatchPackingFlags should have one element for each timestep of all streams. Check feature reader. "); diff --git a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h index c038ecc2e31b..df94ccecfe7f 100644 --- a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h @@ -114,7 +114,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // then this becomes // S S X X E S S X X X E N N - size_t numRows = pMBLayout->GetNumStreams(); + size_t numRows = pMBLayout->GetNumParallelSequences(); // each row has a number to indicate how many values should be reset for that utterance vector numResetLeft(numRows, 0); diff --git a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h index 67e0613a7dc9..ddafb73b04de 100644 --- a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h @@ -1098,7 +1098,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (m_pMBLayout && !m_pMBLayout->IsAllNone()) { // 't' is not a time but rather a column index that encodes (time stamp, stream) - size_t nS = m_pMBLayout->GetNumStreams(); + size_t nS = m_pMBLayout->GetNumParallelSequences(); size_t j = t / nS; // this is the time stamp size_t i = t % nS; // this is the stream if (m_pMBLayout->Is(j, MinibatchPackingFlags::NoLabel)) // TODO: this outer test is redundant here, no? 
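All the reader/network call sites touched by this patch converge on one canonical per-minibatch sequence. A minimal sketch of that pattern follows; net, reader, and outputNode are placeholder names, not code from any one call site above:

    size_t actualMBSize = net.SetActualMiniBatchSizeFromFeatures();          // derive the MB width from the feature matrices
    reader.CopyMBLayoutTo(net.GetMBLayoutPtr());                             // the MBLayout now owns the number of parallel sequences
    net.VerifyActualNumParallelSequences(reader.GetNumParallelSequences());  // temporary cross-check; to be removed once it never fires
    net.Evaluate(outputNode);

The ordering matters: CopyMBLayoutTo() must run first, because VerifyActualNumParallelSequences() no longer stores anything; it only reads the sequence count back out of the network's MBLayout to confirm that the reader agrees.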
diff --git a/MachineLearning/CNTKEval/EvalReader.h b/MachineLearning/CNTKEval/EvalReader.h
index 50f7421fda65..c51525e3bca9 100644
--- a/MachineLearning/CNTKEval/EvalReader.h
+++ b/MachineLearning/CNTKEval/EvalReader.h
@@ -161,9 +161,9 @@ class EvalReader : public IDataReader
 return true;
 }
- size_t NumberSlicesInEachRecurrentIter() {return 1;}
+ size_t GetNumParallelSequences() { return 1; }
- void SetNbrSlicesEachRecurrentIter(const size_t ) {}
+ void SetNumParallelSequences(const size_t ) {}
 void SetSentenceSegBatch(std::vector &sentenceEnd)
 {
 sentenceEnd.resize(m_switchFrame.size());
diff --git a/MachineLearning/CNTKSGDLib/MultiNetworksSGD.h b/MachineLearning/CNTKSGDLib/MultiNetworksSGD.h
index 2763b02a20f1..829398c3015d 100644
--- a/MachineLearning/CNTKSGDLib/MultiNetworksSGD.h
+++ b/MachineLearning/CNTKSGDLib/MultiNetworksSGD.h
@@ -1158,14 +1158,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 )
 {
 encoderNet->SetActualMiniBatchSizeFromFeatures();
- encoderNet->SetActualNbrSlicesInEachRecurentIteration(encoderTrainSetDataReader->NumberSlicesInEachRecurrentIter());
 encoderTrainSetDataReader->CopyMBLayoutTo(encoderNet->GetMBLayoutPtr());
+ encoderNet->VerifyActualNumParallelSequences(encoderTrainSetDataReader->GetNumParallelSequences());
 encoderNet->Evaluate(encoderEvaluationNodes[0]);
 decoderNet->SetActualMiniBatchSizeFromFeatures();
- decoderNet->SetActualNbrSlicesInEachRecurentIteration(decoderTrainSetDataReader->NumberSlicesInEachRecurrentIter());
 decoderTrainSetDataReader->CopyMBLayoutTo(decoderNet->GetMBLayoutPtr());
+ decoderNet->VerifyActualNumParallelSequences(decoderTrainSetDataReader->GetNumParallelSequences());
 /// not the sentence beginning, because the initial hidden layer activity is from the encoder network
 if (decoderCriterionNodes.size() == 0 && decoderEvaluationNodes.size() == 0)
diff --git a/MachineLearning/CNTKSGDLib/SGD.cpp b/MachineLearning/CNTKSGDLib/SGD.cpp
index 4adcf874fbaf..e93b01140b2b 100644
--- a/MachineLearning/CNTKSGDLib/SGD.cpp
+++ b/MachineLearning/CNTKSGDLib/SGD.cpp
@@ -920,18 +920,18 @@ template
 }
 // first, we need to normalize the effect of nbruttsineachrecurrentiter
- if (trainSetDataReader->NumberSlicesInEachRecurrentIter() > 1 && m_needToNormalizeLRByParallUtterance)
+ if (trainSetDataReader->GetNumParallelSequences() > 1 && m_needToNormalizeLRByParallUtterance)
 {
 for (auto& x : m_learningRatesPerSample)
- x /= (float)trainSetDataReader->NumberSlicesInEachRecurrentIter();
+ x /= (float)trainSetDataReader->GetNumParallelSequences();
 }
 // first, we need to normalize the effect of nbruttsineachrecurrentiter for momentum
- if (trainSetDataReader->NumberSlicesInEachRecurrentIter() > 1 && m_needToNormalizeMomentumByParallUtterance)
+ if (trainSetDataReader->GetNumParallelSequences() > 1 && m_needToNormalizeMomentumByParallUtterance)
 {
 for (auto& x : m_momentumPerSample)
- x = (float)pow(x, 1.0 / trainSetDataReader->NumberSlicesInEachRecurrentIter());
- }
+ x = (float)pow(x, 1.0 / trainSetDataReader->GetNumParallelSequences());
+ }
 bool learnRateInitialized = false;
 if (startEpoch > 0)
@@ -1047,8 +1047,8 @@ template
 }
 actualMinibatchSize = chosenMinibatchSize;
- if (trainSetDataReader->NumberSlicesInEachRecurrentIter() > 1 && m_needToNormalizeMomentumByParallUtterance)
- actualMinibatchSize = chosenMinibatchSize * trainSetDataReader->NumberSlicesInEachRecurrentIter();
+ if (trainSetDataReader->GetNumParallelSequences() > 1 && m_needToNormalizeMomentumByParallUtterance)
+ actualMinibatchSize = chosenMinibatchSize *
trainSetDataReader->GetNumParallelSequences(); fprintf(stderr, "Starting Epoch %d: learning rate per sample = %f momentum = %f \n", i + 1, learnRatePerSample, MomentumPerMB(m_momentumPerSample[i], actualMinibatchSize)); @@ -1307,8 +1307,8 @@ template ComputationNetwork::UpdateEvalTimeStamps(labelNodes); net.SetActualMiniBatchSizeFromFeatures(); - net.SetActualNbrSlicesInEachRecurentIteration(trainSetDataReader->NumberSlicesInEachRecurrentIter()); trainSetDataReader->CopyMBLayoutTo(net.GetMBLayoutPtr()); + net.VerifyActualNumParallelSequences(trainSetDataReader->GetNumParallelSequences()); // TODO: Exactly this loop should be INSIDE ComputationNetwork--pass the nodes array instead! for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++) @@ -1766,8 +1766,8 @@ template LogicError("no output node was found."); net.SetActualMiniBatchSizeFromFeatures(); - net.SetActualNbrSlicesInEachRecurentIteration(trainSetDataReader->NumberSlicesInEachRecurrentIter()); trainSetDataReader->CopyMBLayoutTo(net.GetMBLayoutPtr()); + net.VerifyActualNumParallelSequences(trainSetDataReader->GetNumParallelSequences()); net.Evaluate(outputNodes[0]); // Only evaluate the first output trainSetDataReader->SetNetOutput(uttInfo, dynamic_pointer_cast>(outputNodes[0])->FunctionValues(), @@ -1922,7 +1922,7 @@ template size_t actualMBSize = 0; if (wasDataRead) { - size_t nSlices = trainSetDataReader->NumberSlicesInEachRecurrentIter(); + size_t nSlices = trainSetDataReader->GetNumParallelSequences(); MBLayoutPtr pMBLayout; if (!useDistributedMBReading && useParallelTrain) { @@ -1946,14 +1946,14 @@ template { if (!useDistributedMBReading && useParallelTrain && trainSetDataReader->RequireSentenceSeg()) { - net.SetActualNbrSlicesInEachRecurentIteration(nSlices); *net.GetMBLayoutPtr() = *pMBLayout; // TODO: ^^ we should just pass pointers; this current code is semantically identical to before the change to MBLayout + net.VerifyActualNumParallelSequences(nSlices); } else { - net.SetActualNbrSlicesInEachRecurentIteration(nSlices); trainSetDataReader->CopyMBLayoutTo(net.GetMBLayoutPtr()); + net.VerifyActualNumParallelSequences(nSlices); } nSamplesSinceLastModelSync += actualMBSize; @@ -1969,8 +1969,9 @@ template if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) { refNet.SetActualMiniBatchSize(actualMBSize); - refNet.SetActualNbrSlicesInEachRecurentIteration(trainSetDataReader->NumberSlicesInEachRecurrentIter()); - // TODO: not setting MBLayout? 
+ *refNet.GetMBLayoutPtr() = *net.GetMBLayoutPtr(); // TODO: This is UNTESTED (before this was missing, seemingly inconsistently) + refNet.VerifyActualNumParallelSequences(trainSetDataReader->GetNumParallelSequences()); + refNet.Evaluate(refNode); Matrix::ScaleAndAdd((ElemType)m_adaptationRegWeight, dynamic_pointer_cast>(refNode)->FunctionValues(), diff --git a/MachineLearning/CNTKSGDLib/SimpleEvaluator.h b/MachineLearning/CNTKSGDLib/SimpleEvaluator.h index 6306903b36b1..b0e35552cea9 100644 --- a/MachineLearning/CNTKSGDLib/SimpleEvaluator.h +++ b/MachineLearning/CNTKSGDLib/SimpleEvaluator.h @@ -128,8 +128,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputationNetwork::UpdateEvalTimeStamps(labelNodes); actualMBSize = m_net.SetActualMiniBatchSizeFromFeatures(); - m_net.SetActualNbrSlicesInEachRecurentIteration(dataReader->NumberSlicesInEachRecurrentIter()); dataReader->CopyMBLayoutTo(m_net.GetMBLayoutPtr()); + m_net.VerifyActualNumParallelSequences(dataReader->GetNumParallelSequences()); //for now since we share the same label masking flag we call this on one node only //Later, when we apply different labels on different nodes @@ -447,8 +447,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { actualMBSize = (*ptr)->SetActualMiniBatchSizeFromFeatures(); if (actualMBSize == 0) LogicError("decoderTrainSetDataReader read data but encoderNet reports no data read"); - (*ptr)->SetActualNbrSlicesInEachRecurentIteration((*preader)->NumberSlicesInEachRecurrentIter()); (*preader)->CopyMBLayoutTo((*ptr)->GetMBLayoutPtr()); + (*ptr)->VerifyActualNumParallelSequences((*preader)->GetNumParallelSequences()); const auto & pairs = (*ptr)->PairNodes(); for (auto ptr2 = pairs.begin(); ptr2 != pairs.end(); ptr2++) @@ -460,8 +460,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { actualMBSize = decoderNet->SetActualMiniBatchSizeFromFeatures(); if (actualMBSize == 0) LogicError("decoderTrainSetDataReader read data but decoderNet reports no data read"); - decoderNet->SetActualNbrSlicesInEachRecurentIteration(decoderDataReader->NumberSlicesInEachRecurrentIter()); decoderDataReader->CopyMBLayoutTo(decoderNet->GetMBLayoutPtr()); + decoderNet->VerifyActualNumParallelSequences(decoderDataReader->GetNumParallelSequences()); size_t i = 0; assert(decoderEvaluationNodes.size() == 1); @@ -624,7 +624,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (auto ptr = readers.begin(); ptr != readers.end(); ptr++) { (*ptr)->StartMinibatchLoop(mbSize, 0, testSize); - (*ptr)->SetNbrSlicesEachRecurrentIter(1); + (*ptr)->SetNumParallelSequences(1); } Matrix historyMat(m_net.GetDeviceId()); @@ -660,9 +660,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { /// evaluate on the encoder networks actualMBSize = (*ptr)->SetActualMiniBatchSizeFromFeatures(); - mNutt = (*ptrreader)->NumberSlicesInEachRecurrentIter(); - (*ptr)->SetActualNbrSlicesInEachRecurentIteration(mNutt); + mNutt = (*ptrreader)->GetNumParallelSequences(); (*ptrreader)->CopyMBLayoutTo((*ptr)->GetMBLayoutPtr()); + (*ptr)->VerifyActualNumParallelSequences(mNutt); const auto & pairs = (*ptr)->PairNodes(); for (auto ptr2 = pairs.begin(); ptr2 != pairs.end(); ptr2++) @@ -673,8 +673,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { /// not the sentence begining, because the initial hidden layer activity is from the encoder network decoderNet->SetActualMiniBatchSize(actualMBSize); - decoderNet->SetActualNbrSlicesInEachRecurentIteration(mNutt); encoderDataReader->CopyMBLayoutTo(decoderNet->GetMBLayoutPtr()); + 
decoderNet->VerifyActualNumParallelSequences(mNutt);
 FindBestPathWithVariableLength(decoderNet, actualMBSize, decoderDataReader, dataWriter, outputNodes, writeNodes, decoderFeatureNodes, beam, &decoderInputMatrices, best_path);
@@ -735,8 +735,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 for (auto nodeIter = batchComputeNodes.begin(); nodeIter != batchComputeNodes.end(); nodeIter++)
 {
 ComputationNodeBasePtr node = *nodeIter;
- node->EvaluateThisNode(FrameRange(atTime, node->GetNbrSlicesInEachRecurrentIteration()));
- if (node->GetNumCols() != node->GetNbrSlicesInEachRecurrentIteration())
+ node->EvaluateThisNode(FrameRange(atTime, node->GetNumParallelSequences()));
+ if (node->GetNumCols() != node->GetNumParallelSequences())
 RuntimeError("preComputeActivityAtTime: the function values matrix has to be a single column matrix ");
 }
 }
@@ -828,7 +828,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 size_t actualMBSize = 0;
 dataReader->StartMinibatchLoop(mbSize, 0, testSize);
- dataReader->SetNbrSlicesEachRecurrentIter(1);
+ dataReader->SetNumParallelSequences(1);
 startReadMBTime = clock();
 size_t numMBsRun = 0;
@@ -900,8 +900,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 /// use reader to initialize evalnet's sentence start information to let it know that this
 /// is the beginning of sentence
 evalnet->SetActualMiniBatchSize(mbSize);
- evalnet->SetActualNbrSlicesInEachRecurentIteration(dataReader->NumberSlicesInEachRecurrentIter());
 dataReader->CopyMBLayoutTo(evalnet->GetMBLayoutPtr());
+ evalnet->VerifyActualNumParallelSequences(dataReader->GetNumParallelSequences());
 clock_t start, now;
 start = clock();
@@ -1065,8 +1065,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 /// use reader to initialize evalnet's sentence start information to let it know that this
 /// is the beginning of sentence
 evalnet->SetActualMiniBatchSize(mbSize);
- evalnet->SetActualNbrSlicesInEachRecurentIteration(dataReader->NumberSlicesInEachRecurrentIter()); // TODO: not setting MBLayout?
+ evalnet->VerifyActualNumParallelSequences(dataReader->GetNumParallelSequences()); + // TODO: This is UNTESTED; if it fails, change ^^ this back to SetActual...() clock_t start, now; start = clock(); @@ -1088,7 +1089,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { /// need to set the minibatch size to 1, and initialize evalnet's sentence start information to let it know that this /// is the begining of sentence // BUGBUG: This is almost certainly wrong; slice != MB size - evalnet->SetActualMiniBatchSize(dataReader->NumberSlicesInEachRecurrentIter()); + evalnet->SetActualMiniBatchSize(dataReader->GetNumParallelSequences()); double best_score = -numeric_limits::infinity(); double best_score_so_far = -numeric_limits::infinity(); diff --git a/MachineLearning/CNTKSGDLib/SimpleOutputWriter.h b/MachineLearning/CNTKSGDLib/SimpleOutputWriter.h index 9adb3184c820..e7e94f15567c 100644 --- a/MachineLearning/CNTKSGDLib/SimpleOutputWriter.h +++ b/MachineLearning/CNTKSGDLib/SimpleOutputWriter.h @@ -62,7 +62,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { //evaluate with minibatches dataReader.StartMinibatchLoop(mbSize, 0, numOutputSamples); - dataReader.SetNbrSlicesEachRecurrentIter(1); + dataReader.SetNumParallelSequences(1); size_t totalEpochSamples = 0; std::map outputMatrices; @@ -73,8 +73,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputationNetwork::UpdateEvalTimeStamps(labelNodes); size_t actualMBSize = m_net.SetActualMiniBatchSizeFromFeatures(); - m_net.SetActualNbrSlicesInEachRecurentIteration(dataReader.NumberSlicesInEachRecurrentIter()); dataReader.CopyMBLayoutTo(m_net.GetMBLayoutPtr()); + m_net.VerifyActualNumParallelSequences(dataReader.GetNumParallelSequences()); for (int i=0; iSetActualMiniBatchSize(mbSize); - evalnet->SetActualNbrSlicesInEachRecurentIteration(dataReader->NumberSlicesInEachRecurrentIter()); dataReader->CopyMBLayoutTo(evalnet->GetMBLayoutPtr()); + evalnet->VerifyActualNumParallelSequences(dataReader->GetNumParallelSequences()); #endif #if 0 // a VERY TELLING piece of code // packing flags = frame-wise or over all streams of start and end From 7e10b57b8eb56484b39b0e3f321ac58d0c561e3e Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 18 Sep 2015 17:47:44 -0700 Subject: [PATCH 08/44] changed all read accesses to ComputationNode::m_samplesInRecurrentStep to GetNumParallelSequences()--480+ changes! 
This is in prep to remove that member variable completely, replacing it solely by pMBLayout (we will first add a check that both are the same) --- .../CompositeComputationNodes.h | 24 +- .../ComputationNode.h | 6 +- .../ConvolutionalNodes.h | 22 +- .../InputAndParamNodes.h | 20 +- .../LinearAlgebraNodes.h | 246 +++++++++--------- .../NonlinearityNodes.h | 124 ++++----- .../RecurrentNodes.h | 98 +++---- .../TrainingCriterionNodes.h | 8 +- Math/Math/Matrix.h | 6 +- 9 files changed, 276 insertions(+), 278 deletions(-) diff --git a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h index c6adb948b20d..251e24a388ca 100644 --- a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h @@ -539,10 +539,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { //only feature (input0) and output needs to be sliced - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, - m_samplesInRecurrentStep); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, - m_samplesInRecurrentStep); + Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues()); } @@ -692,8 +690,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { //only feature (input0) and output needs to be sliced - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues()); } @@ -839,13 +837,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { { assert(m_memory.GetNumCols() > 0); - //FunctionValues().Resize(m_memory.GetNumRows(), m_samplesInRecurrentStep); + //FunctionValues().Resize(m_memory.GetNumRows(), GetNumParallelSequences()); FunctionValues().Resize(m_memory.GetNumRows(), frameRange.NumCols()); // extra space for one time step if (frameRange.t() == 0) // for first frame, check that we got all in memory --TODO: is this comment correct? How about going backwards? 
- assert(FunctionValues().FrameSlice(FrameRange(0, m_samplesInRecurrentStep)/*TODO: delete the next two parameters*/, 0, m_samplesInRecurrentStep).FrobeniusNorm() == m_memory.FrameSlice(FrameRange(0, m_samplesInRecurrentStep)/*TODO: delete the next two parameters*/, 0, m_samplesInRecurrentStep).FrobeniusNorm()); - //assert(FunctionValues().ColumnSlice(0, m_samplesInRecurrentStep).FrobeniusNorm() == m_memory.ColumnSlice(0, m_samplesInRecurrentStep).FrobeniusNorm()); - FunctionValues().SetValue(m_memory.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep)); - assert(FunctionValues().GetNumCols() == m_samplesInRecurrentStep); + assert(FunctionValues().FrameSlice(FrameRange(0, GetNumParallelSequences())/*TODO: delete the next two parameters*/, 0, GetNumParallelSequences()).FrobeniusNorm() == m_memory.FrameSlice(FrameRange(0, GetNumParallelSequences())/*TODO: delete the next two parameters*/, 0, GetNumParallelSequences()).FrobeniusNorm()); + //assert(FunctionValues().ColumnSlice(0, GetNumParallelSequences()).FrobeniusNorm() == m_memory.ColumnSlice(0, GetNumParallelSequences()).FrobeniusNorm()); + FunctionValues().SetValue(m_memory.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences())); + assert(FunctionValues().GetNumCols() == GetNumParallelSequences()); } virtual void SaveToFile(File& fstream) const @@ -934,7 +932,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 0) InvalidArgument("TimeReverse operation only takes one input."); ComputationNodePtr child = Inputs(inputIndex); - ComputeInputPartialS(GradientValues(), child->GradientValues(), m_samplesInRecurrentStep); + ComputeInputPartialS(GradientValues(), child->GradientValues(), GetNumParallelSequences()); } static void WINAPI ComputeInputPartialS(Matrix& gradientValues, Matrix& inputGradientValues, int nSamples) @@ -967,7 +965,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (m_hasComputed == false) { - EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), m_samplesInRecurrentStep); + EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), GetNumParallelSequences()); m_memory.SetValue(FunctionValues()); } } diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index b3542c092fe5..8b5f0265f403 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -256,7 +256,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Note: only used in one place, SimpleEvaluator.h PreComputeActivityAtTime(). // The member is, however, read out at 284 places inside nodes, // most of the time as - // FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep) + // FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()) // This expression will be turned into a function call to right here, so that we compute this only at one place // and can also handle the full-minibatch case. // Let us try to get this member out of this class altogether; it belongs elsewhere. 
@@ -861,7 +861,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { /*implement*/void EvaluateThisNodeGivenInputs(const size_t timeIdxInSeq) // TODO: change to FrameRange as well { - EvaluateThisNode(FrameRange(timeIdxInSeq, m_samplesInRecurrentStep)); + EvaluateThisNode(FrameRange(timeIdxInSeq, GetNumParallelSequences())); if (!UseCustomizedMultiSeqHandling()) MaskToZeroWhenLabelAndFeatureMissing(m_functionValues, timeIdxInSeq); @@ -1081,7 +1081,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { (msra::strfun::utf8 (child->OperationName())).c_str(), (msra::strfun::utf8 (child->NodeName())).c_str()); #endif - ComputeInputPartial(i, FrameRange(timeIdxInSeq, m_samplesInRecurrentStep)); //this computes partial wrt to the child and sums the gradient value in the child + ComputeInputPartial(i, FrameRange(timeIdxInSeq, GetNumParallelSequences())); //this computes partial wrt to the child and sums the gradient value in the child } #ifdef DISPLAY_DEBUG else fprintf (stderr, " [%lu]: %s(%s) (no gradient needed so don't compute for)\n", i, diff --git a/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h b/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h index fe3cc5c47a0c..8d591bfe9d94 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h @@ -111,14 +111,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) InvalidArgument("Convolution operation only takes two inputs."); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); if (inputIndex == 0) //derivative with regard to the weight matrix ComputeInputPartialOverWeight(sliceOutputGrad, Inputs(0)->GradientValues(), Inputs(0)->FunctionValues(), sliceInput1Value, m_tempMatrix, !frameRange.IsAllFrames()); else // derivative with regard to the input feature { - Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); ComputeInputPartialOverInputFeature(sliceOutputGrad, sliceInput1Grad, Inputs(0)->FunctionValues(), sliceInput1Value, m_tempMatrix); } } @@ -215,8 +215,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * 
m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value, m_tempMatrix);
 }
@@ -433,11 +433,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex > 0)
 InvalidArgument("MaxPooling operation only takes one input.");
- Matrix sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
- Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialV(sliceOutputGrad, sliceInput0Grad, sliceInput0Value, sliceOutputValue);
 }
@@ -447,8 +447,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
 {
- Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeV(sliceOutputValue, sliceInput0Value);
 }
diff --git a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h
index 1be91c8ef041..1639a55e6e55 100644
--- a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h
@@ -330,15 +330,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex == 0) //left derivative
 {
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad);
 }
 else //right derivative
 {
- Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad);
 }
@@ -384,8 +384,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
 {
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value);
 }
@@ -555,8 +555,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 assert(m_functionValues.GetNumRows() == GradientValues().GetNumRows()); // original used m_functionValues.GetNumRows() for loop dimension
 assert(m_pMBLayout);
- Matrix mTmp = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix::ScaleAndAdd(1.0, GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep), mTmp);
+ Matrix mTmp = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix::ScaleAndAdd(1.0, GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), mTmp);
 }
 virtual void EvaluateThisNode()
@@ -566,8 +566,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
 {
- Matrix mTmp = FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- mTmp.SetValue(Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep));
+ Matrix mTmp = FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ mTmp.SetValue(Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()));
 }
 virtual void /*ComputationNodeBase::*/Validate()
diff --git a/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h b/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h
index e5abadebf623..ebc7eef2d904 100644
--- a/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h
@@ -53,8 +53,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex != 0)
 InvalidArgument("Negate operation only has one input.");
- Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialS(sliceInputGrad, sliceOutputGrad);
 }
@@ -71,8 +71,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
 {
- Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, sliceInputValue);
 }
@@ -138,8 +138,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex != 0)
 InvalidArgument("SumElements only has one input.");
- Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialS(sliceInputGrad, sliceOutputGrad);
 }
@@ -156,8 +156,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
 {
- Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, sliceInputValue);
 }
@@ -233,8 +233,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex != 0)
 InvalidArgument("SumColumnElements only has one input.");
- Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialS(sliceInputGrad, sliceOutputGrad);
 }
@@ -251,8 +251,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
 {
- Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
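Every hunk in this commit rewrites the same FrameSlice() call, so a minimal C++ sketch of the column arithmetic it relies on may help when reading the hunks. It assumes CNTK's packed minibatch layout, in which a matrix stores one column per frame and the columns of the S parallel sequences for time step t sit next to each other; the helper name columnRangeForTimeStep is hypothetical and not part of the patch.

#include <cstddef>
#include <utility>

// For S parallel sequences, time step t of the minibatch occupies the
// half-open column range [t * S, t * S + S) -- the same start/extent pair,
// frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(),
// that is passed to FrameSlice() throughout this commit.
std::pair<std::size_t, std::size_t> columnRangeForTimeStep(std::size_t t, std::size_t S)
{
    return { t * S, S };
}

// E.g. with S = 4 parallel sequences, time step t = 3 covers columns 12..15.
// The rename merely swaps the cached member m_samplesInRecurrentStep for the
// accessor GetNumParallelSequences(); the arithmetic itself is unchanged.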
 EvaluateThisNodeS(sliceOutputValue, sliceInputValue);
 }
@@ -370,8 +370,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex != 0)
 InvalidArgument("RowSlice only has one input.");
- Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_startIndex, m_numRows);
 }
@@ -388,8 +388,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
 {
- Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, sliceInputValue, m_startIndex, m_numRows);
 }
@@ -486,8 +486,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex >= ChildrenSize())
 InvalidArgument("RowStack-ComputeInputPartial: inputIndex out of range.");
- Matrix sliceInputGrad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputGrad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_startRowIndeces[inputIndex], m_startRowIndeces[inputIndex+1] - m_startRowIndeces[inputIndex]);
 }
@@ -504,9 +504,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
 {
- Matrix sliceFunctionValues = FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceFunctionValues = FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
- EvaluateThisNodeS(sliceFunctionValues, m_inputMatrices, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ EvaluateThisNodeS(sliceFunctionValues, m_inputMatrices, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 }
 static void WINAPI EvaluateThisNodeS(Matrix<ElemType>& functionValues, const std::vector<const Matrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols)
@@ -623,15 +623,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 //left Node must be a scalar
 if (inputIndex == 0) //left derivative
 {
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad);
 }
 else
 {
- Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad);
 }
@@ -654,8 +654,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
 {
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value);
 }
@@ -742,15 +742,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex == 0) //left derivative
 {
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad);
 }
 else //right derivative
 {
- Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad);
 }
@@ -801,8 +801,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 size_t rows0 = Inputs(0)->FunctionValues().GetNumRows(), cols1 = Inputs(1)->FunctionValues().GetNumCols();
 FunctionValues().Resize(rows0, cols1);
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value);
 }
@@ -915,15 +915,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex == 0) //left derivative
 {
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad);
 }
 else //right derivative
 {
- Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad);
 }
@@ -969,8 +969,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
 {
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value);
 }
@@ -1073,10 +1073,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex > 1)
 InvalidArgument("ElementTimes operation only takes two inputs.");
- Matrix sliceInput0Grad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Grad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
- Matrix sliceInput1Value = Inputs(1-inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Value = Inputs(1-inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad);
 }
@@ -1100,9 +1100,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
 {
- Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
 }
@@ -1202,10 +1202,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex > 1)
 InvalidArgument("RowElementTimes operation only takes two inputs.");
- Matrix sliceInput0Grad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Grad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
- Matrix sliceInput1Value = Inputs(1 - inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Value = Inputs(1 - inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 if (inputIndex == 0)
 {
@@ -1252,9 +1252,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
 {
- Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
 }
@@ -1353,17 +1353,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex > 1)
 InvalidArgument("ColumnElementTimes operation only takes two inputs.");
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 if (inputIndex == 0)
 {
- Matrix sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialLeftS(Inputs(1)->FunctionValues(), sliceInput0Grad, sliceOutputGrad, m_tempMatrix);
 }
 else
 {
- Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialRightS(sliceInput0Value, Inputs(1)->GradientValues(), sliceOutputGrad, m_tempMatrix);
 }
 }
@@ -1403,8 +1403,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
 {
- Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues());
 }
@@ -1509,13 +1509,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 //only the one with more columns can be sliced, if both have same columns both are sliced
 size_t cols0 = Inputs(inputIndex)->FunctionValues().GetNumCols(), cols1=Inputs(1-inputIndex)->FunctionValues().GetNumCols();
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 if (cols0 >= cols1)
 {
- Matrix sliceInput0Grad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceInput0Value = Inputs(inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Grad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceInput0Value = Inputs(inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialS(sliceOutputValue, sliceOutputGrad, sliceInput0Value, sliceInput0Grad);
 }
@@ -1584,25 +1584,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
 size_t cols0 = Inputs(0)->FunctionValues().GetNumCols(), cols1=Inputs(1)->FunctionValues().GetNumCols();
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 //only the one with more columns can be sliced, if both have same columns both are sliced
 if (cols0 == cols1)
 {
- Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
 }
 else if (cols0 > cols1)
 {
- Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues());
 }
 else //cols0 < cols1
 {
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value);
 }
@@ -1780,11 +1780,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 //only the one with more columns can be sliced, if both have same columns both are sliced
 size_t cols0 = Inputs(inputIndex)->FunctionValues().GetNumCols(), cols1=Inputs(1-inputIndex)->FunctionValues().GetNumCols();
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
- Matrix sliceInput0Grad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceInput0Value = Inputs(inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Grad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceInput0Value = Inputs(inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 Matrix ones = Matrix();
@@ -1890,25 +1890,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
 size_t cols0 = Inputs(0)->FunctionValues().GetNumCols(), cols1=Inputs(1)->FunctionValues().GetNumCols();
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 //only the one with more columns can be sliced, if both have same columns both are sliced
 if (cols0 == cols1)
 {
- Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
 }
 else if (cols0 > cols1)
 {
- Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues());
 }
 else //cols0 < cols1
 {
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value);
 }
@@ -2048,16 +2048,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 InvalidArgument("DiagTimes operation only takes two inputs.");
 //left parameter (diag matrix cannot be sliced)
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 if (inputIndex == 0) //left derivative
 {
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialLeft(m_innerproduct, sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad);
 }
 else //right derivative
 {
- Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialRight(m_rightGradient, Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad);
 }
 }
@@ -2083,8 +2083,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
 {
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value);
 }
@@ -2205,11 +2205,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex > 1)
 InvalidArgument("CosDistance operation only takes two inputs.");
- Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceInputGrad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = this->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceInputGrad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = this->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 if (inputIndex == 0) //left derivative
 {
@@ -2280,9 +2280,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
 {
- Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(m_invNorm0, m_invNorm1, sliceOutputValue, sliceInput0Value, sliceInput1Value);
 }
@@ -2426,19 +2426,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex > 1)
 InvalidArgument("KhatriRaoProduct operation only takes two inputs.");
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 if (inputIndex == 0) //left derivative
 {
- Matrix sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialLeft(sliceInput1Value, sliceInput0Grad, sliceOutputGrad);
 }
 else //right derivative
 {
- Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialRight(sliceInput0Value, sliceInput1Grad, sliceOutputGrad);
 }
@@ -2461,9 +2461,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
 {
- Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
 }
@@ -2564,11 +2564,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex > 1)
 InvalidArgument("CosDistanceWithNegativeSamples operation only takes gradients on the first two inputs.");
- Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceInputGrad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceThisGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceInputGrad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceThisGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialS(inputIndex, m_invNorm0, m_invNorm1, sliceOutputValue, m_temp, m_rightTerm, m_leftTerm, m_invNormSquare, sliceInput0Value, sliceInput1Value, Inputs(2)->FunctionValues(), Inputs(3)->FunctionValues(), sliceInputGrad, sliceThisGrad);
 }
@@ -2681,9 +2681,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
 {
- Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(m_invNorm0, m_invNorm1, sliceOutputValue, sliceInput0Value, sliceInput1Value, Inputs(2)->FunctionValues(), Inputs(3)->FunctionValues(), m_leftTerm, m_rightTerm);
 }
@@ -2961,25 +2961,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex > 1)
 InvalidArgument("StrideTimes operation only takes two inputs.");
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 if (m_StrideDim == 1) /// column stride
 {
 if (inputIndex == 0) //left derivative
 {
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 // TimesNode::ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad);
 Matrix mTmp1(sliceInput1Value.GetDeviceId());
 size_t r = Inputs(0)->FunctionValues().GetNumRows();
- size_t T1 = Inputs(0)->FunctionValues().GetNumCols() / m_samplesInRecurrentStep;
+ size_t T1 = Inputs(0)->FunctionValues().GetNumCols() / GetNumParallelSequences();
 mTmp1.Resize(r, T1);
 Matrix mTmp2(sliceInput1Value.GetDeviceId());
 Matrix mTmp3(sliceInput1Value.GetDeviceId());
- for (size_t k = 0; k < m_samplesInRecurrentStep; k++)
+ for (size_t k = 0; k < GetNumParallelSequences(); k++)
 {
 mTmp1.SetValue(0);
 mTmp2 = sliceInput1Value.ColumnSlice(k, 1);
@@ -2989,25 +2989,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 for (size_t t = 0; t < T1; t++)
 {
- Inputs(0)->GradientValues().ColumnSlice(t*m_samplesInRecurrentStep + k, 1) += mTmp1.ColumnSlice(t, 1);
+ Inputs(0)->GradientValues().ColumnSlice(t*GetNumParallelSequences() + k, 1) += mTmp1.ColumnSlice(t, 1);
 }
 }
 }
 else //right derivative
 {
- Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 // TimesNode::ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad);
- for (size_t k = 0; k < m_samplesInRecurrentStep; k++)
+ for (size_t k = 0; k < GetNumParallelSequences(); k++)
 {
 Matrix mTmp1(sliceOutputGrad.GetDeviceId());
 size_t r = Inputs(0)->FunctionValues().GetNumRows();
- size_t T1 = Inputs(0)->FunctionValues().GetNumCols() / m_samplesInRecurrentStep;
+ size_t T1 = Inputs(0)->FunctionValues().GetNumCols() / GetNumParallelSequences();
 mTmp1.Resize(r, T1);
 for (size_t t = 0; t < T1; t++)
 {
- mTmp1.ColumnSlice(t, 1).SetValue(Inputs(0)->FunctionValues().ColumnSlice(t*m_samplesInRecurrentStep + k, 1));
+ mTmp1.ColumnSlice(t, 1).SetValue(Inputs(0)->FunctionValues().ColumnSlice(t*GetNumParallelSequences() + k, 1));
 }
 Matrix mTmp2(sliceOutputGrad.GetDeviceId());
 mTmp2 = sliceInput1Grad.ColumnSlice(k, 1);
@@ -3022,13 +3022,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
 if (inputIndex == 0) //left derivative
 {
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
- for (size_t k = 0; k < m_samplesInRecurrentStep; k++)
+ for (size_t k = 0; k < GetNumParallelSequences(); k++)
 {
 Matrix mTmp1(sliceInput1Value.GetDeviceId());
 size_t d = Inputs(1)->FunctionValues().GetNumRows();
- size_t T1 = Inputs(0)->FunctionValues().GetNumRows() / m_samplesInRecurrentStep;
+ size_t T1 = Inputs(0)->FunctionValues().GetNumRows() / GetNumParallelSequences();
 mTmp1.Resize(d, T1);
 Matrix mTmp2(sliceInput1Value.GetDeviceId());
 mTmp2 = sliceInput1Value.ColumnSlice(k, 1);
@@ -3041,18 +3041,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
 mTmp4 = mTmp1.ColumnSlice(t, 1);
 mTmp4.Reshape(1, d);
- Inputs(0)->GradientValues().AddToRowSliceValuesOf(mTmp4, t*m_samplesInRecurrentStep + k, 1);
+ Inputs(0)->GradientValues().AddToRowSliceValuesOf(mTmp4, t*GetNumParallelSequences() + k, 1);
 }
 }
 }
 else //right derivative
 {
- Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
- for (size_t k = 0; k < m_samplesInRecurrentStep; k++)
+ for (size_t k = 0; k < GetNumParallelSequences(); k++)
 {
 size_t d = Inputs(1)->FunctionValues().GetNumRows();
- size_t T1 = Inputs(0)->FunctionValues().GetNumRows() / m_samplesInRecurrentStep;
+ size_t T1 = Inputs(0)->FunctionValues().GetNumRows() / GetNumParallelSequences();
 Matrix mTmp0(sliceOutputGrad.GetDeviceId());
 mTmp0.Resize(1, d);
@@ -3062,7 +3062,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 for (size_t t = 0; t < T1; t++)
 {
 mTmp0.SetValue(0);
- mTmp0.AddWithRowSliceValuesOf(Inputs(0)->FunctionValues(), t * m_samplesInRecurrentStep + k, 1);
+ mTmp0.AddWithRowSliceValuesOf(Inputs(0)->FunctionValues(), t * GetNumParallelSequences() + k, 1);
 mTmp1.AssignToRowSliceValuesOf(mTmp0, t, 1);
 }
 Matrix mTmp2(sliceOutputGrad.GetDeviceId());
 mTmp2 = sliceInput1Grad.ColumnSlice(k, 1);
@@ -3112,7 +3112,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 size_t rows0 = Inputs(0)->FunctionValues().GetNumRows(), cols1 = Inputs(1)->FunctionValues().GetNumCols();
 UpdateStride(Inputs(1)->FunctionValues());
 if (m_StrideDim == 0)
- FunctionValues().Resize(rows0 / m_samplesInRecurrentStep, cols1);
+ FunctionValues().Resize(rows0 / GetNumParallelSequences(), cols1);
 if (m_StrideDim == 1)
 FunctionValues().Resize(rows0, cols1);
@@ -3127,13 +3127,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
 size_t rows0 = Inputs(0)->FunctionValues().GetNumRows(), cols1 = Inputs(1)->FunctionValues().GetNumCols();
- Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 UpdateStride(sliceInput1Value);
 if (m_StrideDim == 0)
- FunctionValues().Resize(rows0 / m_samplesInRecurrentStep, cols1);
+ FunctionValues().Resize(rows0 / GetNumParallelSequences(), cols1);
 if (m_StrideDim == 1)
 FunctionValues().Resize(rows0, cols1);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value, m_Stride, m_StrideDim);
 }
diff --git a/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h b/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h
index c481f8406ce3..eca814210627 100644
--- a/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h
@@ -63,11 +63,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 // We should also unify these two functions into one that decides 1 frame or all frames at runtime... through the slice-extractor function itself.
 // For now we could define ALL_SAMPLES e.g. as SIZE_MAX.
 // GetGradientSlice(), GetInputSlice() or something.
- Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 // why GradientValues() but m_functionValues below and not FunctionValues()?
- Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialV(m_gradient, sliceInputValue, sliceInputGrad, sliceOutputGrad);
 }
@@ -81,8 +81,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
 {
- Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 EvaluateThisNodeV(sliceOutputValue, sliceInputValue);
 }
@@ -206,10 +206,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex != 0)
 InvalidArgument("Sigmoid only has one input.");
- Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialS(m_gradient, sliceInputGrad, sliceOutputGrad, sliceOutputValue);
 }
@@ -265,10 +265,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex != 0)
 InvalidArgument("Tanh only has one input.");
- Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialS(m_gradient, sliceInputGrad, sliceOutputGrad, sliceOutputValue);
 }
@@ -326,10 +326,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex != 0)
 InvalidArgument("Log only has one input.");
- Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
- Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialS(m_gradient, sliceInputGrad, sliceInputValue, sliceOutputGrad);
 }
@@ -386,10 +386,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex != 0)
 InvalidArgument("Exp only has one input.");
- Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
- Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialS(m_gradient, sliceInputGrad, sliceInputValue, sliceOutputGrad);
 }
@@ -445,10 +445,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex != 0)
 InvalidArgument("Cosine only has one input.");
- Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
- Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
 ComputeInputPartialS(m_gradient, sliceInputGrad, sliceInputValue, sliceOutputGrad);
 }
@@ -508,10 +508,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 if (inputIndex != 0)
 InvalidArgument("Softmax only has one input.");
- Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
- Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
+ Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+ Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep,
m_samplesInRecurrentStep); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); ComputeInputPartialS(m_gradient, m_diff, sliceInputGrad, sliceOutputGrad, sliceOutputValue); } @@ -616,10 +616,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Softmax only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); ComputeInputPartialS(m_gradient, m_softmax, sliceInputGrad, sliceOutputGrad, sliceOutputValue); } @@ -727,8 +727,8 @@ virtual const std::wstring OperationName() const { return TypeName(); } //get the right slice const size_t colsPrior = Inputs(0)->FunctionValues().GetNumCols(); - Matrix sliceGradientValue = m_gradientValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); - Matrix slicePosterior = m_posterior.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceGradientValue = m_gradientValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix slicePosterior = m_posterior.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); switch (inputIndex) { @@ -738,40 +738,40 @@ virtual const std::wstring OperationName() const { return TypeName(); } ComputeInputPartialUnnormedPrior(Inputs(0)->GradientValues(), sliceGradientValue, m_prior, slicePosterior, m_temp); else { - Matrix sliceUnnormedPriorGradient = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); - Matrix slicePrior = m_prior.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceUnnormedPriorGradient = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix slicePrior = m_prior.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); ComputeInputPartialUnnormedPrior(sliceUnnormedPriorGradient, 
sliceGradientValue, slicePrior, slicePosterior, m_temp); } } break; case 1: { - Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); if (colsPrior == 1) ComputeInputPartialMean(Inputs(1)->GradientValues(), sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); else { - Matrix sliceMeanGradient = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceMeanGradient = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); ComputeInputPartialMean(sliceMeanGradient, sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); } } break; case 2: { - Matrix sliceNormedDeviation = m_normedDeviation.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceNormedDeviation = m_normedDeviation.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); if (colsPrior == 1) ComputeInputPartialLogStddev(Inputs(2)->GradientValues(), sliceGradientValue, sliceNormedDeviation, slicePosterior, m_temp); else { - Matrix sliceLotStddevGradient = Inputs(2)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceLotStddevGradient = Inputs(2)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); ComputeInputPartialLogStddev(sliceLotStddevGradient, sliceGradientValue, sliceNormedDeviation, slicePosterior, m_temp); } } break; case 3: { - Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); - Matrix sliceFeatureGradient = Inputs(3)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceFeatureGradient = Inputs(3)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); ComputeInputPartialFeature(sliceFeatureGradient, sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); } break; @@ -888,11 +888,11 @@ virtual const std::wstring OperationName() const { return TypeName(); } size_t numSamples = Inputs(3)->FunctionValues().GetNumCols(); //get the right slice - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); - Matrix sliceFeature = Inputs(3)->FunctionValues().FrameSlice(frameRange/*TODO: 
delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); - Matrix sliceNormedDeviation = m_normedDeviation.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); - Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); - Matrix slicePosterior = m_posterior.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceFeature = Inputs(3)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceNormedDeviation = m_normedDeviation.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix slicePosterior = m_posterior.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); if (colsPrior == 1) { @@ -901,12 +901,12 @@ virtual const std::wstring OperationName() const { return TypeName(); } } else if (colsPrior == numSamples) { - Matrix sliceUnnormedPrior = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); - Matrix sliceMean = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); - Matrix sliceLogstddev = Inputs(2)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceUnnormedPrior = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceMean = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceLogstddev = Inputs(2)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix slicePrior = m_prior.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); - Matrix sliceStddev = m_stddev.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix slicePrior = m_prior.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceStddev = m_stddev.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); EvaluateThisNodeS(sliceOutputValue, 
sliceUnnormedPrior, sliceMean, sliceLogstddev, sliceFeature, slicePrior, sliceStddev, sliceNormedDeviationVectors, sliceNormedDeviation, slicePosterior, m_temp); @@ -1113,13 +1113,13 @@ virtual const std::wstring OperationName() const { return TypeName(); } if (inputIndex > 0) InvalidArgument("Dropout operation only takes one input."); - Matrix sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); Matrix sliceMask = Matrix(); if (m_dropoutRate > 0) { - sliceMask = m_maskOfDropout.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + sliceMask = m_maskOfDropout.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); } ComputeInputPartialS(m_dropoutRate, sliceInput0Grad, sliceMask, sliceOutputGrad); @@ -1143,7 +1143,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } } virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); Matrix sliceOutputValue = Matrix (); Matrix sliceMask = Matrix(); @@ -1151,10 +1151,10 @@ virtual const std::wstring OperationName() const { return TypeName(); } { FunctionValues().Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); m_maskOfDropout.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); - sliceMask = m_maskOfDropout.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + sliceMask = m_maskOfDropout.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); } - sliceOutputValue = FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + sliceOutputValue = FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); EvaluateThisNodeS(m_dropoutRate, m_randomSeed, sliceOutputValue, sliceMask, sliceInput0Value); } @@ -1399,13 +1399,13 @@ virtual const std::wstring OperationName() const { return TypeName(); } virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { size_t rows = Inputs(0)->FunctionValues().GetNumRows(); - if ((rows * m_samplesInRecurrentStep) % m_numRows > 0) + if ((rows 
* GetNumParallelSequences()) % m_numRows > 0) { LogicError("Reshape operation: Number of elements in the recurrent input step is not a multiple of the specified number of rows."); } - size_t outputSamplesInRecurrentStep = m_samplesInRecurrentStep * rows / m_numRows; - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + size_t outputSamplesInRecurrentStep = GetNumParallelSequences() * rows / m_numRows; + Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep); EvaluateThisNodeS(sliceOutputValue, sliceInputValue, m_numRows); @@ -1442,14 +1442,14 @@ virtual const std::wstring OperationName() const { return TypeName(); } InvalidArgument("Reshape operation only takes one input."); size_t rows = Inputs(0)->GradientValues().GetNumRows(); - if ((rows * m_samplesInRecurrentStep) % m_numRows > 0) + if ((rows * GetNumParallelSequences()) % m_numRows > 0) { LogicError("Reshape operation: Number of elements in the recurrent input step is not a multiple of the specified number of rows."); } - size_t outputSamplesInRecurrentStep = m_samplesInRecurrentStep * rows / m_numRows; + size_t outputSamplesInRecurrentStep = GetNumParallelSequences() * rows / m_numRows; - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_numRows); @@ -1646,8 +1646,8 @@ virtual const std::wstring OperationName() const { return TypeName(); } virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); EvaluateThisNodeS(sliceOutputValue, sliceInputValue, m_numRepeat); } @@ -1673,8 +1673,8 @@ virtual const std::wstring OperationName() const { return TypeName(); } if (inputIndex != 0) InvalidArgument("RowRepeat only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, 
m_samplesInRecurrentStep); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_numRepeat); } diff --git a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h index df94ccecfe7f..aaba256771d2 100644 --- a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h @@ -374,12 +374,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 0) // TODO: is this check necessary here? Can this be a generic check in the base class? InvalidArgument("PastValue and FutureValue operations only take one input."); - int nbrSamples = GradientValues().GetNumCols() / m_samplesInRecurrentStep; + int nbrSamples = GradientValues().GetNumCols() / GetNumParallelSequences(); for (int timeIdxInSeq = nbrSamples - 1; timeIdxInSeq >= 0; timeIdxInSeq--) { // TODO: call the looping version below to avoid code dup const auto colBoundaryFlags = m_pShiftedMBLayout->GetFrame(timeIdxInSeq); - ComputeInputPartialSRP(FrameRange(timeIdxInSeq, m_samplesInRecurrentStep), m_timeStep, Inputs(0)->GradientValues(), GradientValues(), colBoundaryFlags.first, colBoundaryFlags.second); + ComputeInputPartialSRP(FrameRange(timeIdxInSeq, GetNumParallelSequences()), m_timeStep, Inputs(0)->GradientValues(), GradientValues(), colBoundaryFlags.first, colBoundaryFlags.second); } } @@ -388,12 +388,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { { assert(m_timeStep > 0); - int nbrSamples = Inputs(0)->FunctionValues().GetNumCols() / m_samplesInRecurrentStep; + int nbrSamples = Inputs(0)->FunctionValues().GetNumCols() / GetNumParallelSequences(); for (int timeIdxInSeq = 0; timeIdxInSeq < nbrSamples; timeIdxInSeq++) { // TODO: call the looping version below to avoid code dup const auto colBoundaryFlags = m_pShiftedMBLayout->GetFrame(timeIdxInSeq); - EvaluateThisNodeSRP(FrameRange(timeIdxInSeq, m_samplesInRecurrentStep), m_timeStep, m_functionValues, m_delayedActivation, Inputs(0)->FunctionValues(), m_initialActivationValue, colBoundaryFlags.first, colBoundaryFlags.second); + EvaluateThisNodeSRP(FrameRange(timeIdxInSeq, GetNumParallelSequences()), m_timeStep, m_functionValues, m_delayedActivation, Inputs(0)->FunctionValues(), m_initialActivationValue, colBoundaryFlags.first, colBoundaryFlags.second); } //set the past activity to be used by next minibatch @@ -444,12 +444,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 0) InvalidArgument("PastValue and FutureValue operations only take one input."); - int nbrSamples = GradientValues().GetNumCols() / m_samplesInRecurrentStep; + int nbrSamples = GradientValues().GetNumCols() / GetNumParallelSequences(); for (int timeIdxInSeq = 0; timeIdxInSeq < nbrSamples; timeIdxInSeq++) { // TODO: call the looping version below to avoid code dup const auto colBoundaryFlags = m_pShiftedMBLayout->GetFrame(timeIdxInSeq); - ComputeInputPartialSRP(FrameRange(timeIdxInSeq, 
m_samplesInRecurrentStep), m_timeStep, Inputs(0)->GradientValues(), GradientValues(), colBoundaryFlags.first, colBoundaryFlags.second); + ComputeInputPartialSRP(FrameRange(timeIdxInSeq, GetNumParallelSequences()), m_timeStep, Inputs(0)->GradientValues(), GradientValues(), colBoundaryFlags.first, colBoundaryFlags.second); } } @@ -457,11 +457,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { { assert(m_timeStep > 0); - int nbrSamples = Inputs(0)->FunctionValues().GetNumCols() / m_samplesInRecurrentStep; + int nbrSamples = Inputs(0)->FunctionValues().GetNumCols() / GetNumParallelSequences(); for (int timeIdxInSeq = nbrSamples - 1; timeIdxInSeq >= 0; timeIdxInSeq--) { const auto colBoundaryFlags = m_pShiftedMBLayout->GetFrame(timeIdxInSeq); - EvaluateThisNodeSRP(FrameRange(timeIdxInSeq, m_samplesInRecurrentStep), m_timeStep, m_functionValues, m_delayedActivation, Inputs(0)->FunctionValues(), m_initialActivationValue, colBoundaryFlags.first, colBoundaryFlags.second); + EvaluateThisNodeSRP(FrameRange(timeIdxInSeq, GetNumParallelSequences()), m_timeStep, m_functionValues, m_delayedActivation, Inputs(0)->FunctionValues(), m_initialActivationValue, colBoundaryFlags.first, colBoundaryFlags.second); } //set the future activity to be used by next minibatch @@ -472,7 +472,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { assert(m_pMBLayout); - if (frameRange.t() == Inputs(0)->FunctionValues().GetNumCols() / m_samplesInRecurrentStep - 1) + if (frameRange.t() == Inputs(0)->FunctionValues().GetNumCols() / GetNumParallelSequences() - 1) m_delayedActivation = Inputs(0)->FunctionValues(); const auto colBoundaryFlags = m_pShiftedMBLayout->GetFrame(frameRange.t()); @@ -592,8 +592,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix slicePrevOutput(m_deviceId), slicePrevState(m_deviceId); Matrix grdToPrevOutput(m_deviceId), grdToPrevState(m_deviceId); Matrix stateError(m_deviceId); - slicePrevState.Resize(outputDim, m_samplesInRecurrentStep); - slicePrevOutput.Resize(outputDim, m_samplesInRecurrentStep); + slicePrevState.Resize(outputDim, GetNumParallelSequences()); + slicePrevOutput.Resize(outputDim, GetNumParallelSequences()); slicePrevOutput.SetValue(0); stateError.Resize(slicePrevState.GetNumRows(), slicePrevState.GetNumCols()); @@ -603,21 +603,21 @@ namespace Microsoft { namespace MSR { namespace CNTK { grdToPrevOutput.SetValue(0); grdToPrevState.SetValue(0); - for (int timeIdxInSeq = nT - m_samplesInRecurrentStep; timeIdxInSeq >= 0; timeIdxInSeq -= m_samplesInRecurrentStep) + for (int timeIdxInSeq = nT - GetNumParallelSequences(); timeIdxInSeq >= 0; timeIdxInSeq -= GetNumParallelSequences()) { - FrameRange frameRange(timeIdxInSeq, m_samplesInRecurrentStep); - Matrix sliceObs = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, m_samplesInRecurrentStep); - Matrix sliceOutput = FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, m_samplesInRecurrentStep); - Matrix sliceState = m_State.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, m_samplesInRecurrentStep); + FrameRange frameRange(timeIdxInSeq, GetNumParallelSequences()); + Matrix sliceObs = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()); + Matrix sliceOutput = FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()); + Matrix sliceState = 
m_State.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()); - Matrix sliceGi = m_Gi.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, m_samplesInRecurrentStep); - Matrix sliceGf = m_Gf.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, m_samplesInRecurrentStep); - Matrix sliceGo = m_Go.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, m_samplesInRecurrentStep); + Matrix sliceGi = m_Gi.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()); + Matrix sliceGf = m_Gf.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()); + Matrix sliceGo = m_Go.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()); - Matrix sliceTanhState = tanhState.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, m_samplesInRecurrentStep); - Matrix sliceTanhObs = tanhObs.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, m_samplesInRecurrentStep); + Matrix sliceTanhState = tanhState.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()); + Matrix sliceTanhObs = tanhObs.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()); - Matrix error = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, m_samplesInRecurrentStep); + Matrix error = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()); Matrix grdToObsSlice(this->m_deviceId); @@ -627,7 +627,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { #endif PrepareThisErrorsBeforeBackProp(timeIdxInSeq, nT, error, stateError, grdToPrevOutput, grdToPrevState, - m_obs_error_from_future_minibatch, m_state_error_from_future_minibatch, m_samplesInRecurrentStep, &m_pMBLayout->GetM()); + m_obs_error_from_future_minibatch, m_state_error_from_future_minibatch, GetNumParallelSequences(), &m_pMBLayout->GetM()); #ifdef DEBUG_DECODER fprintf(stderr, "output error [%ld] norm = %.8e\n", timeIdxInSeq, error.FrobeniusNorm()); @@ -639,7 +639,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { grdToPrevOutput.SetValue(0); grdToPrevState.SetValue(0); - PrepareHistory(timeIdxInSeq, mSlicePrevOutput, mSlicePrevState, FunctionValues(), m_State, m_PastOutput, m_PastState, m_samplesInRecurrentStep, m_DefaultState, &m_pMBLayout->GetM()); + PrepareHistory(timeIdxInSeq, mSlicePrevOutput, mSlicePrevState, FunctionValues(), m_State, m_PastOutput, m_PastState, GetNumParallelSequences(), m_DefaultState, &m_pMBLayout->GetM()); ComputeInputGradientWrtGates( error, @@ -666,9 +666,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { grdToPrevState, m_tempMatrix ); - grdToObs.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, m_samplesInRecurrentStep).SetValue(grdToObsSlice); + grdToObs.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()).SetValue(grdToObsSlice); - PrepareErrors(timeIdxInSeq, grdToPrevOutput, grdToPrevState, m_samplesInRecurrentStep, &m_pMBLayout->GetM()); + PrepareErrors(timeIdxInSeq, grdToPrevOutput, grdToPrevState, GetNumParallelSequences(), &m_pMBLayout->GetM()); } #ifdef DEBUG_DECODER fprintf(stderr, "after error prop b_c norm = %.8e\n", 
Inputs(4)->FunctionValues().ColumnSlice(0, 1).FrobeniusNorm()); @@ -917,16 +917,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { */ int GetSegInfo(size_t t, size_t streamid) { - if (streamid >= m_samplesInRecurrentStep) - LogicError("GetSegInfo: stream id %d is larger than the number of streams %d", streamid, m_samplesInRecurrentStep); + if (streamid >= GetNumParallelSequences()) + LogicError("GetSegInfo: stream id %d is larger than the number of streams %d", streamid, GetNumParallelSequences()); size_t nT = Inputs(0)->FunctionValues().GetNumCols(); if (t >= nT) LogicError("GetSegInfo: time %d times is larger than the total number of observations %d", t, nT); - int utt_t = (int)t / m_samplesInRecurrentStep; + int utt_t = (int)t / GetNumParallelSequences(); auto thisCol = m_pMBLayout->GetFrame(utt_t).first; - thisCol.Reshape(1, m_samplesInRecurrentStep); + thisCol.Reshape(1, GetNumParallelSequences()); return (int) thisCol.ColumnSlice(streamid, 1).Get00Element(); } @@ -939,12 +939,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t outputDim = Inputs(1)->FunctionValues().GetNumRows(); // save the hidden activities and output for the next minibatch - mLastOutput.Resize(outputDim, m_samplesInRecurrentStep); - mLastState.Resize(outputDim, m_samplesInRecurrentStep); + mLastOutput.Resize(outputDim, GetNumParallelSequences()); + mLastState.Resize(outputDim, GetNumParallelSequences()); - for (size_t i = 0; i < m_samplesInRecurrentStep; i++) + for (size_t i = 0; i < GetNumParallelSequences(); i++) { - for (int t = nT - m_samplesInRecurrentStep + i; t >= 0; t -= m_samplesInRecurrentStep) + for (int t = nT - GetNumParallelSequences() + i; t >= 0; t -= GetNumParallelSequences()) { if (GetSegInfo(t, i) == ((int) MinibatchPackingFlags::None)) { @@ -977,14 +977,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { tanhObs.Resize(outputDim, nT); tanhObs.SetValue(NAN); // set to this extrem value so, if anything wrong in later procedure, problems can be easily spotted. 
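            // Packing sketch (illustration only, inferred from the indexing in these nodes):
            // with S = GetNumParallelSequences() parallel streams, the minibatch matrix
            // interleaves streams frame by frame along the column dimension:
            //     columns [0, S)   = time step 0 of streams 0..S-1
            //     columns [S, 2S)  = time step 1 of streams 0..S-1, and so on,
            // so time step t is the column block starting at t * S -- hence the
            // ubiquitous FrameSlice(frameRange, frameRange.t() * S, S), and stream k
            // at time t is the single column t * S + k.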
- if (m_PastState.IsEmpty() || m_PastState.GetNumCols() != m_samplesInRecurrentStep) + if (m_PastState.IsEmpty() || m_PastState.GetNumCols() != GetNumParallelSequences()) { - m_PastState.Resize(outputDim, m_samplesInRecurrentStep); + m_PastState.Resize(outputDim, GetNumParallelSequences()); m_PastState.SetValue(m_DefaultState); } - if (m_PastOutput.IsEmpty() || m_PastOutput.GetNumCols() != m_samplesInRecurrentStep) + if (m_PastOutput.IsEmpty() || m_PastOutput.GetNumCols() != GetNumParallelSequences()) { - m_PastOutput.Resize(outputDim, m_samplesInRecurrentStep); + m_PastOutput.Resize(outputDim, GetNumParallelSequences()); } #ifdef DEBUG_DECODER @@ -994,21 +994,21 @@ namespace Microsoft { namespace MSR { namespace CNTK { fprintf(stderr, "LSTM node %ls past state norm = %.8e\n", this->NodeName().c_str(), m_PastState.FrobeniusNorm()); #endif - for (size_t timeIdxInSeq = 0; timeIdxInSeq < nT; timeIdxInSeq += m_samplesInRecurrentStep) + for (size_t timeIdxInSeq = 0; timeIdxInSeq < nT; timeIdxInSeq += GetNumParallelSequences()) { - FrameRange frameRange(timeIdxInSeq, m_samplesInRecurrentStep); - Matrix sliceObs = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), m_samplesInRecurrentStep); - Matrix sliceOutput = FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), m_samplesInRecurrentStep); - Matrix sliceState = m_State.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), m_samplesInRecurrentStep); + FrameRange frameRange(timeIdxInSeq, GetNumParallelSequences()); + Matrix sliceObs = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), GetNumParallelSequences()); + Matrix sliceOutput = FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), GetNumParallelSequences()); + Matrix sliceState = m_State.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), GetNumParallelSequences()); - Matrix sliceGi = m_Gi.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), m_samplesInRecurrentStep); - Matrix sliceGf = m_Gf.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), m_samplesInRecurrentStep); - Matrix sliceGo = m_Go.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), m_samplesInRecurrentStep); + Matrix sliceGi = m_Gi.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), GetNumParallelSequences()); + Matrix sliceGf = m_Gf.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), GetNumParallelSequences()); + Matrix sliceGo = m_Go.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), GetNumParallelSequences()); - Matrix sliceTanhState = tanhState.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), m_samplesInRecurrentStep); - Matrix sliceTanhInput = tanhObs.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), m_samplesInRecurrentStep); + Matrix sliceTanhState = tanhState.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), GetNumParallelSequences()); + Matrix sliceTanhInput = tanhObs.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), GetNumParallelSequences()); - PrepareHistory(timeIdxInSeq, mSlicePrevOutput, mSlicePrevState, FunctionValues(), m_State, m_PastOutput, m_PastState, m_samplesInRecurrentStep, 
m_DefaultState, &m_pMBLayout->GetM()); + PrepareHistory(timeIdxInSeq, mSlicePrevOutput, mSlicePrevState, FunctionValues(), m_State, m_PastOutput, m_PastState, GetNumParallelSequences(), m_DefaultState, &m_pMBLayout->GetM()); EvaluateThisNodeS(Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues(), Inputs(3)->FunctionValues(), Inputs(4)->FunctionValues(), sliceObs, mSlicePrevOutput, mSlicePrevState, sliceOutput, sliceState, sliceGi, sliceGf, sliceGo, sliceTanhState, sliceTanhInput, m_tempMatrix); diff --git a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h index ddafb73b04de..6f608b529d88 100644 --- a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h @@ -1237,8 +1237,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { FunctionValues().SetValue(0.0); Matrix funcVal = FunctionValues(); - size_t nstep = ncol / m_samplesInRecurrentStep; - for (size_t i = 0; i < m_samplesInRecurrentStep; i++) + size_t nstep = ncol / GetNumParallelSequences(); + for (size_t i = 0; i < GetNumParallelSequences(); i++) { Matrix postProbSlice = mPostProb.ColumnSlice(i * nstep, nstep); Matrix alphaSlice = mAlpha.ColumnSlice(i * nstep, nstep); @@ -1269,9 +1269,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { else if (inputIndex == 2) { size_t ncol = mAlpha.GetNumCols(); - size_t nstep = ncol / m_samplesInRecurrentStep; + size_t nstep = ncol / GetNumParallelSequences(); assert(Inputs(inputIndex)->GradientValues().GetNumElements() > 0); - for (size_t i = 0; i < m_samplesInRecurrentStep; i++) + for (size_t i = 0; i < GetNumParallelSequences(); i++) { ErrorSignalToTransitionNode( Inputs(0)->FunctionValues().ColumnSlice(i * nstep, nstep), diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h index 224835b5f665..9a639422aff3 100644 --- a/Math/Math/Matrix.h +++ b/Math/Math/Matrix.h @@ -29,10 +29,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { // TODO: This may not belong here, but having it in ComputeNode would require syntax changes, while having it as a member here only requires a local find-replace. Let's make it work first, then decide how to refactor. // the looping versions of EvaluateThisNode() and ComputeInputPartial() take a frame range, through this structure // It can cast from a size_t, i.e. those functions can be called passing a size_t in place of the FrameRange. - // TODO: m_samplesInRecurrentStep should be subsumed here & removed from nodes + // TODO: GetNumParallelSequences() should be subsumed here & removed from nodes // TODO: Where this design currently breaks: - // - BatchModeNodes must access m_samplesInRecurrentStep, yet operate on the whole sequence - // - likewise, LSTMNode does its own iteration, hence needs access to m_samplesInRecurrentStep or NumCols() in the whole-batch iterator + // - BatchModeNodes must access GetNumParallelSequences(), yet operate on the whole sequence + // - likewise, LSTMNode does its own iteration, hence needs access to GetNumParallelSequences() or NumCols() in the whole-batch iterator // - RecurrentNodes access frames with a time shift, where out-of-bounds ones access a different matrix' values // - RecurrentNodes iterate over individual slices--need a sub-setting constructor from a FrameRange to another? 
// - RecurrentNodes access boundary info with a similar pattern, but boundary info has a different #streams (namely, 1)

From 42924c43a9efdd128c9da4c93dd5c18094111130 Mon Sep 17 00:00:00 2001
From: erw
Date: Fri, 18 Sep 2015 18:06:13 -0700
Subject: [PATCH 09/44] Add ReviseParameter function to MEL so users can revise
 the parameters of a given model.

An example:
> cat example.config
command=edit
precision=float
deviceId=-1
edit=[
    action=edit
    editPath=example.mel
]
> cat example.mel
model1 = LoadModel("lstm.model.100", format=cntk);
ReviseParameter("GlobalPrior", //path/to/prior/vector);
SaveModel(model1, "lstm.model.100.priorfloored");
> cat //path/to/prior/vector
5.2499845e-006
4.88998558e-006
4.89998547e-006
0.000525058422
0.000597978244
...
> $CNTK configFile=example.config
---
 MachineLearning/CNTK/ModelEditLanguage.cpp | 24 +++++++++++++++++++
 .../InputAndParamNodes.h                   | 18 ++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/MachineLearning/CNTK/ModelEditLanguage.cpp b/MachineLearning/CNTK/ModelEditLanguage.cpp
index 8e6641be5ee5..a31c4f386574 100644
--- a/MachineLearning/CNTK/ModelEditLanguage.cpp
+++ b/MachineLearning/CNTK/ModelEditLanguage.cpp
@@ -592,6 +592,30 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
             netNdlFrom->cn->RenameNode(node, nodeName.second);
         }
     }
+    else if (EqualInsensitive(name, "ReviseParameter"))
+    {
+        typedef LearnableParameter<ElemType> LearnableParameterNode;
+        if (params.size() != 2)
+            RuntimeError("Invalid number of parameters: Valid parameters are: ReviseParameter(nodeName, nodeParametersInASCIIPathName)");
+        std::string nodeName = params[0];
+        std::string paramPath = params[1];
+
+        NetNdl<ElemType>* netNdl;
+        vector<ComputationNodeBasePtr> nodes = FindSymbols(params[0], netNdl);
+
+        for (auto pNodes : nodes)
+        {
+            if (pNodes->OperationName() != LearnableParameter<ElemType>::TypeName())
+            {
+                fprintf(stderr, "WARNING: you want to change the parameter of node (%ls), but it is not a learnable parameter (it is a %ls node). Skipping this node\n",
+                    pNodes->NodeName().c_str(), pNodes->OperationName().c_str());
+                continue;
+            }
+            shared_ptr<LearnableParameterNode> pParamNode = std::dynamic_pointer_cast<LearnableParameterNode>(pNodes);
+            pParamNode->ReviseFromFile(msra::strfun::mbstowcs(paramPath));
+            fprintf(stderr, "Revise node %ls using parameter file %s\n", pNodes->NodeName().c_str(), paramPath.c_str());
+        }
+    }
     else
     {
         RuntimeError("Unknown Editor function %s", name.c_str());

diff --git a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h
index 1be91c8ef041..c0c551c31252 100644
--- a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h
@@ -111,6 +111,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             auto array = File::LoadMatrixFromTextFile<ElemType>(msra::strfun::utf8(initFromFilePath), numRows, numCols); // TODO: change pathname to wstring
             FunctionValues().SetValue(numRows, numCols, array.data(), matrixFlagNormal, m_deviceId);
         }
+
+        void ReviseFromFile(const std::wstring & reviseFromFilePath)
+        {
+            size_t numRows = 0;
+            size_t numCols = 0;
+            auto array = File::LoadMatrixFromTextFile<ElemType>(msra::strfun::utf8(reviseFromFilePath), numRows, numCols); // TODO: change pathname to wstring
+            size_t nRows = m_functionValues.GetNumRows();
+            size_t nCols = m_functionValues.GetNumCols();
+
+            if (numRows != nRows || numCols != nCols)
+            {
+                RuntimeError("Error in ReviseFromFile for node %ls using file %ls: original size (%d x %d) vs current size (%d x %d)",
+                    m_nodeName.c_str(), reviseFromFilePath.c_str(), nRows, nCols, numRows, numCols);
+            }
+
+            FunctionValues().SetValue(numRows, numCols, array.data(), matrixFlagNormal, m_deviceId);
+
+        }

         virtual const std::wstring OperationName() const {return TypeName();}
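ReviseFromFile above deliberately refuses to change a parameter's dimensions: only the values are overwritten, so the model's topology stays intact. A minimal standalone sketch of that contract (the type and member names here are stand-ins for illustration, not the actual CNTK classes):

    #include <stdexcept>
    #include <vector>

    // Stand-in for a learnable parameter: a value matrix of fixed shape.
    struct ParamValues
    {
        size_t rows, cols;
        std::vector<float> data; // rows * cols entries

        // Same rule as ReviseFromFile: data loaded from the file must match the
        // existing dimensions exactly; only the contents are replaced.
        void Revise(size_t fileRows, size_t fileCols, const std::vector<float>& fileData)
        {
            if (fileRows != rows || fileCols != cols || fileData.size() != rows * cols)
                throw std::runtime_error("Revise: size mismatch");
            data = fileData; // values change; shape (and hence the graph) does not
        }
    };

In the commit-message example, "GlobalPrior" would be such a fixed-shape vector, and the prior file must contain exactly as many values as the node already holds.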
From 93d93e0a5abcb93eba4a53be23414d2ecbe35281 Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Fri, 18 Sep 2015 20:53:24 -0700
Subject: [PATCH 10/44] ComputationNetwork now has a second layout, pMBNoLayout,
 which matches pMBLayout in #sequences but is otherwise empty, and is used for
 nodes that do not require sequential processing; m_samplesInRecurrentStep now
 gone from ComputationNode, if needed, the value is determined from
 pMBLayout--yay! One more down; Matrix::SetValue() now happily accepts empty
 matrices (no reason why it should not); (made gc happy again)
---
 .../ComputationNetwork.h | 22 +++++++++---------
 .../ComputationNode.h    | 23 ++++++++-----------
 Math/Math/Matrix.cpp     | 10 ++++----
 Math/Math/Matrix.h       |  7 ++++--
 Tests/Speech/README.txt  |  7 +++++-
 5 files changed, 38 insertions(+), 31 deletions(-)

diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
index adf9136b9ad5..d251bada48b6 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
@@ -75,7 +75,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
     // -----------------------------------------------------------------------

     ComputationNetwork(DEVICEID_TYPE deviceId = AUTOPLACEMATRIX) :
-        m_deviceId(deviceId), m_pMBLayout(make_shared<MBLayout>())
+        m_deviceId(deviceId), m_pMBLayout(make_shared<MBLayout>()), m_pMBNoLayout(make_shared<MBLayout>())
     {
         m_randomSeedOffset = 0;
         m_actMiniBSize = 0;
@@ -595,12 +595,10 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb

     // TODO: rename to ForwardProp()? To make it very clear?
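    // A sketch of what the two layouts look like side by side (illustration only;
    // the numbers are made up, and it assumes the Resize(numParallelSequences,
    // numTimeSteps) call used inside Evaluate() below):
    //     m_pMBLayout   : 3 sequences x 20 time steps, with per-frame packing flags
    //     m_pMBNoLayout : 3 sequences x  0 time steps, no flags at all
    // Both report the same GetNumParallelSequences(), but IsAllNone() is true for
    // the dummy layout, so nodes that run in batch mode never consult (or mask by)
    // sequence boundaries.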
void Evaluate(const ComputationNodeBasePtr rootNode) { - // checks that will disappear once we complete the refactoring. If this passes for a while, we will eliminate one - // If this fails, comment this out (it is safe) and tell fseide@microsoft.com. - if (GetNumParallelSequences() != m_pMBLayout->GetNumParallelSequences()) - LogicError("Evaluate: detected that m_nbrSlicesInEachRecurrentIteration != m_pMBLayout->GetNumParallelSequences()"); - if (m_pMBLayout->GetNumTimeSteps() != m_pMBLayout->GetSize()) - LogicError("Evaluate: detected that m_pMBLayout->GetNumTimeSteps() != m_pMBLayout->GetSize()"); + // We have a matching layout structure that matches pMBLayout in number of sequences while not having any flags set. + // This is used for nodes that do not need recurrent processing, but can be done in batch. + // TODO: Does it harm if we have flags, for those that can be done in batch? I.e. why don't we just always provide flags? + m_pMBNoLayout->Resize(m_pMBLayout->GetNumParallelSequences(), 0); // TODO: this is not nice, but we currently have no trigger to detect changes in layout // prepare to compute with the subnetwork that this rootNode depends on, including // - auto-detecting recurrent loops @@ -623,10 +621,11 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // TODO: in the future, these will be different on different nodes for (auto nodeIter = allNodes.begin(); nodeIter != allNodes.end(); nodeIter++) { - // TODO: nbrSlices set once to the same value for all nodes each evaluation--is it ever changed later? - (*nodeIter)->SetNumParallelSequences(GetNumParallelSequences()); if ((*nodeIter)->ReqMultiSeqHandling()) (*nodeIter)->ResetBound(m_pMBLayout); + else + (*nodeIter)->ResetBound(m_pMBNoLayout); + (*nodeIter)->VerifyNumParallelSequences(GetNumParallelSequences()); } for (auto nodeIter = allNodes.begin(); nodeIter != allNodes.end(); nodeIter++) @@ -723,7 +722,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb { for (auto nodeIter = recurrentNodes.rbegin(); nodeIter != recurrentNodes.rend(); ++nodeIter) { - (*nodeIter)->SetNumParallelSequences(GetNumParallelSequences()); // TODO: move to FrameRange object + (*nodeIter)->VerifyNumParallelSequences(GetNumParallelSequences()); // TODO: move to FrameRange object (*nodeIter)->ComputeGradientForChildren(timeIndex); } } @@ -734,7 +733,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb { for (auto nodeIter = recurrentNodes.rbegin(); nodeIter != recurrentNodes.rend(); ++nodeIter) { - (*nodeIter)->SetNumParallelSequences(GetNumParallelSequences()); + (*nodeIter)->VerifyNumParallelSequences(GetNumParallelSequences()); (*nodeIter)->ComputeGradientForChildren(timeIndex); } } @@ -1580,6 +1579,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // used for sentence boundary information passed from reader to reset RNN state // specify how the minibatch is packed for each sample MBLayoutPtr m_pMBLayout; + MBLayoutPtr m_pMBNoLayout; // this one is a dummy, passed when no layout is available/should be used int m_actMiniBSize; diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index 8b5f0265f403..28887e96e55f 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -246,23 +246,20 @@ namespace Microsoft { namespace MSR { namespace CNTK { return 
m_loopId; } - // TODO: these two will disappear once the information is correctly held in a FrameRange record - // This is called at 3 places; two are directly before ComputeGradientForChildren(). - void SetNumParallelSequences(size_t bsz) + // temporary function that is called to verify stuff is called as I think it is. Delete if this does not fire for a while. + void VerifyNumParallelSequences(size_t bsz) { - m_samplesInRecurrentStep = bsz; + //m_samplesInRecurrentStep = bsz; + if (bsz != m_pMBLayout->GetNumParallelSequences()) + LogicError("VerifyNumParallelSequences: value inconsistent with MB layout"); } - // Note: only used in one place, SimpleEvaluator.h PreComputeActivityAtTime(). - // The member is, however, read out at 284 places inside nodes, - // most of the time as + // This is used at 284 places inside nodes, most of the time as // FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()) - // This expression will be turned into a function call to right here, so that we compute this only at one place - // and can also handle the full-minibatch case. - // Let us try to get this member out of this class altogether; it belongs elsewhere. size_t GetNumParallelSequences() const { - return m_samplesInRecurrentStep; + //return m_samplesInRecurrentStep; + return m_pMBLayout->GetNumParallelSequences(); } int64_t UpdateEvalTimeStamp() @@ -682,7 +679,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { /// the order in reverse graph. int m_visitedOrder; int m_index; - int m_lowlink; + int m_lowlink; // TODO: comment this, as it is not obvious bool m_visited; bool m_inStack; int m_indexInLoop; @@ -1281,7 +1278,7 @@ protected: \ using Base::m_visitedOrder; using Base::m_index; using Base::m_lowlink; using Base::m_visited; using Base::m_inStack; \ using Base::m_indexInLoop; \ using Base::m_pMBLayout; \ - using Base::m_reqMultiSeqHandling; using Base::UseCustomizedMultiSeqHandling; \ + using Base::m_reqMultiSeqHandling; using Base::UseCustomizedMultiSeqHandling; using Base::GetNumParallelSequences; \ using Base::m_children; using Base::m_deviceId; using Base::m_evalTimeStamp; using Base::m_functionValues; using Base::m_gradientValues; \ using Base::m_inputChannels; using Base::m_inputHeight; using Base::m_inputWidth; using Base::m_needGradient; using Base::m_nodeName; \ using Base::m_outputChannels; using Base::m_outputHeight; using Base::m_outputWidth; using Base::s_constOnes; using Base::s_timeStampCounter; \ diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp index 31199feb646a..e122905be16f 100644 --- a/Math/Math/Matrix.cpp +++ b/Math/Math/Matrix.cpp @@ -1005,8 +1005,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { template void Matrix::SetValue(const ElemType v) { - if (IsEmpty()) - throw std::logic_error("SetValue: Matrix is empty."); + if (IsEmpty()) // if empty then we are done + return; + //throw std::logic_error("SetValue: Matrix is empty."); DISPATCH_MATRIX_ON_FLAG(this, this, @@ -1020,8 +1021,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { template void Matrix::SetValue(const DeviceBoundNumber& db_number) { - if (IsEmpty()) - throw std::logic_error("SetValue: Matrix is empty."); + if (IsEmpty()) // if empty then we are done + return; + //throw std::logic_error("SetValue: Matrix is empty."); DISPATCH_MATRIX_ON_FLAG(this, this, diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h index 9a639422aff3..2fc25539fb26 100644 --- a/Math/Math/Matrix.h +++ b/Math/Math/Matrix.h @@ 
-602,10 +602,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { // these accessors were for now just collected from actual usage; need to be cleaned up once this compiles again size_t GetNumTimeSteps() const { validate(); return m_sentenceBoundaryFlags.GetNumCols(); } - size_t GetNumParallelSequences() const { return IsAllNone() ? 1 : m_sentenceBoundaryFlags.GetNumRows(); } // 1 stream if no matrix + size_t GetNumParallelSequences() const { return (m_sentenceBoundaryFlags.GetNumRows() == 0) ? 1 : m_sentenceBoundaryFlags.GetNumRows(); } // 1 stream if no matrix size_t GetSize() const { validate(); return m_minibatchPackingFlags.size(); } - // ^^ TODO: add a check whether Size() == GetNumTimeSteps(); it really should, unless I misunderstood + + // if we have no matrix/vector, this means no frame has any flag set + // We still can have a number of rows in this case. bool IsAllNone() const { validate(); return m_minibatchPackingFlags.empty(); } + #if 0 // we have this pattern often: // TODO: mbSize and #slices must also move into MBLayout evalnet->SetActualMiniBatchSize(mbSize); diff --git a/Tests/Speech/README.txt b/Tests/Speech/README.txt index 93287fcbea3a..58ce4785c15c 100644 --- a/Tests/Speech/README.txt +++ b/Tests/Speech/README.txt @@ -26,8 +26,13 @@ bin/cntk configFile=Tests/Speech/QuickE2E/cntk.config RunDir=Tests/Speech/RunDir WORKING DIR: $(SolutionDir)Tests\Speech\Data COMMAND: configFile=$(SolutionDir)Tests\Speech\LSTM\cntk.config stderr=$(SolutionDir)Tests\Speech\RunDir\LSTM\models\cntkSpeech.dnn.log RunDir=$(SolutionDir)Tests\Speech\RunDir\LSTM NdlDir=$(SolutionDir)Tests\Speech\LSTM DataDir=$(SolutionDir)Tests\Speech\Data DeviceId=Auto +--- MNIST: + +WORKING DIR: $(SolutionDir)ExampleSetups\Image\MNIST +COMMAND: configFile=02_Conv.config configName=02_Conv + + Simple test ----------- -../build/debug/bin/cntk configFile=/home/cbasoglu/src/cntk/.run-linux/Simple.conf COMMAND: configFile=$(SolutionDir)Demos\Simple\Simple.config stderr=$(SolutionDir)Demos\Simple\RunDir\Simple.config.log RootDir=$(SolutionDir) DeviceNumber=-1 From a1173a48ccb3381f4a6e1798902a0237827aff57 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 18 Sep 2015 21:12:13 -0700 Subject: [PATCH 11/44] fixed the spelling/casing of a few recurrence-related ComputationNode member accessors --- .../ComputationNetwork.cpp | 34 ++++---- .../ComputationNode.h | 86 +++++-------------- .../EvaluationCriterionNodes.h | 2 +- .../LinearAlgebraNodes.h | 20 ++--- 4 files changed, 51 insertions(+), 91 deletions(-) diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp index 960cf8c45d54..681d7d09b6d7 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp @@ -408,9 +408,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { void ComputationNetwork::ClearCalcOrderCaches() { - for (std::map>::iterator it = m_cacheEvalOrders.begin(); it != m_cacheEvalOrders.end(); ++it) - for (auto iter2 = m_cacheEvalOrders[it->first].begin(); iter2 != m_cacheEvalOrders[it->first].end(); iter2++) - (*iter2)->clearCache(); + for (auto it : m_cacheEvalOrders) + for (auto iter2 : m_cacheEvalOrders[it.first]) + iter2->ClearCache(); m_cacheEvalOrders.clear(); m_cacheGradientCalcOrders.clear(); } @@ -419,15 +419,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { { /// merge loops if they have the same source node std::vector m_recurrentInfoTmp; - 
if (m_recurrentInfo.size() <= 1) - return; + if (m_recurrentInfo.size() <= 1) + return; for (auto iter = m_recurrentInfo.begin(); iter != m_recurrentInfo.end(); iter++) { if (m_recurrentInfoTmp.size() == 0) { RecurrentInfo rInfo; - rInfo.Copy(*iter); + rInfo.Copy(*iter); m_recurrentInfoTmp.push_back(rInfo); } else @@ -476,7 +476,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { std::list sccStack; size_t index = 0; size_t loopId = 0; - if (rootNode->isVisisted() == false) + if (rootNode->IsVisisted() == false) strongSCC(rootNode, sccStack, index, loopId); } @@ -486,7 +486,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t& index, size_t& loopId) { cur->SetIndex(index); - cur->Setlowlink(index); + cur->SetLowLink(index); index++; cur->SetVisited(true); @@ -498,19 +498,19 @@ namespace Microsoft { namespace MSR { namespace CNTK { // pairnetwork is the socket from other network, so ignore its children, which are in the other networks for (int i = 0; i < cur->ChildrenSize(); i++) { - if (cur->GetChildren()[i]->isVisisted() == false) + if (cur->GetChildren()[i]->IsVisisted() == false) { strongSCC(cur->GetChildren()[i], sccStack, index, loopId); - cur->Setlowlink(min(cur->Getlowlink(), cur->GetChildren()[i]->Getlowlink())); + cur->SetLowLink(min(cur->GetLowLink(), cur->GetChildren()[i]->GetLowLink())); } - else if (cur->GetChildren()[i]->isInStack()) + else if (cur->GetChildren()[i]->IsInStack()) { - cur->Setlowlink(min(cur->Getlowlink(), cur->GetChildren()[i]->Getlowlink())); + cur->SetLowLink(min(cur->GetLowLink(), cur->GetChildren()[i]->GetLowLink())); } } } - if (cur->Getlowlink() == cur->GetIndex()) // something special has happened --TODO: comment what that was!! + if (cur->GetLowLink() == cur->GetIndex()) // something special has happened --TODO: comment what that was!! { RecurrentInfo rInfo; rInfo.m_loopId = loopId; @@ -549,7 +549,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { cur->OperationName() != OperationNameOf(FutureValueNode)) { for (size_t i = 0; i < cur->ChildrenSize(); i++) - if (cur->GetChildren()[i]->LoopId() == cur->LoopId()) + if (cur->GetChildren()[i]->GetLoopId() == cur->GetLoopId()) getLoopForwordOrder(visited, recStack, nodesStack, cur->GetChildren()[i]); } recStack.erase(cur); @@ -626,7 +626,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputationNodeBasePtr nodeRecIter = (*iter).m_recurrentNodes[j]; for (size_t i = 0; i < nodeRecIter->ChildrenSize(); i++) { - if (nodeRecIter->GetChildren()[i]->LoopId() == nodeRecIter->LoopId() && + if (nodeRecIter->GetChildren()[i]->GetLoopId() == nodeRecIter->GetLoopId() && nodeRecIter->OperationName() != OperationNameOf(PastValueNode) && nodeRecIter->OperationName() != OperationNameOf(FutureValueNode)) // TODO: test for type RecurrentNode instead? 
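// Note on strongSCC()/m_lowLink above: this is Tarjan's strongly-connected-components
// algorithm. m_lowLink is the smallest DFS index reachable from a node's DFS subtree
// using at most one back edge; a node whose lowlink equals its own DFS index is the
// root of an SCC, and each multi-node SCC is one recurrent loop. A minimal
// self-contained sketch of the idea (hypothetical types, not the CNTK code):

#include <algorithm>
#include <list>
#include <vector>

struct LoopNode                         // stand-in for a node's loop-detection state
{
    std::vector<LoopNode *> children;
    int index = -1, lowLink = -1, loopId = -1;
    bool visited = false, inStack = false;
};

static void StrongSCC(LoopNode * cur, std::list<LoopNode *> & sccStack, int & index, int & loopId)
{
    cur->index = cur->lowLink = index++;
    cur->visited = cur->inStack = true;
    sccStack.push_back(cur);
    for (LoopNode * child : cur->children)
    {
        if (!child->visited)            // tree edge: recurse, then propagate lowlink up
        {
            StrongSCC(child, sccStack, index, loopId);
            cur->lowLink = std::min(cur->lowLink, child->lowLink);
        }
        else if (child->inStack)        // back edge into the current DFS stack
            cur->lowLink = std::min(cur->lowLink, child->lowLink);
    }
    if (cur->lowLink == cur->index)     // 'cur' roots an SCC: pop its members off as one loop
    {
        LoopNode * member;
        do
        {
            member = sccStack.back();
            sccStack.pop_back();
            member->inStack = false;
            member->loopId = loopId;    // single-node SCCs would be filtered out afterwards
        } while (member != cur);
        loopId++;
    }
}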
{ @@ -681,8 +681,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { DetermineLoopTypes(); - for (auto iter = nodes.begin(); iter != nodes.end(); iter++) - (*iter)->clearCache(); + for (auto iter : nodes) + iter->ClearCache(); } void ComputationNetwork::DetermineLoopTypes() diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index 28887e96e55f..b873a9612345 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -75,7 +75,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_samplesInRecurrentStep(1), m_visitedOrder(-1), m_index(-1), - m_lowlink(-1), + m_lowLink(-1), m_indexInLoop(0), m_visited(false), m_inStack(false), @@ -172,79 +172,39 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_pMBLayout = pMBLayout; } - void SetLoopId(const int id) { m_loopId = id; } - void SetVisitedOrder(const int id) { m_visitedOrder = id; } - void SetIndex(const size_t ind) { m_index = ind; } - - void Setlowlink(const size_t lowlink) - { - m_lowlink = lowlink; - } - - void SetVisited(const bool visited) - { - m_visited = visited; - } - - void SetInStack(const bool instack) - { - m_inStack = instack; - } - - void SetIndexInLoop(const size_t index) - { - m_indexInLoop = index; - } - - void clearCache() + void ClearCache() { m_loopId = -1; m_visitedOrder = -1; m_index = -1; - m_lowlink = -1; + m_lowLink = -1; m_indexInLoop = 0; m_visited = false; m_inStack = false; } - size_t GetIndex() const - { - return m_index; - } + void SetLoopId(const int id) { m_loopId = id; } + int GetLoopId() const { return m_loopId; } - size_t GetVisitedOrder() const - { - return m_visitedOrder; - } + void SetVisitedOrder(const int id) { m_visitedOrder = id; } + size_t GetVisitedOrder() const { return m_visitedOrder; } - size_t Getlowlink() const - { - return m_lowlink; - } + void SetIndex(const size_t ind) { m_index = ind; } + size_t GetIndex() const { return m_index; } - size_t GetIndexInLoop() const - { - return m_indexInLoop; - } + void SetLowLink(const size_t lowlink) { m_lowLink = lowlink; } + size_t GetLowLink() const { return m_lowLink; } - std::wstring GetName() const - { - return m_nodeName; - } + void SetVisited(const bool visited) { m_visited = visited; } + bool IsVisisted() const { return m_visited; } - bool isVisisted() const - { - return m_visited; - } + void SetInStack(const bool instack) { m_inStack = instack; } + bool IsInStack() const { return m_inStack; } - bool isInStack() const - { - return m_inStack; - } - int LoopId() const - { - return m_loopId; - } + void SetIndexInLoop(const size_t index) { m_indexInLoop = index; } + size_t GetIndexInLoop() const { return m_indexInLoop; } + + std::wstring GetName() const { return m_nodeName; } // temporary function that is called to verify stuff is called as I think it is. Delete if this does not fire for a while. 
void VerifyNumParallelSequences(size_t bsz) @@ -523,8 +483,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (!IsLeaf()) m_needGradient = ChildrenNeedGradient(); //only nodes that require gradient calculation is included in gradient calculation - if (LoopId() >= 0) - recurrentResult[LoopId()].push_back(shared_from_this()); + if (GetLoopId() >= 0) + recurrentResult[GetLoopId()].push_back(shared_from_this()); else noRecurrentResult.push_back(shared_from_this()); //we put this in the list even if it's leaf since we need to use it to determine learnable params } @@ -679,7 +639,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { /// the order in reverse graph. int m_visitedOrder; int m_index; - int m_lowlink; // TODO: comment this, as it is not obvious + int m_lowLink; // TODO: comment this, as it is not obvious bool m_visited; bool m_inStack; int m_indexInLoop; @@ -1275,7 +1235,7 @@ public: \ using Base::SaveToFile; using Base::SetFunctionAndGradientSize; using Base::SetInput; using Base::Validate; \ protected: \ using Base::m_loopId; using Base::m_samplesInRecurrentStep; \ - using Base::m_visitedOrder; using Base::m_index; using Base::m_lowlink; using Base::m_visited; using Base::m_inStack; \ + using Base::m_visitedOrder; using Base::m_index; using Base::m_lowLink; using Base::m_visited; using Base::m_inStack; \ using Base::m_indexInLoop; \ using Base::m_pMBLayout; \ using Base::m_reqMultiSeqHandling; using Base::UseCustomizedMultiSeqHandling; using Base::GetNumParallelSequences; \ diff --git a/MachineLearning/CNTKComputationNetworkLib/EvaluationCriterionNodes.h b/MachineLearning/CNTKComputationNetworkLib/EvaluationCriterionNodes.h index 004c0c449abd..a60ea5c604e1 100644 --- a/MachineLearning/CNTKComputationNetworkLib/EvaluationCriterionNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/EvaluationCriterionNodes.h @@ -90,7 +90,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("ErrorPrediction operation: one of the operants has 0 element."); if (((!(Inputs(0)->FunctionValues().GetNumRows() == Inputs(1)->FunctionValues().GetNumRows() && //match size - Inputs(0)->FunctionValues().GetNumCols() == Inputs(1)->FunctionValues().GetNumCols()) )) && Inputs(0)->LoopId() < 0) + Inputs(0)->FunctionValues().GetNumCols() == Inputs(1)->FunctionValues().GetNumCols()) )) && Inputs(0)->GetLoopId() < 0) { LogicError("The Matrix dimension in the ErrorPrediction operation does not match."); } diff --git a/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h b/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h index ebc7eef2d904..69810bfd5674 100644 --- a/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h @@ -832,22 +832,22 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t rows0 = Inputs(0)->FunctionValues().GetNumRows(), cols0 = Inputs(0)->FunctionValues().GetNumCols(); size_t rows1 = Inputs(1)->FunctionValues().GetNumRows(), cols1 = Inputs(1)->FunctionValues().GetNumCols(); - if ((rows0 == 0 || cols1 == 0 ) && this->LoopId() < 0) + if ((rows0 == 0 || cols1 == 0 ) && this->GetLoopId() < 0) throw logic_error("Times operation: Inputs(0)->FunctionValues().GetNumRows() and Inputs(1)->FunctionValues().GetNumCols() should not be 0 since it cannot be automatically inferred"); // TODO: use dynamic_pointer_cast // TODO: why should these nodes even care whether their inputs are LearnableParmaeters? If needed, can the base class do this? 
- if ((Inputs(0)->OperationName() == OperationNameOf(LearnableParameter) && cols0 == 0 && rows1 != 0) && this->LoopId() < 0) + if ((Inputs(0)->OperationName() == OperationNameOf(LearnableParameter) && cols0 == 0 && rows1 != 0) && this->GetLoopId() < 0) Inputs(0)->FunctionValues().Resize(rows0, rows1); if (Inputs(1)->OperationName() == OperationNameOf(LearnableParameter) && cols0 != 0 && rows1 == 0) Inputs(1)->FunctionValues().Resize(cols0, cols1); - if ((Inputs(0)->FunctionValues().HasNoElements() || Inputs(1)->FunctionValues().HasNoElements())&& this->LoopId() < 0) + if ((Inputs(0)->FunctionValues().HasNoElements() || Inputs(1)->FunctionValues().HasNoElements())&& this->GetLoopId() < 0) LogicError("Times operation: One of the operants has 0 elements."); //cols0 and rows1 may have been changed so don't use them in the following check - if ((Inputs(1)->FunctionValues().GetNumRows() != Inputs(0)->FunctionValues().GetNumCols()) && this->LoopId() < 0) + if ((Inputs(1)->FunctionValues().GetNumRows() != Inputs(0)->FunctionValues().GetNumCols()) && this->GetLoopId() < 0) { LogicError("The Matrix dimension in the Times operation does not match."); } @@ -1000,20 +1000,20 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t rows0 = Inputs(0)->FunctionValues().GetNumRows(), cols0 = Inputs(0)->FunctionValues().GetNumCols(); size_t rows1 = Inputs(1)->FunctionValues().GetNumRows(), cols1 = Inputs(1)->FunctionValues().GetNumCols(); - if ((rows0 == 0 || cols1 == 0) && this->LoopId() < 0) + if ((rows0 == 0 || cols1 == 0) && this->GetLoopId() < 0) throw logic_error("TransposeTimes operation: Inputs(0)->FunctionValues().GetNumRows() and Inputs(1)->FunctionValues().GetNumCols() should not be 0 since it cannot be automatically inferred"); - if ((Inputs(0)->OperationName() == OperationNameOf(LearnableParameter) && cols0 == 0 && rows1 != 0) && this->LoopId() < 0) + if ((Inputs(0)->OperationName() == OperationNameOf(LearnableParameter) && cols0 == 0 && rows1 != 0) && this->GetLoopId() < 0) Inputs(0)->FunctionValues().Resize(rows0, rows1); if (Inputs(1)->OperationName() == OperationNameOf(LearnableParameter) && cols0 != 0 && rows1 == 0) Inputs(1)->FunctionValues().Resize(cols0, cols1); - if ((Inputs(0)->FunctionValues().HasNoElements() || Inputs(1)->FunctionValues().HasNoElements()) && this->LoopId() < 0) + if ((Inputs(0)->FunctionValues().HasNoElements() || Inputs(1)->FunctionValues().HasNoElements()) && this->GetLoopId() < 0) LogicError("TransposeTimes operation: One of the operants has 0 elements."); //cols0 and rows1 may have been changed so don't use them in the following check - if ((Inputs(1)->FunctionValues().GetNumRows() != Inputs(0)->FunctionValues().GetNumRows()) && this->LoopId() < 0) + if ((Inputs(1)->FunctionValues().GetNumRows() != Inputs(0)->FunctionValues().GetNumRows()) && this->GetLoopId() < 0) { LogicError("The Matrix dimension in the TransposeTimes operation does not match."); } @@ -1679,7 +1679,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { Inputs(index)->FunctionValues().Resize(rows, cols); } - if ((Inputs(0)->FunctionValues().HasNoElements() || Inputs(1)->FunctionValues().HasNoElements()) && this->LoopId() < 0) + if ((Inputs(0)->FunctionValues().HasNoElements() || Inputs(1)->FunctionValues().HasNoElements()) && this->GetLoopId() < 0) LogicError("Plus operation: one of the operants has 0 element."); size_t rows0 = Inputs(0)->FunctionValues().GetNumRows(), cols0 = Inputs(0)->FunctionValues().GetNumCols(); @@ -1689,7 +1689,7 @@ namespace Microsoft { namespace MSR { 
namespace CNTK { !((rows0 == 1 || rows1 == 1) && cols0 == cols1) && //one is row vec !( (cols0 > cols1 && cols0 % cols1 == 0) || (cols0 == 1 && rows1 % rows0 == 0) || - (cols1 == 1 && rows0 % rows1 == 0))) && this->LoopId() < 0) //one is col vec with divisable rows, including scalar + (cols1 == 1 && rows0 % rows1 == 0))) && this->GetLoopId() < 0) //one is col vec with divisable rows, including scalar { LogicError("The Matrix dimension in the Plus operation does not match."); } From 5b27cda6a8b43929c9ae584bc3fa84454a77eb49 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 18 Sep 2015 21:19:27 -0700 Subject: [PATCH 12/44] (made nvcc/Linux happy) --- Math/Math/MatrixQuantizerGPU.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Math/Math/MatrixQuantizerGPU.h b/Math/Math/MatrixQuantizerGPU.h index 9247bcbf1ce5..0435452cfc6a 100644 --- a/Math/Math/MatrixQuantizerGPU.h +++ b/Math/Math/MatrixQuantizerGPU.h @@ -1,7 +1,7 @@ #pragma once +#include "QuantizedMatrix.h" // TODO: strangely, this must be included first, although it is the first thing MatrixQuantizer.h includes. Without, nvcc fails. #include "MatrixQuantizer.h" -#include "QuantizedMatrix.h" #include "ColumnQuantizer.h" #include "GPUMatrix.h" #ifndef CPUONLY From 4b7b6855c96f67387c73f11d60f9f0523789de3e Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 18 Sep 2015 21:27:12 -0700 Subject: [PATCH 13/44] HTKMLFReader::CopyMBLayoutTo() now resets pMBLayout to default in frame mode (instead of leaving it untouched--readers should not make assumptions on downstream consumers' defaults) --- DataReader/HTKMLFReader/HTKMLFReader.cpp | 3 ++- Math/Math/Matrix.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/DataReader/HTKMLFReader/HTKMLFReader.cpp b/DataReader/HTKMLFReader/HTKMLFReader.cpp index ec015a65e51d..8ee76f3152c7 100644 --- a/DataReader/HTKMLFReader/HTKMLFReader.cpp +++ b/DataReader/HTKMLFReader/HTKMLFReader.cpp @@ -1607,7 +1607,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (!m_framemode) *pMBLayout = *m_pMBLayout; - // TODO: what about frame mode? Should we create a dummy one? Or Clear() it? Reader should not know what ComputationNetworks' defaults are. + else + pMBLayout->SetAllNone(); // no flags in frame mode } diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h index 2fc25539fb26..34eac851b7f8 100644 --- a/Math/Math/Matrix.h +++ b/Math/Math/Matrix.h @@ -608,6 +608,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // if we have no matrix/vector, this means no frame has any flag set // We still can have a number of rows in this case. 
bool IsAllNone() const { validate(); return m_minibatchPackingFlags.empty(); } + void SetAllNone() { Resize(0, 0); } #if 0 // we have this pattern often: // TODO: mbSize and #slices must also move into MBLayout From a47889e500d642789faf4d0da3ad1b885f81b972 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 18 Sep 2015 22:14:24 -0700 Subject: [PATCH 14/44] unified the two versions of FindInRecurrentLoops(); renamed m_actMiniBSize to m_actualMBSize; moved EvaluateLoop() back to Evaluate(), no point in having a separate function --- .../ComputationNetwork.cpp | 4 +- .../ComputationNetwork.h | 147 ++++++++++-------- MachineLearning/CNTKSGDLib/SGD.cpp | 8 +- 3 files changed, 87 insertions(+), 72 deletions(-) diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp index 681d7d09b6d7..8accfc5dfde5 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp @@ -743,10 +743,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { accessed.assign(m_recurrentInfo.size(), false); for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++) { - int iId = FindInRecurrentLoop(*nodeIter); + const vector* pRecurrentNodesDummy; + int iId = FindInRecurrentLoops(*nodeIter, pRecurrentNodesDummy); if (iId >= 0) { - if (!accessed[iId]) { newList.insert(newList.end(), diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h index d251bada48b6..b1ea714fd516 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h @@ -78,7 +78,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb m_deviceId(deviceId), m_pMBLayout(make_shared()), m_pMBNoLayout(make_shared()) { m_randomSeedOffset = 0; - m_actMiniBSize = 0; + m_actualMBSize = 0; SetDeviceId(deviceId); } @@ -513,24 +513,26 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // evaluation // ----------------------------------------------------------------------- - int FindInRecurrentLoop(const ComputationNodeBasePtr startNode, vector& recurrentNodes) + // find if node is part of a recurrent loop; and return the loop id + // If found then return a pointer to the list of nodes of this loop. + // TODO: This should just return &m_recurrentInfo of the matching loop, or nullptr if no match. If needed, m_recurrentInfo knows its loop id. 
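// (A hypothetical call site, to illustrate the contract -- not code from this patch:
//      const vector<ComputationNodeBasePtr> * pLoopNodes = nullptr;
//      int loopId = FindInRecurrentLoops(node, pLoopNodes);
//      if (loopId != -1)   // node belongs to loop 'loopId'; *pLoopNodes lists the
//          ...             // loop's members in forward order, processed frame by frame
//  A return value of -1 means the node is in no loop and can be evaluated over the
//  entire minibatch in one go.)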
+ int FindInRecurrentLoops(const ComputationNodeBasePtr node, const vector* & pRecurrentNodes) const { - int iFound = -1; - - for (auto iter = m_recurrentInfo.begin(); iter != m_recurrentInfo.end(); iter++) + // look in all recurrent loops of the network + for (const auto & iter : m_recurrentInfo) { - if (std::find((*iter).m_recurrentNodes.begin(), (*iter).m_recurrentNodes.end(), startNode) != (*iter).m_recurrentNodes.end()) + if (std::find(iter.m_recurrentNodes.begin(), iter.m_recurrentNodes.end(), node) != iter.m_recurrentNodes.end()) { - iFound = (*iter).m_loopId; - recurrentNodes = (*iter).m_recurrentNodesForForward; - break; + // found + pRecurrentNodes = &iter.m_recurrentNodesForForward; + return iter.m_loopId; } } - - return iFound; + return -1; // not part of a recurrent loop } - int FindInRecurrentLoop(const ComputationNodeBasePtr startNode) +#if 0 + int FindInRecurrentLoops(const ComputationNodeBasePtr startNode) const { int iFound = -1; @@ -545,45 +547,12 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb return iFound; } +#endif bool IsFuncValueOlderThanInputs(const std::vector& recurrentNodes); - void EvaluateLoop(std::list& /*allNodes*/, const ComputationNodeBasePtr startNode) + void EvaluateLoop(const ComputationNodeBasePtr startNode) { - std::vector recurrentNodes; - int iLoopId = FindInRecurrentLoop(startNode, recurrentNodes); - if (iLoopId != -1 && IsFuncValueOlderThanInputs(recurrentNodes) && m_recurrentInfo[iLoopId].m_completedEvaluate == false) - { - for (auto nodeIter = recurrentNodes.begin(); nodeIter != recurrentNodes.end(); nodeIter++) - (*nodeIter)->SetFunctionAndGradientSize(m_actMiniBSize); - - int iMBSize = m_actMiniBSize / GetNumParallelSequences(); - - if (m_recurrentInfo[iLoopId].m_isForwardLoop) - { - for (int timeIndex = 0; timeIndex < iMBSize; timeIndex ++) - { - for (auto nodeIter = recurrentNodes.begin(); nodeIter != recurrentNodes.end(); nodeIter++) - { - (*nodeIter)->EvaluateThisNodeGivenInputs(timeIndex); - (*nodeIter)->UpdateEvalTimeStamp(); - } - } - } - else - { - for (int timeIndex = iMBSize-1; timeIndex >= 0; timeIndex--) - { - for (auto nodeIter = recurrentNodes.begin(); nodeIter != recurrentNodes.end(); nodeIter++) - { - (*nodeIter)->EvaluateThisNodeGivenInputs(timeIndex); - (*nodeIter)->UpdateEvalTimeStamp(); - } - } - } - - m_recurrentInfo[iLoopId].m_completedEvaluate = true; - } } bool IsTypicalCriterionNode(ComputationNodeBasePtr nodePtr); @@ -618,7 +587,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb m_recurrentInfo[i].m_completedEvaluate = false; // pass #slices and MB layout to all nodes - // TODO: in the future, these will be different on different nodes + // TODO: in the future, these will be different on different nodes; and probably should be propagated by nodes themselves, like functionValues for (auto nodeIter = allNodes.begin(); nodeIter != allNodes.end(); nodeIter++) { if ((*nodeIter)->ReqMultiSeqHandling()) @@ -628,14 +597,52 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb (*nodeIter)->VerifyNumParallelSequences(GetNumParallelSequences()); } + // traverse all nodes in the pre-determined evaluation order for (auto nodeIter = allNodes.begin(); nodeIter != allNodes.end(); nodeIter++) { - // TODO: is this the frame-by-frame evaluation? Why is there no comment here?? 
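// Note: the rewritten loop below leans on the packing convention that a minibatch
// with S = GetNumParallelSequences() sequences and m_actualMBSize columns holds
// T = m_actualMBSize / S time steps, stored time-major: frame t of sequence s sits
// in column t * S + s. A small worked example (illustration only, not patch code):

#include <cstddef>
#include <cstdio>

int main()
{
    const size_t S = 4, numCols = 80;   // 4 parallel sequences packed into 80 columns
    const size_t T = numCols / S;       // -> 20 time steps
    const size_t t = 7, s = 2;
    const size_t col = t * S + s;       // = 30; the column FrameSlice()/ColumnSlice() address
    // frame t taken across all S sequences is the column range [t*S, t*S + S)
    printf("T=%u, frame %u of sequence %u lives in column %u\n",
           (unsigned) T, (unsigned) t, (unsigned) s, (unsigned) col);
    return 0;
}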
- // evaluate all recurrence that hangs off this first - EvaluateLoop(allNodes, *nodeIter); + // --- first, evaluate all recurrence that hangs off this + + const std::vector* pRecurrentNodes; // set of nodes that participate in same loop as current node, if any + int iLoopId = FindInRecurrentLoops(*nodeIter, pRecurrentNodes); // check if this node participates in a recurrent loop + + if (iLoopId != -1 && IsFuncValueOlderThanInputs(*pRecurrentNodes) && m_recurrentInfo[iLoopId].m_completedEvaluate == false) + { + // node participates in a recurrent loop: process the loop frame by frame + for (auto nodeIter = pRecurrentNodes->begin(); nodeIter != pRecurrentNodes->end(); nodeIter++) + (*nodeIter)->SetFunctionAndGradientSize(m_actualMBSize); + + const size_t T = m_actualMBSize / GetNumParallelSequences(); + + // for every time step run through all nodes in this particular loop + if (m_recurrentInfo[iLoopId].m_isForwardLoop) + { + for (size_t timeIndex = 0; timeIndex < T; timeIndex ++) + { + for (auto nodeIter = pRecurrentNodes->begin(); nodeIter != pRecurrentNodes->end(); nodeIter++) + { + (*nodeIter)->EvaluateThisNodeGivenInputs(timeIndex); + (*nodeIter)->UpdateEvalTimeStamp(); + } + } + } + else + { + for (size_t timeIndex = T - 1; timeIndex --> 0; ) + { + for (auto nodeIter = pRecurrentNodes->begin(); nodeIter != pRecurrentNodes->end(); nodeIter++) + { + (*nodeIter)->EvaluateThisNodeGivenInputs(timeIndex); + (*nodeIter)->UpdateEvalTimeStamp(); + } + } + } + + m_recurrentInfo[iLoopId].m_completedEvaluate = true; + } + + // --- second, do the whole batch (unless it's already done) - // now do the whole batch (unless it's already done) - if ((*nodeIter)->IsFuncValueOlderThanInputs() && (FindInRecurrentLoop(*nodeIter) == -1)) + else if (iLoopId == -1 && (*nodeIter)->IsFuncValueOlderThanInputs()) { #ifdef DISPLAY_DEBUG fprintf (stderr, "Evaluate Node: %s\n",(msra::strfun::utf8 ((*nodeIter)->NodeName())).c_str()); @@ -654,9 +661,10 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // resize entire network to handle a given MB size // TODO: actually it only updates nodes in m_recurrentInfo. Why? Because without recurrence, size never changes? // TODO: Is this always called with the result of DetermineActualMBSizeFromFeatures()? Why would it ever not? + // TODO: the network should know this by itself, no? void SetActualMiniBatchSize(const size_t aSize) { - m_actMiniBSize = (int) aSize; + m_actualMBSize = (int) aSize; // assume that all nodes in recurrent loops need to be reset to aSize minibatch size, so need to reset the following for (int i = 0; i < m_recurrentInfo.size(); i++) @@ -668,7 +676,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // resize function values and gradients of everything in m_recurrentInfo for (int i = 0; i < m_recurrentInfo.size(); i++) for (auto nodeIter : m_recurrentInfo[i].m_recurrentNodes) - nodeIter->SetFunctionAndGradientSize(m_actMiniBSize); + nodeIter->SetFunctionAndGradientSize(m_actualMBSize); } // it is used this way most of the time @@ -681,7 +689,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // GetMaxMBSize - Get the maximum minibatch size that will be seen in a training run // returns the result from SetActualMiniBatchSize(). 
Note DetermineActualMBSizeFromFeatures() also exists but returns a value derived from the inputs dimensions - size_t GetMaxMBSize() { return m_actMiniBSize; } + size_t GetMaxMBSize() { return m_actualMBSize; } #if 0 // always called in this pattern: @@ -709,18 +717,18 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb void ComputeGradientLoop(std::list& /*allNodes*/, const ComputationNodeBasePtr startNode) { - std::vector recurrentNodes; - int iLoopId = FindInRecurrentLoop(startNode, recurrentNodes); + const std::vector* pRecurrentNodes; + int iLoopId = FindInRecurrentLoops(startNode, pRecurrentNodes); if (iLoopId != -1) { if (m_recurrentInfo[iLoopId].m_completedGradient == false) { - int mbSize = m_actMiniBSize / GetNumParallelSequences(); + size_t T = m_actualMBSize / GetNumParallelSequences(); if (m_recurrentInfo[iLoopId].m_isForwardLoop) { - for (int timeIndex = mbSize - 1; timeIndex >= 0; timeIndex--) + for (size_t timeIndex = T; timeIndex --> 0; ) { - for (auto nodeIter = recurrentNodes.rbegin(); nodeIter != recurrentNodes.rend(); ++nodeIter) + for (auto nodeIter = pRecurrentNodes->rbegin(); nodeIter != pRecurrentNodes->rend(); ++nodeIter) { (*nodeIter)->VerifyNumParallelSequences(GetNumParallelSequences()); // TODO: move to FrameRange object (*nodeIter)->ComputeGradientForChildren(timeIndex); @@ -729,9 +737,9 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb } else { - for (int timeIndex = 0; timeIndex < mbSize; timeIndex++) + for (size_t timeIndex = 0; timeIndex < T; timeIndex++) { - for (auto nodeIter = recurrentNodes.rbegin(); nodeIter != recurrentNodes.rend(); ++nodeIter) + for (auto nodeIter = pRecurrentNodes->rbegin(); nodeIter != pRecurrentNodes->rend(); ++nodeIter) { (*nodeIter)->VerifyNumParallelSequences(GetNumParallelSequences()); (*nodeIter)->ComputeGradientForChildren(timeIndex); @@ -758,6 +766,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb RuntimeError("ComputeGradient: The root of the Gradient computation must evaluate to R1 value."); //run forward pass first + // TODO: feels out of place; can't we stick for ForwardProp()/BackwardProp()? 
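// (Why a forward pass must happen here at all: ComputeGradientForChildren() reads
//  the FunctionValues() produced by forward evaluation of this same minibatch, so
//  Evaluate() has to have run on it before any gradient can be computed.)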
Evaluate(rootNode); // TODO: comment what the purpose of this is @@ -1257,11 +1266,11 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb for (auto nodeIter = allNodes.begin(); nodeIter != allNodes.end(); nodeIter++) { - std::vector recurrentNodes; - int iLoopId = FindInRecurrentLoop(*nodeIter, recurrentNodes); + const std::vector* pRecurrentNodes; + int iLoopId = FindInRecurrentLoops(*nodeIter, pRecurrentNodes); if (iLoopId != -1 && m_recurrentInfo[iLoopId].m_completedGradient == false) { - for (auto nodeIterInLoop = recurrentNodes.rbegin(); nodeIterInLoop != recurrentNodes.rend(); ++nodeIterInLoop) + for (auto nodeIterInLoop = pRecurrentNodes->rbegin(); nodeIterInLoop != pRecurrentNodes->rend(); ++nodeIterInLoop) AllocateGradientMatricesForChildren(*nodeIterInLoop, numParents); m_recurrentInfo[iLoopId].m_completedGradient = true; } @@ -1475,7 +1484,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb std::list& allNodes = GetGradientCalcOrder(rootNode); for (auto nodeIter = allNodes.begin(); nodeIter != allNodes.end(); nodeIter++) - (*nodeIter)->ClearGradientForChildren(m_actMiniBSize); + (*nodeIter)->ClearGradientForChildren(m_actualMBSize); //for (auto nodeIter = m_recurrentInfo.begin(); nodeIter != m_recurrentInfo.end(); nodeIter++) // (*nodeIter).m_completedGradient = false; @@ -1579,9 +1588,9 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // used for sentence boundary information passed from reader to reset RNN state // specify how the minibatch is packed for each sample MBLayoutPtr m_pMBLayout; - MBLayoutPtr m_pMBNoLayout; // this one is a dummy, passed when no layout is available/should be used + MBLayoutPtr m_pMBNoLayout; // this alternative one is passed when no layout is available/should be used - int m_actMiniBSize; + int m_actualMBSize; // current MB size in columns --note: this is not #frames, if we have multiple parallel sequences, cf. MBLayout // main node holder std::map m_nameToNodeMap; // [name] -> node; this is the main container that holds this networks' nodes diff --git a/MachineLearning/CNTKSGDLib/SGD.cpp b/MachineLearning/CNTKSGDLib/SGD.cpp index e93b01140b2b..0c1ddeaf9d52 100644 --- a/MachineLearning/CNTKSGDLib/SGD.cpp +++ b/MachineLearning/CNTKSGDLib/SGD.cpp @@ -1968,7 +1968,13 @@ template // TODO: currently only support one node regularization if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) { - refNet.SetActualMiniBatchSize(actualMBSize); +#if 1 + size_t actualMBSize2 = refNet.SetActualMiniBatchSizeFromFeatures(); + if (actualMBSize2 != actualMBSize) + LogicError("TrainOneEpoch: refNet has different MB size than main net??"); +#else + refNet.SetActualMiniBatchSize(actualMBSize); // TODO: SetActualMiniBatchSizeFromFeatures() should have the same result, no? 
+#endif *refNet.GetMBLayoutPtr() = *net.GetMBLayoutPtr(); // TODO: This is UNTESTED (before this was missing, seemingly inconsistently) refNet.VerifyActualNumParallelSequences(trainSetDataReader->GetNumParallelSequences()); From ae94278f91ab7a2860989bed7fd2f9b02c49d6bc Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 18 Sep 2015 22:36:19 -0700 Subject: [PATCH 15/44] FindInRecurrentLoops() now returns a pointer directly to the structure it found, instead of an index--way simpler; merged ComputeGradientLoop() into ComputeGradient(), no point in having a separate function --- .../ComputationNetwork.cpp | 10 +- .../ComputationNetwork.h | 172 ++++++++---------- .../ComputationNode.h | 4 +- 3 files changed, 78 insertions(+), 108 deletions(-) diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp index 8accfc5dfde5..5e73afcfe4f1 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp @@ -738,15 +738,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { std::list vTmp; std::list vRecurrentTmp; - //int prevId = -1; - vector accessed; - accessed.assign(m_recurrentInfo.size(), false); + vector accessed(m_recurrentInfo.size(), false); for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++) { - const vector* pRecurrentNodesDummy; - int iId = FindInRecurrentLoops(*nodeIter, pRecurrentNodesDummy); - if (iId >= 0) + const RecurrentInfo * recInfo = FindInRecurrentLoops(*nodeIter); + if (recInfo) { + int iId = recInfo->m_loopId; if (!accessed[iId]) { newList.insert(newList.end(), diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h index b1ea714fd516..107f6f409c77 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h @@ -516,45 +516,17 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // find if node is part of a recurrent loop; and return the loop id // If found then return a pointer to the list of nodes of this loop. // TODO: This should just return &m_recurrentInfo of the matching loop, or nullptr if no match. If needed, m_recurrentInfo knows its loop id. 
- int FindInRecurrentLoops(const ComputationNodeBasePtr node, const vector* & pRecurrentNodes) const + RecurrentInfo * FindInRecurrentLoops(const ComputationNodeBasePtr node) { // look in all recurrent loops of the network - for (const auto & iter : m_recurrentInfo) - { + for (auto & iter : m_recurrentInfo) if (std::find(iter.m_recurrentNodes.begin(), iter.m_recurrentNodes.end(), node) != iter.m_recurrentNodes.end()) - { - // found - pRecurrentNodes = &iter.m_recurrentNodesForForward; - return iter.m_loopId; - } - } - return -1; // not part of a recurrent loop + return &iter; + return nullptr; // not part of a recurrent loop } -#if 0 - int FindInRecurrentLoops(const ComputationNodeBasePtr startNode) const - { - int iFound = -1; - - for (auto iter = m_recurrentInfo.begin(); iter != m_recurrentInfo.end(); iter++) - { - if (std::find((*iter).m_recurrentNodes.begin(), (*iter).m_recurrentNodes.end(), startNode) != (*iter).m_recurrentNodes.end()) - { - iFound = (*iter).m_loopId; - break; - } - } - - return iFound; - } -#endif - bool IsFuncValueOlderThanInputs(const std::vector& recurrentNodes); - void EvaluateLoop(const ComputationNodeBasePtr startNode) - { - } - bool IsTypicalCriterionNode(ComputationNodeBasePtr nodePtr); void SetNodesReqMultiSeqHandling(); @@ -602,47 +574,47 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb { // --- first, evaluate all recurrence that hangs off this - const std::vector* pRecurrentNodes; // set of nodes that participate in same loop as current node, if any - int iLoopId = FindInRecurrentLoops(*nodeIter, pRecurrentNodes); // check if this node participates in a recurrent loop + RecurrentInfo * recInfo = FindInRecurrentLoops(*nodeIter); // check if this node participates in a recurrent loop - if (iLoopId != -1 && IsFuncValueOlderThanInputs(*pRecurrentNodes) && m_recurrentInfo[iLoopId].m_completedEvaluate == false) + if (recInfo && IsFuncValueOlderThanInputs(recInfo->m_recurrentNodesForForward) && !recInfo->m_completedEvaluate) { + const auto & recurrentNodes = recInfo->m_recurrentNodesForForward; // node participates in a recurrent loop: process the loop frame by frame - for (auto nodeIter = pRecurrentNodes->begin(); nodeIter != pRecurrentNodes->end(); nodeIter++) - (*nodeIter)->SetFunctionAndGradientSize(m_actualMBSize); + for (auto & nodeIter : recurrentNodes) + nodeIter->SetFunctionAndGradientSize(m_actualMBSize); const size_t T = m_actualMBSize / GetNumParallelSequences(); // for every time step run through all nodes in this particular loop - if (m_recurrentInfo[iLoopId].m_isForwardLoop) + if (recInfo->m_isForwardLoop) { - for (size_t timeIndex = 0; timeIndex < T; timeIndex ++) + for (size_t t = 0; t < T; t ++) { - for (auto nodeIter = pRecurrentNodes->begin(); nodeIter != pRecurrentNodes->end(); nodeIter++) + for (auto nodeIter = recurrentNodes.begin(); nodeIter != recurrentNodes.end(); nodeIter++) { - (*nodeIter)->EvaluateThisNodeGivenInputs(timeIndex); + (*nodeIter)->EvaluateThisNodeGivenInputs(t); (*nodeIter)->UpdateEvalTimeStamp(); } } } else { - for (size_t timeIndex = T - 1; timeIndex --> 0; ) + for (size_t t = T - 1; t --> 0; ) { - for (auto nodeIter = pRecurrentNodes->begin(); nodeIter != pRecurrentNodes->end(); nodeIter++) + for (auto nodeIter = recurrentNodes.begin(); nodeIter != recurrentNodes.end(); nodeIter++) { - (*nodeIter)->EvaluateThisNodeGivenInputs(timeIndex); + (*nodeIter)->EvaluateThisNodeGivenInputs(t); (*nodeIter)->UpdateEvalTimeStamp(); } } } - m_recurrentInfo[iLoopId].m_completedEvaluate = true; + 
recInfo->m_completedEvaluate = true; } // --- second, do the whole batch (unless it's already done) - else if (iLoopId == -1 && (*nodeIter)->IsFuncValueOlderThanInputs()) + else if (!recInfo && (*nodeIter)->IsFuncValueOlderThanInputs()) { #ifdef DISPLAY_DEBUG fprintf (stderr, "Evaluate Node: %s\n",(msra::strfun::utf8 ((*nodeIter)->NodeName())).c_str()); @@ -715,42 +687,6 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb LogicError("VerifyActualNumParallelSequences: mismatching MB size in MBLayout"); } - void ComputeGradientLoop(std::list& /*allNodes*/, const ComputationNodeBasePtr startNode) - { - const std::vector* pRecurrentNodes; - int iLoopId = FindInRecurrentLoops(startNode, pRecurrentNodes); - if (iLoopId != -1) - { - if (m_recurrentInfo[iLoopId].m_completedGradient == false) - { - size_t T = m_actualMBSize / GetNumParallelSequences(); - if (m_recurrentInfo[iLoopId].m_isForwardLoop) - { - for (size_t timeIndex = T; timeIndex --> 0; ) - { - for (auto nodeIter = pRecurrentNodes->rbegin(); nodeIter != pRecurrentNodes->rend(); ++nodeIter) - { - (*nodeIter)->VerifyNumParallelSequences(GetNumParallelSequences()); // TODO: move to FrameRange object - (*nodeIter)->ComputeGradientForChildren(timeIndex); - } - } - } - else - { - for (size_t timeIndex = 0; timeIndex < T; timeIndex++) - { - for (auto nodeIter = pRecurrentNodes->rbegin(); nodeIter != pRecurrentNodes->rend(); ++nodeIter) - { - (*nodeIter)->VerifyNumParallelSequences(GetNumParallelSequences()); - (*nodeIter)->ComputeGradientForChildren(timeIndex); - } - } - } - - m_recurrentInfo[iLoopId].m_completedGradient = true; - } - } - } // MAIN ENTRY POINT for evaluation followed by gradient computation (forward prop then back prop) // TODO: pass a set of nodes instead of only one @@ -786,6 +722,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb if (rootGradientInitValue != nullptr) dynamic_pointer_cast>(rootNode)->GradientValues().SetValue(*rootGradientInitValue); + // process nodes in pre-determined order for (auto nodeIter = allNodes.begin(); nodeIter != allNodes.end(); nodeIter++) { #ifdef DISPLAY_DEBUG @@ -793,7 +730,43 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb (msra::strfun::utf8 ((*nodeIter)->OperationName())).c_str(), (msra::strfun::utf8 ((*nodeIter)->NodeName())).c_str()); #endif - ComputeGradientLoop(allNodes, *nodeIter); + // --- first, perform recurrent loops if this node participates in one + + RecurrentInfo * recInfo = FindInRecurrentLoops(*nodeIter); + if (recInfo) + { + if (recInfo->m_completedGradient == false) + { + const auto & recurrentNodes = recInfo->m_recurrentNodesForForward; + size_t T = m_actualMBSize / GetNumParallelSequences(); + if (recInfo->m_isForwardLoop) + { + for (size_t t = T; t--> 0;) + { + for (auto nodeIter = recurrentNodes.rbegin(); nodeIter != recurrentNodes.rend(); ++nodeIter) + { + (*nodeIter)->VerifyNumParallelSequences(GetNumParallelSequences()); + (*nodeIter)->ComputeGradientForChildren(t); + } + } + } + else + { + for (size_t t = 0; t < T; t++) + { + for (auto nodeIter = recurrentNodes.rbegin(); nodeIter != recurrentNodes.rend(); ++nodeIter) + { + (*nodeIter)->VerifyNumParallelSequences(GetNumParallelSequences()); + (*nodeIter)->ComputeGradientForChildren(t); + } + } + } + + recInfo->m_completedGradient = true; + } + } + + // --- second, do whole-batch operation if not recurrent (*nodeIter)->ComputeGradientForChildren(); } @@ -1237,10 +1210,10 @@ class ComputationNetwork : public 
ScriptableObjects::Object, public ScriptableOb std::list& nodes = GetEvalOrder(rootNode, false); - for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++) + for (auto & nodeIter : nodes) { - (*nodeIter)->RequestEvalMatrices(m_matrixPool); - (*nodeIter)->ReleaseMatricesAfterEval(m_matrixPool); + nodeIter->RequestEvalMatrices(m_matrixPool); + nodeIter->ReleaseMatricesAfterEval(m_matrixPool); } } @@ -1251,9 +1224,9 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb std::list& nodes = GetEvalOrder(rootNode, false); - for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++) + for (auto & nodeIter : nodes) { - std::vector children = (*nodeIter)->GetChildren(); + std::vector children = nodeIter->GetChildren(); for (int i = 0; i < children.size(); i++) numParents[children[i]] ++; } @@ -1264,20 +1237,19 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb for (int i = 0; i < m_recurrentInfo.size(); i++) m_recurrentInfo[i].m_completedGradient = false; - for (auto nodeIter = allNodes.begin(); nodeIter != allNodes.end(); nodeIter++) + for (auto & nodeIter : allNodes) { - const std::vector* pRecurrentNodes; - int iLoopId = FindInRecurrentLoops(*nodeIter, pRecurrentNodes); - if (iLoopId != -1 && m_recurrentInfo[iLoopId].m_completedGradient == false) + RecurrentInfo * recInfo = FindInRecurrentLoops(nodeIter); + if (recInfo && !recInfo->m_completedGradient) { - for (auto nodeIterInLoop = pRecurrentNodes->rbegin(); nodeIterInLoop != pRecurrentNodes->rend(); ++nodeIterInLoop) - AllocateGradientMatricesForChildren(*nodeIterInLoop, numParents); - m_recurrentInfo[iLoopId].m_completedGradient = true; + for (auto nodeIterInLoop : recInfo->m_recurrentNodesForForward) + AllocateGradientMatricesForChildren(nodeIterInLoop, numParents); + recInfo->m_completedGradient = true; } else - AllocateGradientMatricesForChildren(*nodeIter, numParents); + AllocateGradientMatricesForChildren(nodeIter, numParents); - (*nodeIter)->ReleaseGradientMatrices(m_matrixPool); + nodeIter->ReleaseGradientMatrices(m_matrixPool); } } @@ -1337,9 +1309,9 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb std::list& nodes = GetEvalOrder(rootNode, false); - for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++) - if (!(*nodeIter)->UnitTest()) - return false; + for (auto & nodeIter : nodes) + if (!nodeIter->UnitTest()) + return false; fprintf(stderr, "\n\n"); diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index b873a9612345..7c95b89405fe 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -318,7 +318,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void ComputeGradientForChildren() = 0; - virtual void ComputeGradientForChildren(const size_t timeIdxInSeq) = 0; + virtual void ComputeGradientForChildren(const size_t timeIdxInSeq) = 0; // TODO: don't we need a FrameRange here, too? 
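// Note: for reference, the FrameRange record mentioned in the TODOs above roughly
// looks like the following sketch (simplified; member names assumed, not verbatim):

#include <cstddef>
#include <cstdint>

struct FrameRange
{
    size_t timeIdxInSeq;                // time step t, or SIZE_MAX to mean "all frames"
    size_t samplesInRecurrentStep;      // #parallel sequences S (slated to move into MBLayout)
    FrameRange(size_t t, size_t S) : timeIdxInSeq(t), samplesInRecurrentStep(S) { }
    FrameRange() : timeIdxInSeq(SIZE_MAX), samplesInRecurrentStep(1) { }    // entire minibatch
    size_t t() const { return timeIdxInSeq; }
    bool IsAllFrames() const { return timeIdxInSeq == SIZE_MAX; }
};
// With such a record, ComputeGradientForChildren(t) would become
// ComputeGradientForChildren(FrameRange(t, GetNumParallelSequences())).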
// TODO: some evaluation method to be abstracted, but types don't match @@ -825,7 +825,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } static void WINAPI SetToInitStateValueForResetSeg(const Matrix& sentenceBegin, - size_t nStream, ElemType initStateValue, Matrix& newprevstate) + size_t nStream, ElemType initStateValue, Matrix& newprevstate) { Matrix colSeg(sentenceBegin.GetDeviceId()); colSeg.Resize(nStream, nStream); From 35350f7f13e980bbce8b3b0924bcff4bb0b80424 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 18 Sep 2015 22:50:09 -0700 Subject: [PATCH 16/44] (#if-0'd out an unused function) --- MachineLearning/CNTKComputationNetworkLib/ComputationNode.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index 7c95b89405fe..4e5c9fa0b2e8 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -824,6 +824,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { MaskToZeroWhenLabelAndFeatureMissing(m_functionValues, timeIdxInSeq); } +#if 0 // (this function cannot be used currently since sentenceBegin is not a Matrix anymore; only affects LSTMNode which is no longer used) static void WINAPI SetToInitStateValueForResetSeg(const Matrix& sentenceBegin, size_t nStream, ElemType initStateValue, Matrix& newprevstate) { @@ -849,6 +850,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { /// add default state value if it is for reset Matrix::MultiplyAndWeightedAdd(initStateValue, ones, false, colSeg, false, 1.0, newprevstate); /// += [0 initStateValue 0 ] } +#endif /** reset to error signals to 0 for any elements without labele From 91d7d17e5d43094d1bfedcee23c4a701b6d62a16 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Sat, 19 Sep 2015 17:14:14 -0700 Subject: [PATCH 17/44] new method DataSlice() that takes care of slicing in a unified manner. Not yet used, meant to be used for all derived classes' data access --- .../ComputationNode.h | 68 ++++++++++++++----- 1 file changed, 51 insertions(+), 17 deletions(-) diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index 4e5c9fa0b2e8..5abaae7026bb 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -128,7 +128,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Normally, N is 1 or it spans the entire minibatch. virtual void EvaluateThisNode(const FrameRange &) = 0; // evaluate a node--this calls EvaluateThisNode() and MaskToZeroWhenLabelAndFeatureMissing() if needed - // TODO: name this better--which is the main entry point? + // this is the main entry point for Network; while EvaluateThisNode() is the virtual call into specific node implementation virtual void EvaluateThisNodeGivenInputs() = 0; virtual void EvaluateThisNodeGivenInputs(const size_t timeIdxInSeq) = 0; // TODO: change to FrameRange as well @@ -655,8 +655,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { // ComputationNode -- abstract base class for computation nodes parameterized by float vs. double // ======================================================================= - // TODO: number of inputs should be a template parameter! 
SIZE_MAX for those that take variable numvber - template class ComputationNode : public ComputationNodeBase //Abstract Class that cannot be instantiated { @@ -768,10 +766,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_children[i] = UpCast(inputs[i]); // (this checks the type) } - //making them virtual so that nodes that only copy values from it's children (e.g., dropout) can be efficient in evaluation - virtual const Matrix& FunctionValues() const {return m_functionValues;} - virtual Matrix& FunctionValues() { return m_functionValues;} - virtual void DumpNodeInfo(const bool /*printValues*/, File& fstream) const; // TODO: similar to DumpInfo; used by ExperimentalNetworkBuilder test implementation @@ -808,14 +802,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } - /*implement*/ void EvaluateThisNodeGivenInputs() + /*implement*/void EvaluateThisNodeGivenInputs() { - EvaluateThisNode(); + EvaluateThisNode(); // this is a call to the virtual function that implements the actual operation if (!UseCustomizedMultiSeqHandling()) MaskToZeroWhenLabelAndFeatureMissing(m_functionValues); } + // TODO: use a FrameRange arg, then unify with above + // TODO: do we even need this extra function? Should Node know about this masking business, or is that the job of Network? + // TODO: rename this to make it more clear what this function does /*implement*/void EvaluateThisNodeGivenInputs(const size_t timeIdxInSeq) // TODO: change to FrameRange as well { EvaluateThisNode(FrameRange(timeIdxInSeq, GetNumParallelSequences())); @@ -853,8 +850,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { #endif /** - reset to error signals to 0 for any elements without labele + reset to error signals to 0 for any elements without labels */ + // TODO: use a FrameRange instead of timeIdxSeq bool MaskToZeroWhenLabelAndFeatureMissing(Matrix& matrixToBeMasked, const size_t timeIdxInSeq=(size_t)-1) const { bool processedExistsNoLabelorFeatureMissing = false; /// set to true if either nolabel or feature missing is processed @@ -954,9 +952,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } - const Matrix& GradientValues() const { return m_gradientValues; } - Matrix& GradientValues() { return m_gradientValues; } - // up-cast to make life easier static ComputationNodePtr UpCast(ComputationNodeBasePtr inode) { @@ -985,14 +980,52 @@ namespace Microsoft { namespace MSR { namespace CNTK { // expand the inputs to exist up to the desired index while (childIndex >= m_children.size()) - { - m_children.push_back(NULL); - } + m_children.push_back(nullptr); // set the input value m_children[childIndex] = node; } + //making them virtual so that nodes that only copy values from it's children (e.g., dropout) can be efficient in evaluation + virtual const Matrix& FunctionValues() const { return m_functionValues; } + virtual Matrix& FunctionValues() { return m_functionValues; } + + const Matrix& GradientValues() const { return m_gradientValues; } + Matrix& GradientValues() { return m_gradientValues; } + + // function to access any input and output, value and gradient, whole batch or single frame + // Note: This returns an object, not a reference. That object is a column slice, i.e. a small object that just points into another object. + // TODO: remove FrameRange::samplesInRecurrentStep from FrameRange, as it belongs into pMBLayout. Hence this function that binds both together. + // Note: This is not used anywhere yet, only a sketch how we may further abstract timing. 
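// (Hypothetical call sites, to make the intent concrete -- none of these exist yet:
//      Matrix<ElemType> in0  = DataSlice(0,         VAL,  frameRange);   // input 0's value, frame t, all sequences
//      Matrix<ElemType> outG = DataSlice(INDEX_OUT, GRAD, frameRange);   // this node's gradient, same frame
//      Matrix<ElemType> all  = DataSlice(0,         VAL,  FrameRange()); // whole minibatch
//  Each returns a column slice, i.e. a lightweight view into the underlying matrix,
//  not a copy of the data.)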
+#define INDEX_OUT SIZE_MAX +#define SEQUENCE_ALL SIZE_MAX + enum ValueOrGradient { VAL, GRAD }; + Matrix DataSlice(size_t index/*input index or OUT*/, + ValueOrGradient valueOrGradient/*as it says*/, + FrameRange frameRange/*select frame or entire batch*/, size_t sequence = SEQUENCE_ALL/*SEQUENCE_ALL is the normal case*/) + { + ComputationNode * node = (index == INDEX_OUT) ? this : Inputs(index).get(); + Matrix & data = (valueOrGradient == VAL) ? node->FunctionValues() : node->GradientValues(); + if (frameRange.IsAllFrames()) + { + if (sequence == SEQUENCE_ALL) + return data.ColumnSlice(0, data.GetNumCols()); + else + LogicError("DataSlice: sequence index only supported when accessing individual frame"); // (not needed; doable but more involved, requiring a reshape) + } + else + { + size_t numParallelSequences = pMBLayout->GetNumParallelSequences(); + size_t startColumn = frameRange.t() * numParallelSequences; + if (sequence == SEQUENCE_ALL) + return data.ColumnSlice(startColumn, numParallelSequences); + else + return data.ColumnSlice(startColumn + sequence, 1); + } + // TODO: + } + + // this is the entry point from Network; while it will call virtual ComputeInputPartial() into the actual node implementation /*implement*/void ComputeGradientForChildren() { // batch is done only for feed-forward nodes @@ -1024,7 +1057,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { #endif } } - + + // TODO: use a FrameRange here as well, then unify with above /*implement*/void ComputeGradientForChildren(const size_t timeIdxInSeq) { for (size_t i=0; i Date: Sun, 20 Sep 2015 21:59:16 -0700 Subject: [PATCH 18/44] Fixed a bug in the code to enable device memory peer access --- Math/Math/GPUMatrix.cu | 6 +++++- Math/Math/GPUSparseMatrix.cu | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu index 228f9467778b..aaaeb36b2b85 100755 --- a/Math/Math/GPUMatrix.cu +++ b/Math/Math/GPUMatrix.cu @@ -298,7 +298,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { CUDA_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, to_id, m_computeDevice)); if (canAccessPeer) { - CUDA_CALL(cudaDeviceEnablePeerAccess(m_computeDevice, 0)); + cudaError_t cudaStatus = cudaDeviceEnablePeerAccess(m_computeDevice, 0); + if (cudaStatus != cudaErrorPeerAccessAlreadyEnabled) + { + CUDA_CALL(cudaStatus); + } CUDA_CALL(cudaMemcpyPeer(d_dst,to_id,m_pArray,m_computeDevice,sizeof(ElemType)*m_numRows*m_numCols)); } else diff --git a/Math/Math/GPUSparseMatrix.cu b/Math/Math/GPUSparseMatrix.cu index 5f3461ae0a92..c53ade5dd445 100644 --- a/Math/Math/GPUSparseMatrix.cu +++ b/Math/Math/GPUSparseMatrix.cu @@ -418,7 +418,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { CUDACALL(cudaDeviceCanAccessPeer(&canAccessPeer, to_id, m_computeDevice)); if (canAccessPeer) { - CUDACALL(cudaDeviceEnablePeerAccess(m_computeDevice, 0)); + cudaError_t cudaStatus = cudaDeviceEnablePeerAccess(m_computeDevice, 0); + if (cudaStatus != cudaErrorPeerAccessAlreadyEnabled) + { + CUDACALL(cudaStatus); + } CUDACALL(cudaMemcpyPeer(d_dst, to_id, m_pArray, m_computeDevice, m_totalBufferSizeAllocated)); } else From e8274968b84f5ed9ac43ec4df66ffea2d59ffe39 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Mon, 21 Sep 2015 10:18:31 -0700 Subject: [PATCH 19/44] changed a few "for (auto x :" to "for ([const] auto & x :" --- .../LMSequenceReader/SequenceReader.cpp | 6 +-- .../LUSequenceReader/LUSequenceReader.cpp | 2 +- DataReader/UCIFastReader/UCIFastReader.cpp | 4 +- 
MachineLearning/CNTK/ModelEditLanguage.cpp | 10 ++--- MachineLearning/CNTK/ModelEditLanguage.h | 2 +- MachineLearning/CNTK/NDLNetworkBuilder.h | 2 +- .../CNTK/NetworkDescriptionLanguage.h | 2 +- .../ComputationNetwork.cpp | 44 +++++++++---------- .../ComputationNetwork.h | 10 ++--- .../NetworkBuilderFromConfig.cpp | 2 +- 10 files changed, 42 insertions(+), 42 deletions(-) diff --git a/DataReader/LMSequenceReader/SequenceReader.cpp b/DataReader/LMSequenceReader/SequenceReader.cpp index 854705ece8cd..bf9c9fb31bc1 100644 --- a/DataReader/LMSequenceReader/SequenceReader.cpp +++ b/DataReader/LMSequenceReader/SequenceReader.cpp @@ -655,7 +655,7 @@ void SequenceReader::ReadClassInfo(const wstring & vocfile, int& class LogicError("SequenceReader::ReadClassInfo the actual number of words %d is smaller than the specified vocabulary size %d. Check if labelDim is too large. ", idx4class.size(), nwords); } std::vector counts(idx4cnt.size()); - for (auto p : idx4cnt) + for (const auto & p : idx4cnt) counts[p.first] = (double)p.second; m_noiseSampler = noiseSampler(counts); @@ -689,7 +689,7 @@ void SequenceReader::InitCache(const ConfigParameters& readerConfig) found = true; } FindConfigNames(readerConfig, "wfile", names); - for (auto name : names) + for (const auto & name : names) { ConfigParameters config = readerConfig(name); filesList.push_back(config("wfile")); @@ -714,7 +714,7 @@ void SequenceReader::InitCache(const ConfigParameters& readerConfig) // now get the section names for map and category types std::map sections; m_cachingWriter->GetSections(sections); - for (auto pair : sections) + for (const auto & pair : sections) { // TODO: we would need to add a sequenceMap type here as well // or maybe change to heirarchal name (i.e. root.labelIn.map) diff --git a/DataReader/LUSequenceReader/LUSequenceReader.cpp b/DataReader/LUSequenceReader/LUSequenceReader.cpp index e50eab054989..df9bcc801598 100644 --- a/DataReader/LUSequenceReader/LUSequenceReader.cpp +++ b/DataReader/LUSequenceReader/LUSequenceReader.cpp @@ -1272,7 +1272,7 @@ void MultiIOBatchLUSequenceReader::CopyMBLayoutTo(MBLayoutPtr pMBLayou /// run for each reader vector col; size_t rows = 0, cols = 0; - for (auto p : mReader) + for (const auto & p : mReader) { p.second->CopyMBLayoutTo(pMBLayout); if (rows == 0) diff --git a/DataReader/UCIFastReader/UCIFastReader.cpp b/DataReader/UCIFastReader/UCIFastReader.cpp index e7aeea2a8871..9f087757fe82 100644 --- a/DataReader/UCIFastReader/UCIFastReader.cpp +++ b/DataReader/UCIFastReader/UCIFastReader.cpp @@ -450,7 +450,7 @@ void UCIFastReader::InitCache(const ConfigParameters& readerConfig) found = true; } FindConfigNames(readerConfig, "wfile", names); - for (auto name : names) + for (const auto & name : names) { ConfigParameters config = readerConfig(name); filesList.push_back(config("wfile")); @@ -475,7 +475,7 @@ void UCIFastReader::InitCache(const ConfigParameters& readerConfig) // now get the section names for map and category types std::map sections; m_cachingWriter->GetSections(sections); - for (auto pair : sections) + for (const auto & pair : sections) { if (pair.second == sectionTypeCategoryLabel) { diff --git a/MachineLearning/CNTK/ModelEditLanguage.cpp b/MachineLearning/CNTK/ModelEditLanguage.cpp index a31c4f386574..6c700133ebf1 100644 --- a/MachineLearning/CNTK/ModelEditLanguage.cpp +++ b/MachineLearning/CNTK/ModelEditLanguage.cpp @@ -345,7 +345,7 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa // process outstanding NDL scripts ensuring that the inputs have all 
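// Note on the "for (auto x :)" -> "for ([const] auto & x :)" change applied throughout:
// binding the loop variable by value copies every element (one string/shared_ptr copy
// per iteration), and mutations affect only the copy. A minimal demonstration
// (illustration only, not part of the patch):

#include <cstdio>
#include <string>
#include <vector>

int main()
{
    std::vector<std::string> names { "W0", "b0", "W1" };

    for (auto name : names)             // by value: 'name' is a fresh copy each iteration
        name += "_v2";                  // ...so this mutates only the copy
    printf("%s\n", names[0].c_str());   // still prints "W0"

    for (auto & name : names)           // by reference: no copies, mutations stick
        name += "_v2";
    printf("%s\n", names[0].c_str());   // prints "W0_v2"

    for (const auto & name : names)     // read-only traversal: no copies, no mutation
        printf("%s ", name.c_str());
    printf("\n");
    return 0;
}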
diff --git a/MachineLearning/CNTK/ModelEditLanguage.cpp b/MachineLearning/CNTK/ModelEditLanguage.cpp
index a31c4f386574..6c700133ebf1 100644
--- a/MachineLearning/CNTK/ModelEditLanguage.cpp
+++ b/MachineLearning/CNTK/ModelEditLanguage.cpp
@@ -345,7 +345,7 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa
    // process outstanding NDL scripts ensuring that the inputs have all been resolved
    ProcessNDLScript(netNdlFrom, ndlPassResolve);
-   for (auto node : nodeTo)
+   for (auto & node : nodeTo)
    {
        node->SetInput(inputNum, nodeFrom[0]);
    }
@@ -442,7 +442,7 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa
    ProcessNDLScript(netNdl, ndlPassInitial, false);
    ComputationNetwork* cn = netNdl->cn;
-   for (auto node : nodes)
+   for (auto & node : nodes)
    {
        switch(prop)
        {
@@ -524,7 +524,7 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa
    // make sure all NDL links have been resolved
    ProcessNDLScript(netNdl, ndlPassResolve);
-   for (auto node : nodes)
+   for (auto & node : nodes)
    {
        switch(prop)
        {
@@ -562,7 +562,7 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa
    if (nodes.size() < 1)
        RuntimeError("Delete must have at least one target, %s doesn't represent any items", params[i].c_str());
-   for (auto node : nodes)
+   for (const auto & node : nodes)
    {
        netNdl->cn->DeleteNode(node->NodeName());
    }
@@ -603,7 +603,7 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa
    NetNdl<ElemType>* netNdl;
    vector<ComputationNodeBasePtr> nodes = FindSymbols(params[0], netNdl);
-   for (auto pNodes : nodes)
+   for (auto & pNodes : nodes)
    {
        if (pNodes->OperationName() != LearnableParameter::TypeName())
        {
diff --git a/MachineLearning/CNTK/ModelEditLanguage.h b/MachineLearning/CNTK/ModelEditLanguage.h
index cda901fb57cb..85ee0297b479 100644
--- a/MachineLearning/CNTK/ModelEditLanguage.h
+++ b/MachineLearning/CNTK/ModelEditLanguage.h
@@ -263,7 +263,7 @@ class MELScript: public ConfigParser
    // this is the *.W = L2.W case
    // We want to find all the existing destination matches and then assign the in node to all of them
-   for (auto node : nodesOut)
+   for (const auto & node : nodesOut)
    {
        std::wstring nodeOutName = node->NodeName();
        GenNameValue value(nodeIn, nodeOutName);
diff --git a/MachineLearning/CNTK/NDLNetworkBuilder.h b/MachineLearning/CNTK/NDLNetworkBuilder.h
index 3a9b2a669dc3..da741c6b8bac 100644
--- a/MachineLearning/CNTK/NDLNetworkBuilder.h
+++ b/MachineLearning/CNTK/NDLNetworkBuilder.h
@@ -138,7 +138,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    // "load" parameter are in fact loaded (if they were all processed at once, the last file's "load"
    // parameter would override all the earlier ones, and those sections wouldn't get loaded).
    std::vector<std::string> filePathVec = msra::strfun::split(ndlMacrosPaths, "+");
-   for (auto filePath : filePathVec)
+   for (const auto & filePath : filePathVec)
    {
        ndlScript.LoadConfigFileAndResolveVariables(msra::strfun::utf16(filePath), config);
    }
diff --git a/MachineLearning/CNTK/NetworkDescriptionLanguage.h b/MachineLearning/CNTK/NetworkDescriptionLanguage.h
index 0894d7efa140..4b92b399f010 100644
--- a/MachineLearning/CNTK/NetworkDescriptionLanguage.h
+++ b/MachineLearning/CNTK/NetworkDescriptionLanguage.h
@@ -577,7 +577,7 @@ class NDLScript: public ConfigParser
    {
        vector<NDLNode<ElemType>*> result;
        std::string empty;
-       for (auto symbol : m_symbols)
+       for (auto & symbol : m_symbols)
        {
            NDLNode<ElemType>* node = symbol.second;
            std::string value = node->GetOptionalParameter(optParamName, empty);
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp
index 5e73afcfe4f1..cbcb2ed1ee95 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp
@@ -326,9 +326,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        return false;
    }

+   // TODO: comment on who owns this flag. Is it entirely owned by Network?
+   // Or should the 4 node types below know?
    void ComputationNetwork::SetNodesReqMultiSeqHandling()
    {
-       for (auto node : m_nodesReqMultiSeqHandling)
+       for (auto & node : m_nodesReqMultiSeqHandling)
        {
            //SumElements node will generate a scalar value and so it should never require special handling
            //TransposeNode will change the size of columns and so it should also not be included for special handling
        //if a typical criterion node is used as the training criterion node we assume it requires multiseq handling
        //this is for backward compatibility
-       for (auto node : m_finalCriteria)
+       for (auto & node : m_finalCriteria)
            if (IsTypicalCriterionNode(node))
                node->SetReqMultiSeqHandlingTo(true);

-       for (auto node : m_evalNodes)
+       for (auto & node : m_evalNodes)
            if (IsTypicalCriterionNode(node))
                node->SetReqMultiSeqHandlingTo(true);
    }
@@ -408,8 +410,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    void ComputationNetwork::ClearCalcOrderCaches()
    {
-       for (auto it : m_cacheEvalOrders)
-           for (auto iter2 : m_cacheEvalOrders[it.first])
+       for (auto & it : m_cacheEvalOrders)
+           for (auto & iter2 : m_cacheEvalOrders[it.first])
                iter2->ClearCache();
        m_cacheEvalOrders.clear();
        m_cacheGradientCalcOrders.clear();
@@ -681,7 +683,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        DetermineLoopTypes();

-       for (auto iter : nodes)
+       for (auto & iter : nodes)
            iter->ClearCache();
    }
@@ -1097,7 +1099,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        wstring str = style;

-       for (auto x : specialNodes)
+       for (const auto & x : specialNodes)
            str = str + msra::strfun::wstrprintf(L"\"%ls\" ", x->GetName().c_str());
        return str + L"; \n";
    }
@@ -1112,7 +1114,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        // get precompute node
        std::vector<ComputationNodeBasePtr> PreComputedNodes;
        std::vector<ComputationNodeBasePtr> allnodes = GetAllNodes();
-       for (auto n : allnodes)
+       for (const auto & n : allnodes)
        {
            if (n->RequiresPreCompute())
                PreComputedNodes.push_back(n);
@@ -1120,7 +1122,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        // get PastValue node
        std::vector<ComputationNodeBasePtr> pastValueNodes;
-       for (auto n : allnodes)
+       for (const auto & n : allnodes)
        {
            if (n->OperationName() == OperationNameOf(PastValueNode) || n->OperationName() == L"Delay")
                pastValueNodes.push_back(n);
@@ -1128,14 +1130,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        // get FutureValue node
        std::vector<ComputationNodeBasePtr> futureValueNodes;
-       for (auto n : allnodes)
+       for (const auto & n : allnodes)
        {
            if (n->OperationName() == OperationNameOf(FutureValueNode))
                futureValueNodes.push_back(n);
        }
        // get learnableParameters
        std::vector<ComputationNodeBasePtr> learnableParameters;
-       for (auto n : allnodes)
+       for (const auto & n : allnodes)
        {
            if (n->OperationName() == OperationNameOf(LearnableParameter))
                learnableParameters.push_back(n);
@@ -1173,7 +1175,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        //////////////////////////////////////////////////////////////////////////
        fstream << L"\n// add labels and operation name\n";
        wstring line;
-       for (auto x : allnodes)
+       for (const auto & x : allnodes)
        {
            line.clear();
            size_t nrows = x->GetNumRows();
@@ -1191,25 +1193,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        fstream << L"subgraph {\n";
        fstream << L"\t\t rank=source ; ";
        line.clear();
-       for (auto x : m_features)
-       {
+       for (const auto & x : m_features)
            line = line + msra::strfun::wstrprintf(L"\"%ls\" ", x->GetName().c_str());
-       }
        fstream << line << L"\n}\n";

        // subgraph
eval/output/criteria fstream << L"subgraph {\n"; fstream << L"\t\t rank=sink ; "; line.clear(); - for (auto x : m_finalCriteria) + for (const auto & x : m_finalCriteria) line = line + msra::strfun::wstrprintf(L"\"%ls\" ", x->GetName().c_str()); - for (auto x : m_nodesReqMultiSeqHandling) + for (const auto & x : m_nodesReqMultiSeqHandling) line = line + msra::strfun::wstrprintf(L"\"%ls\" ", x->GetName().c_str()); - for (auto x : m_outputNodes) + for (const auto & x : m_outputNodes) line = line + msra::strfun::wstrprintf(L"\"%ls\" ", x->GetName().c_str()); - for (auto x : m_pairNodes) + for (const auto & x : m_pairNodes) line = line + msra::strfun::wstrprintf(L"\"%ls\" ", x->GetName().c_str()); - for (auto x : m_evalNodes) + for (const auto & x : m_evalNodes) line = line + msra::strfun::wstrprintf(L"\"%ls\" ", x->GetName().c_str()); fstream << line << L"\n}\n"; @@ -1294,7 +1294,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { vector, float>> nodeGroups; wregex NameFilter; - for (auto e : SVDConfig) + for (const auto & e : SVDConfig) { wstring regexStr = e.first; float keepRatio = e.second; @@ -1336,7 +1336,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { fprintf(stderr, "--------------------------------------------------------------------------------------------\n"); - for (auto name : group.first) + for (const auto & name : group.first) { if (m_nameToNodeMap.find(name) == m_nameToNodeMap.end()) { diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h index 107f6f409c77..afe489328c04 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h @@ -647,7 +647,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // resize function values and gradients of everything in m_recurrentInfo for (int i = 0; i < m_recurrentInfo.size(); i++) - for (auto nodeIter : m_recurrentInfo[i].m_recurrentNodes) + for (auto & nodeIter : m_recurrentInfo[i].m_recurrentNodes) nodeIter->SetFunctionAndGradientSize(m_actualMBSize); } @@ -1242,7 +1242,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb RecurrentInfo * recInfo = FindInRecurrentLoops(nodeIter); if (recInfo && !recInfo->m_completedGradient) { - for (auto nodeIterInLoop : recInfo->m_recurrentNodesForForward) + for (auto & nodeIterInLoop : recInfo->m_recurrentNodesForForward) AllocateGradientMatricesForChildren(nodeIterInLoop, numParents); recInfo->m_completedGradient = true; } @@ -1273,7 +1273,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // first give criteria nodes as root node if (FinalCriterionNodes().size() > 0) { - for (auto node : FinalCriterionNodes()) + for (auto & node : FinalCriterionNodes()) { if (!allowFragment) FormRecurrentLoops(node); @@ -1287,7 +1287,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // now output nodes if (OutputNodes().size() > 0) { - for (auto node : OutputNodes()) + for (auto & node : OutputNodes()) if (!UnitTest(node)) vErrors.push_back(node->NodeName().c_str()); } @@ -1296,7 +1296,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // now evaluation nodes if (EvaluationNodes().size() > 0) { - for (auto node : EvaluationNodes()) + for (auto & node : EvaluationNodes()) if (!UnitTest(node)) vErrors.push_back(node->NodeName().c_str()); } diff --git 
a/MachineLearning/CNTKComputationNetworkLib/NetworkBuilderFromConfig.cpp b/MachineLearning/CNTKComputationNetworkLib/NetworkBuilderFromConfig.cpp
index ad745cf56893..d2d04b7ad0d7 100644
--- a/MachineLearning/CNTKComputationNetworkLib/NetworkBuilderFromConfig.cpp
+++ b/MachineLearning/CNTKComputationNetworkLib/NetworkBuilderFromConfig.cpp
@@ -687,7 +687,7 @@ namespace Microsoft { namespace MSR { namespace ScriptableObjects {
            // traverse children: append them to the end of the work list
            let children = node->GetChildren();
-           for (auto child : children)
+           for (auto & child : children)
                workList.push_back(child);  // (we could check whether c is in 'nodes' already here to optimize, but this way it is cleaner)
        }

From 4a07903dd1b98b1939657e5fb1e5f80373fca7ac Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Mon, 21 Sep 2015 10:38:55 -0700
Subject: [PATCH 20/44] first use of DataSlice()

---
 .../CNTKComputationNetworkLib/ComputationNode.h | 15 +++++++++------
 .../CNTKComputationNetworkLib/RecurrentNodes.h | 8 ++++----
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
index 5abaae7026bb..c2cf5e17552c 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
@@ -997,15 +997,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        // Note: This returns an object, not a reference. That object is a column slice, i.e. a small object that just points into another object.
        // TODO: remove FrameRange::samplesInRecurrentStep from FrameRange, as it belongs into pMBLayout. Hence this function that binds both together.
        // Note: This is not used anywhere yet, only a sketch how we may further abstract timing.
-#define INDEX_OUT SIZE_MAX
+       // TODO: move sequence into FrameRange object
+       enum Index : size_t { OUTPUT = SIZE_MAX };
#define SEQUENCE_ALL SIZE_MAX
-       enum ValueOrGradient { VAL, GRAD };
+       enum ValueOrGradient { VALUE, GRADIENT };
        Matrix<ElemType> DataSlice(size_t index/*input index or OUTPUT*/, ValueOrGradient valueOrGradient/*as it says*/,
-                                  FrameRange frameRange/*select frame or entire batch*/, size_t sequence = SEQUENCE_ALL/*SEQUENCE_ALL is the normal case*/)
+                                  const FrameRange & frameRange/*select frame or entire batch*/, size_t sequence = SEQUENCE_ALL/*SEQUENCE_ALL is the normal case*/)
        {
-           ComputationNode * node = (index == INDEX_OUT) ? this : Inputs(index).get();
-           Matrix<ElemType> & data = (valueOrGradient == VAL) ? node->FunctionValues() : node->GradientValues();
+           ComputationNode * node = (index == OUTPUT) ? this : Inputs(index).get();
+           Matrix<ElemType> & data = (valueOrGradient == VALUE) ? node->FunctionValues() : node->GradientValues();
            if (frameRange.IsAllFrames())
            {
                if (sequence == SEQUENCE_ALL)
                    return data;
@@ -1015,7 +1016,9 @@
            }
            else
            {
-               size_t numParallelSequences = pMBLayout->GetNumParallelSequences();
+               size_t numParallelSequences = m_pMBLayout->GetNumParallelSequences();
+               if (numParallelSequences != frameRange.samplesInRecurrentStep)
+                   LogicError("DataSlice: inconsistent samplesInRecurrentStep");  // TODO: this will go away when we remove this member from FrameRange
                size_t startColumn = frameRange.t() * numParallelSequences;
                if (sequence == SEQUENCE_ALL)
                    return data.ColumnSlice(startColumn, numParallelSequences);
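For orientation, a sketch of what a call site gains from this accessor (hypothetical node code, not part of the patch; only the one line in RecurrentNodes.h below actually uses it yet): a single DataSlice() call replaces the node-selection, value-vs-gradient, and frame-selection plumbing that FrameSlice() call sites currently spell out by hand.

    // sketch, assuming this sits inside a ComputationNode<ElemType> member:
    virtual void EvaluateThisNode(const FrameRange & frameRange)
    {
        Matrix<ElemType> sliceInput  = DataSlice(0, VALUE, frameRange);       // input 0, selected frame(s)
        Matrix<ElemType> sliceOutput = DataSlice(OUTPUT, VALUE, frameRange);  // this node's own output
        sliceOutput.SetValue(sliceInput);  // e.g. a hypothetical pass-through node
    }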
diff --git a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h
index aaba256771d2..d0a73d2bac4e 100644
--- a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h
@@ -206,9 +206,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        // this one differs in the starting condition
        virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) = 0;

-       static void WINAPI EvaluateThisNodeSRP(const FrameRange & frameRange, const int timeStep,
-                                              Matrix<ElemType>& functionValues, const Matrix<ElemType>& delayedActivation, const Matrix<ElemType>& inputFunctionValues,
-                                              const ElemType & initStateValue, const Matrix<float> & colBoundaryFlags, const MinibatchPackingFlags minibatchPackingFlags)
+       void EvaluateThisNodeSRP(const FrameRange & frameRange, const int timeStep,
+                                Matrix<ElemType>& functionValues, const Matrix<ElemType>& delayedActivation, const Matrix<ElemType>& inputFunctionValues,
+                                const ElemType & initStateValue, const Matrix<float> & colBoundaryFlags, const MinibatchPackingFlags minibatchPackingFlags)
        {
            size_t timeIdxInSeq = frameRange.t();
            size_t mNbr = frameRange.NumCols();
@@ -225,7 +225,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                d = (int)functionValues.Mod((float)delayedIndex, (float)delayedActivation.GetNumCols());
            // this can point to the past activity of the previous minibatch
-           Matrix<ElemType> out = functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq * mNbr, mNbr);
+           Matrix<ElemType> out = DataSlice(OUTPUT, VALUE, frameRange);
            Matrix<ElemType> inp((DEVICEID_TYPE)functionValues.GetDeviceId());

            if (minibatchPackingFlags & SequenceStart_or_End)

From 8307e74824e0ecf1c268bc8393338dca627cf19a Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Mon, 21 Sep 2015 10:58:44 -0700
Subject: [PATCH 21/44] new method FrameSlice() that takes a pMBLayout, in prep
 of getting samplesInRecurrentStep out from FrameRange, in prep of allowing
 inconsistent layouts across the graph; moved FrameSlice() from .h to .cpp

---
 Math/Math/Matrix.cpp | 23 ++++++++++
 Math/Math/Matrix.h | 101 +++++++++++++++++++++----------------------
 2 files changed, 73 insertions(+), 51 deletions(-)
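Side by side, the transitional and the target call shape that this patch creates (condensed from the call sites changed below; both forms appear verbatim later in the series):

    // transitional overload: start column and width are passed in redundantly and verified
    Matrix<ElemType> v1 = m_functionValues.FrameSlice(frameRange, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
    // target overload: the MBLayout supplies the number of parallel sequences
    Matrix<ElemType> v2 = m_functionValues.FrameSlice(frameRange, m_pMBLayout);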
diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp
index e122905be16f..e65ed384a7a0 100644
--- a/Math/Math/Matrix.cpp
+++ b/Math/Math/Matrix.cpp
@@ -772,6 +772,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        return slice;
    }

+   // special convenience function to apply ColumnSlice() to getting a frame range
+   // It assumes that columns are frames, and returns a sub-range.
+   // TODO: decide whether this belongs here or elsewhere
+   // TODO: remove this one, as it does not take #slices explicitly, which will be needed in the future
+   template<class ElemType>
+   Matrix<ElemType> Matrix<ElemType>::FrameSlice(const FrameRange & frameRange
+       // TODO: temporary only until this has been tested to work:
+       , size_t expectedStartColumn, size_t expectedNumCols
+       ) const
+   {
+       if (frameRange.IsAllFrames()) return ColumnSlice(0, GetNumCols());  // TODO: can we just return a reference to ourselves? --ownership problem
+       // TODO: temporary only until this has been tested to work:
+       if (expectedStartColumn != frameRange.StartColumn() || expectedNumCols != frameRange.NumCols())
+           LogicError("FrameSlice: FrameRange object gives different range than original explicit code. Logic is borked.");
+       return ColumnSlice(frameRange.StartColumn(), frameRange.NumCols());
+   }
+   template<class ElemType>
+   Matrix<ElemType> Matrix<ElemType>::FrameSlice(const FrameRange & frameRange, const shared_ptr<MBLayout> & pMBLayout) const
+   {
+       if (frameRange.IsAllFrames()) return ColumnSlice(0, GetNumCols());  // TODO: can we just return a reference to ourselves? --ownership problem
+       return ColumnSlice(frameRange.StartColumn(pMBLayout), frameRange.NumCols(pMBLayout));
+   }
+
    template<class ElemType>
    Matrix<ElemType>& Matrix<ElemType>::AssignColumnSlice(const Matrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols)
    {
diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h
index 34eac851b7f8..db99e0875956 100644
--- a/Math/Math/Matrix.h
+++ b/Math/Math/Matrix.h
@@ -25,44 +25,6 @@
// This class is exported from the Math.dll
namespace Microsoft { namespace MSR { namespace CNTK {
-   // there is a version down there of ColumnSlice() that abstracts the number of streams
-   // TODO: This may not belong here, but having it in ComputeNode would require syntax changes, while having it as a member here only requires a local find-replace. Let's make it work first, then decide how to refactor.
-   // the looping versions of EvaluateThisNode() and ComputeInputPartial() take a frame range, through this structure
-   // It can cast from a size_t, i.e. those functions can be called passing a size_t in place of the FrameRange.
-   // TODO: GetNumParallelSequences() should be subsumed here & removed from nodes
-   // TODO: Where this design currently breaks:
-   //  - BatchModeNodes must access GetNumParallelSequences(), yet operate on the whole sequence
-   //  - likewise, LSTMNode does its own iteration, hence needs access to GetNumParallelSequences() or NumCols() in the whole-batch iterator
-   //  - RecurrentNodes access frames with a time shift, where out-of-bounds ones access a different matrix' values
-   //  - RecurrentNodes iterate over individual slices--need a sub-setting constructor from a FrameRange to another?
-   //  - RecurrentNodes access boundary info with a similar pattern, but boundary info has a different #streams (namely, 1)
-   // TODO: Turns out, a FrameRange is either a whole batch or a single frame.
- struct FrameRange - { - const size_t timeIdxInSeq; // start frame - const size_t samplesInRecurrentStep; // number of samples in this step --BUGBUG: this should be part of MBLayout, not FrameRange - // can construct from a single size_t -> a single-frame range - //FrameRange(size_t timeIdxInSeq) : timeIdxInSeq(timeIdxInSeq), samplesInRecurrentStep(0)/*FIX THIS*/{} - FrameRange(size_t timeIdxInSeq, size_t samplesInRecurrentStep) : timeIdxInSeq(timeIdxInSeq), samplesInRecurrentStep(samplesInRecurrentStep){} - // or without arguments -> entire minibatch / no frame-range - FrameRange() : timeIdxInSeq(0), samplesInRecurrentStep(SIZE_MAX) {} - // code that can only handle single-frame ranges will call t() to get the time index, which will throw if numFrames != 1 - // Some functions need just the time index, e.g. for looking up stuff in m_boundaryInfo. That's where an unscaled index is needed (as opposed to startColumn()). - size_t t() const { EnsureNotAllFrames(); return timeIdxInSeq; } - // multi-frame slice case: these two get startFrame and numFrames - size_t StartColumn() const { EnsureNotAllFrames(); return timeIdxInSeq * samplesInRecurrentStep; } - size_t NumCols() const { EnsureNotAllFrames(); return samplesInRecurrentStep; } - bool IsAllFrames() const { return samplesInRecurrentStep == SIZE_MAX; } // if true then above functions may not be called; caller must use entire batch instead - private: - FrameRange(const FrameRange & other);// : timeIdxInSeq(other.timeIdxInSeq), numFrames(other.numFrames) { } - void operator=(const FrameRange &); - void EnsureNotAllFrames() const - { - if (IsAllFrames()) - LogicError("FrameRange::t() called when frame range refers to whole minibatch"); - } - }; - enum CurrentDataLocation { NONE, CPU, GPU, BOTH @@ -184,19 +146,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix ColumnSlice(size_t startColumn, size_t numCols) const; // special convenience function to apply ColumnSlice() to getting a frame range - // It assumes that columns are frames, and returns a sub-range. - // TODO: decide whether this belongs here or elsewhere - Matrix FrameSlice(const FrameRange & frameRange - // TODO: temporary only until this has been tested to work: - , size_t expectedStartColumn, size_t expectedNumCols - ) const - { - if (frameRange.IsAllFrames()) return ColumnSlice(0, GetNumCols()); // TODO: can we just return a reference to ourselves? --ownership problem - // TODO: temporary only until this has been tested to work: - if (expectedStartColumn != frameRange.StartColumn() || expectedNumCols != frameRange.NumCols()) - LogicError("FrameSlice: FrameRange object gives different range than original explicit code. Logic is borked."); - return ColumnSlice(frameRange.StartColumn(), frameRange.NumCols()); - } + Matrix FrameSlice(const struct FrameRange & frameRange, size_t expectedStartColumn, size_t expectedNumCols) const; + Matrix FrameSlice(const struct FrameRange & frameRange, const shared_ptr & pMBLayout) const; // difference between AssignColumnSlice and SetColumnSlice // AssignColumnSlice : this(:, startColumn:startColumn+numCols-1) = fromMatrix(:, startColumn: startColumn+numCols-1) @@ -632,4 +583,52 @@ namespace Microsoft { namespace MSR { namespace CNTK { }; typedef std::shared_ptr MBLayoutPtr; + // there is a version down there of ColumnSlice() that abstracts the number of streams + // TODO: This may not belong here, but having it in ComputeNode would require syntax changes, while having it as a member here only requires a local find-replace. 
Let's make it work first, then decide how to refactor.
+   // the looping versions of EvaluateThisNode() and ComputeInputPartial() take a frame range, through this structure
+   // It can cast from a size_t, i.e. those functions can be called passing a size_t in place of the FrameRange.
+   // TODO: GetNumParallelSequences() should be subsumed here & removed from nodes
+   // TODO: We should also have a FrameRange that selects a single sequence instead of all.
+   // TODO: Where this design currently breaks:
+   //  - BatchModeNodes must access GetNumParallelSequences(), yet operate on the whole sequence
+   //  - likewise, LSTMNode does its own iteration, hence needs access to GetNumParallelSequences() or NumCols() in the whole-batch iterator
+   //  - RecurrentNodes access frames with a time shift, where out-of-bounds ones access a different matrix's values
+   //  - RecurrentNodes iterate over individual slices--need a sub-setting constructor from a FrameRange to another?
+   //  - RecurrentNodes access boundary info with a similar pattern, but boundary info has a different #streams (namely, 1)
+   // TODO: This will in the future be able to hold sub-ranges for nested loops as well.
+   struct FrameRange
+   {
+       const size_t timeIdxInSeq;            // start frame
+       const size_t samplesInRecurrentStep;  // number of samples in this step --BUGBUG: this should be part of MBLayout, not FrameRange
+       // can construct from a single size_t -> a single-frame range
+       //FrameRange(size_t timeIdxInSeq) : timeIdxInSeq(timeIdxInSeq), samplesInRecurrentStep(0)/*FIX THIS*/{}
+       FrameRange(size_t timeIdxInSeq, size_t samplesInRecurrentStep) : timeIdxInSeq(timeIdxInSeq), samplesInRecurrentStep(samplesInRecurrentStep){}
+       // or without arguments -> entire minibatch / no frame-range
+       FrameRange() : timeIdxInSeq(0), samplesInRecurrentStep(SIZE_MAX/*all frames (map)*/) {}
+       // code that can only handle single-frame ranges will call t() to get the time index, which will throw if numFrames != 1
+       // Some functions need just the time index, e.g. for looking up stuff in m_boundaryInfo. That's where an unscaled index is needed (as opposed to startColumn()).
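+       // (worked example, not in the original: with 4 parallel sequences, FrameRange(3, 4) yields
+       // StartColumn() == 3 * 4 == 12 and NumCols() == 4, so a frame slice covers columns [12,16) of the minibatch matrix)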
+       size_t t() const { EnsureNotAllFrames(); return timeIdxInSeq; }
+       // multi-frame slice case: these two get startFrame and numFrames
+       size_t StartColumn() const { EnsureNotAllFrames(); return timeIdxInSeq * samplesInRecurrentStep; }
+       size_t NumCols() const { EnsureNotAllFrames(); return samplesInRecurrentStep; }
+       // TODO: remove these ^^ two in favor of these vv
+       size_t StartColumn(const shared_ptr<MBLayout> & pMBLayout) const { EnsureNotAllFrames(); VerifyMBLayout(pMBLayout); return timeIdxInSeq * pMBLayout->GetNumParallelSequences(); }
+       size_t NumCols(const shared_ptr<MBLayout> & pMBLayout) const { EnsureNotAllFrames(); VerifyMBLayout(pMBLayout); return pMBLayout->GetNumParallelSequences(); }
+       bool IsAllFrames() const { return samplesInRecurrentStep == SIZE_MAX; }  // if true then above functions may not be called; caller must use entire batch instead
+   private:
+       FrameRange(const FrameRange & other);// : timeIdxInSeq(other.timeIdxInSeq), numFrames(other.numFrames) { }
+       void operator=(const FrameRange &);
+       void EnsureNotAllFrames() const
+       {
+           if (IsAllFrames())
+               LogicError("FrameRange::t() called when frame range refers to whole minibatch");
+       }
+       // TODO: this will go away once we remove samplesInRecurrentStep from this class
+       void VerifyMBLayout(const shared_ptr<MBLayout> & pMBLayout) const
+       {
+           if (pMBLayout->GetNumParallelSequences() != samplesInRecurrentStep)
+               LogicError("VerifyMBLayout: MBLayout inconsistent with local copy of samplesInRecurrentStep");
+       }
+   };

}}}

From 218ab2ac3bc7d246a07670a7354fe04740370e81 Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Mon, 21 Sep 2015 11:42:02 -0700
Subject: [PATCH 22/44] changed all FrameSlice() calls to pass pMBLayout, in
 prep of changing all of these to DataSlice(); removed the original version of
 FrameSlice()

---
 .../CompositeComputationNodes.h | 14 +-
 .../ComputationNode.h | 2 +-
 .../ConvolutionalNodes.h | 22 +-
 .../InputAndParamNodes.h | 20 +-
 .../LinearAlgebraNodes.h | 216 +++++++++---------
 .../NonlinearityNodes.h | 122 +++++-----
 .../RecurrentNodes.h | 36 +--
 .../TrainingCriterionNodes.h | 16 +-
 Math/Math/Matrix.cpp | 2 +
 Math/Math/Matrix.h | 16 +-
 10 files changed, 242 insertions(+), 224 deletions(-)

diff --git a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h
index 251e24a388ca..0ef020705100 100644
--- a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h
@@ -539,8 +539,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
    {
        //only feature (input0) and output need to be sliced
-       Matrix<ElemType> sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
-       Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+       Matrix<ElemType> sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+       Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
        EvaluateThisNodeS(sliceOutputValue, sliceInput0Value,
Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues()); } @@ -690,8 +690,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { //only feature (input0) and output needs to be sliced - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues()); } @@ -840,9 +840,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { //FunctionValues().Resize(m_memory.GetNumRows(), GetNumParallelSequences()); FunctionValues().Resize(m_memory.GetNumRows(), frameRange.NumCols()); // extra space for one time step if (frameRange.t() == 0) // for first frame, check that we got all in memory --TODO: is this comment correct? How about going backwards? - assert(FunctionValues().FrameSlice(FrameRange(0, GetNumParallelSequences())/*TODO: delete the next two parameters*/, 0, GetNumParallelSequences()).FrobeniusNorm() == m_memory.FrameSlice(FrameRange(0, GetNumParallelSequences())/*TODO: delete the next two parameters*/, 0, GetNumParallelSequences()).FrobeniusNorm()); - //assert(FunctionValues().ColumnSlice(0, GetNumParallelSequences()).FrobeniusNorm() == m_memory.ColumnSlice(0, GetNumParallelSequences()).FrobeniusNorm()); - FunctionValues().SetValue(m_memory.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences())); + assert(FunctionValues().FrameSlice(FrameRange(0, GetNumParallelSequences()), m_pMBLayout).FrobeniusNorm() == m_memory.FrameSlice(FrameRange(0, GetNumParallelSequences()), m_pMBLayout).FrobeniusNorm()); + //assert(FunctionValues().ColumnSlice(0, GetNumParallelSequences()), m_pMBLayout).FrobeniusNorm() == m_memory.ColumnSlice(0, GetNumParallelSequences()), m_pMBLayout).FrobeniusNorm()); + FunctionValues().SetValue(m_memory.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout)); assert(FunctionValues().GetNumCols() == GetNumParallelSequences()); } diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index c2cf5e17552c..35427db341a4 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -215,7 +215,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } // This is used at 284 places inside nodes, most of the time as - // FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()) + // FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout) size_t 
GetNumParallelSequences() const
    {
        //return m_samplesInRecurrentStep;
diff --git a/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h b/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h
index 8d591bfe9d94..a497972dbc28 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h
@@ -111,14 +111,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        if (inputIndex > 1)
            InvalidArgument("Convolution operation only takes two inputs.");

-       Matrix<ElemType> sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
-       Matrix<ElemType> sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+       Matrix<ElemType> sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+       Matrix<ElemType> sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);

        if (inputIndex == 0)  //derivative with regard to the weight matrix
            ComputeInputPartialOverWeight(sliceOutputGrad, Inputs(0)->GradientValues(), Inputs(0)->FunctionValues(), sliceInput1Value, m_tempMatrix, !frameRange.IsAllFrames());
        else  // derivative with regard to the input feature
        {
-           Matrix<ElemType> sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+           Matrix<ElemType> sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
            ComputeInputPartialOverInputFeature(sliceOutputGrad, sliceInput1Grad, Inputs(0)->FunctionValues(), sliceInput1Value, m_tempMatrix);
        }
    }
@@ -215,8 +215,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
    {
-       Matrix<ElemType> sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
-       Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+       Matrix<ElemType> sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+       Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
        EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value, m_tempMatrix);
    }
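The FrameRange::Check() helper these call sites rely on is not shown anywhere in this excerpt; judging from its use, and from the validation the removed FrameSlice() overload performed, it presumably re-checks the redundant explicit arguments and returns the range itself, roughly along the lines of this hypothetical sketch:

    // hypothetical FrameRange member, inferred from the call sites above:
    const FrameRange & Check(size_t expectedStartColumn, size_t expectedNumCols) const
    {
        // same temporary safety net as the old FrameSlice(): the explicitly passed
        // start column and width must agree with what the FrameRange computes itself
        if (!IsAllFrames() && (expectedStartColumn != StartColumn() || expectedNumCols != NumCols()))
            LogicError("Check: FrameRange object gives different range than original explicit code");
        return *this;  // returned by reference so the result feeds straight into FrameSlice(..., m_pMBLayout)
    }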
@@ -433,11 +433,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        if (inputIndex > 0)
            InvalidArgument("MaxPooling operation only takes one input.");

-       Matrix<ElemType> sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
-       Matrix<ElemType> sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+       Matrix<ElemType> sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+       Matrix<ElemType> sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);

-       Matrix<ElemType> sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
-       Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+       Matrix<ElemType> sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+       Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);

        ComputeInputPartialV(sliceOutputGrad, sliceInput0Grad, sliceInput0Value, sliceOutputValue);
    }
@@ -447,8 +447,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
    {
-       Matrix<ElemType> sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
-       Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+       Matrix<ElemType> sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+       Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
        EvaluateThisNodeV(sliceOutputValue, sliceInput0Value);
    }
diff --git a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h
index cd9b6dec72a7..02c61c76d794 100644
--- a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h
@@ -348,15 +348,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        if (inputIndex == 0)  //left derivative
        {
-           Matrix<ElemType> sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
-           Matrix<ElemType> sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
+           Matrix<ElemType> sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+           Matrix<ElemType> sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);

            ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad);
        }
        else  //right derivative
        {
-
Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); } @@ -402,8 +402,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); } @@ -573,8 +573,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { assert(m_functionValues.GetNumRows() == GradientValues().GetNumRows()); // original used m_functionValues.GetNumRows() for loop dimension assert(m_pMBLayout); - Matrix mTmp = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix::ScaleAndAdd(1.0, GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), mTmp); + Matrix mTmp = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix::ScaleAndAdd(1.0, GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout), mTmp); } virtual void EvaluateThisNode() @@ -584,8 +584,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix mTmp = FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - mTmp.SetValue(Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences())); + Matrix mTmp = FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + 
mTmp.SetValue(Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout)); } virtual void /*ComputationNodeBase::*/Validate() diff --git a/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h b/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h index 69810bfd5674..4ea368de7937 100644 --- a/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h @@ -53,8 +53,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Negate operation only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad); } @@ -71,8 +71,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue); } @@ -138,8 +138,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("SumElements only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad); } @@ -156,8 +156,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - 
Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue); } @@ -233,8 +233,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("SumColumnElements only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad); } @@ -251,8 +251,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue); } @@ -370,8 +370,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("RowSlice only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), 
GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_startIndex, m_numRows); } @@ -388,8 +388,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue, m_startIndex, m_numRows); } @@ -486,8 +486,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex >= ChildrenSize()) InvalidArgument("RowStack-ComputeInputPartial: inputIndex out of range."); - Matrix sliceInputGrad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputGrad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_startRowIndeces[inputIndex], m_startRowIndeces[inputIndex+1] - m_startRowIndeces[inputIndex]); } @@ -504,7 +504,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceFunctionValues = FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceFunctionValues = FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceFunctionValues, m_inputMatrices, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); } @@ -623,15 +623,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { //left Node must be a scalar if (inputIndex == 0) //left derivative { - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + 
Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); } else { - Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); } @@ -654,8 +654,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); } @@ -742,15 +742,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex == 0) //left derivative { - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); } else //right derivative { - Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete 
this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); } @@ -801,8 +801,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t rows0 = Inputs(0)->FunctionValues().GetNumRows(), cols1 = Inputs(1)->FunctionValues().GetNumCols(); FunctionValues().Resize(rows0, cols1); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); } @@ -915,15 +915,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex == 0) //left derivative { - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); } else //right derivative { - Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); } @@ -969,8 +969,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), 
GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); } @@ -1073,10 +1073,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) InvalidArgument("ElementTimes operation only takes two inputs."); - Matrix sliceInput0Grad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Grad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1-inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Value = Inputs(1-inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad); } @@ -1100,9 +1100,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value); } @@ -1202,10 +1202,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) 
InvalidArgument("RowElementTimes operation only takes two inputs."); - Matrix sliceInput0Grad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Grad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1 - inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Value = Inputs(1 - inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); if (inputIndex == 0) { @@ -1252,9 +1252,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value); } @@ -1353,17 +1353,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) InvalidArgument("ColumnElementTimes operation only takes two inputs."); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); if (inputIndex == 0) { - Matrix sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), 
m_pMBLayout); ComputeInputPartialLeftS(Inputs(1)->FunctionValues(), sliceInput0Grad, sliceOutputGrad, m_tempMatrix); } else { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRightS(sliceInput0Value, Inputs(1)->GradientValues(), sliceOutputGrad, m_tempMatrix); } } @@ -1403,8 +1403,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues()); } @@ -1509,13 +1509,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { //only the one with more columns can be sliced, if both have same columns both are sliced size_t cols0 = Inputs(inputIndex)->FunctionValues().GetNumCols(), cols1=Inputs(1-inputIndex)->FunctionValues().GetNumCols(); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); if (cols0 >= cols1) { - Matrix sliceInput0Grad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceInput0Value = Inputs(inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Grad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceOutputValue, sliceOutputGrad, sliceInput0Value, sliceInput0Grad); } @@ -1584,25 
+1584,25 @@ namespace Microsoft { namespace MSR { namespace CNTK { { size_t cols0 = Inputs(0)->FunctionValues().GetNumCols(), cols1=Inputs(1)->FunctionValues().GetNumCols(); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); //only the one with more columns can be sliced, if both have same columns both are sliced if (cols0 == cols1) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value); } else if (cols0 > cols1) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues()); } else //cols0 < cols1 { - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); } @@ -1780,11 +1780,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { //only the one with more columns can be sliced, if both have same columns both are sliced size_t cols0 = Inputs(inputIndex)->FunctionValues().GetNumCols(), cols1=Inputs(1-inputIndex)->FunctionValues().GetNumCols(); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput0Grad = 
Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceInput0Value = Inputs(inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Grad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix ones = Matrix(); @@ -1890,25 +1890,25 @@ namespace Microsoft { namespace MSR { namespace CNTK { { size_t cols0 = Inputs(0)->FunctionValues().GetNumCols(), cols1=Inputs(1)->FunctionValues().GetNumCols(); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); //only the one with more columns can be sliced, if both have same columns both are sliced if (cols0 == cols1) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value); } else if (cols0 > cols1) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues()); } else //cols0 < cols1 { - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); } @@ -2048,16 +2048,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { InvalidArgument("DiagTimes operation only takes two inputs."); //left parameter (diag matrix cannot be sliced) - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete 
the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); if (inputIndex == 0) //left derivative { - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(m_innerproduct, sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); } else //right derivative { - Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(m_rightGradient, Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); } } @@ -2083,8 +2083,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); } @@ -2205,11 +2205,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) InvalidArgument("CosDistance operation only takes two inputs."); - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceInputGrad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = this->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * 
GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = this->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); if (inputIndex == 0) //left derivative { @@ -2280,9 +2280,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(m_invNorm0, m_invNorm1, sliceOutputValue, sliceInput0Value, sliceInput1Value); } @@ -2426,19 +2426,19 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) InvalidArgument("KhatriRaoProduct operation only takes two inputs."); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); if (inputIndex == 0) //left derivative { - Matrix sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), 
GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(sliceInput1Value, sliceInput0Grad, sliceOutputGrad); } else //right derivative { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(sliceInput0Value, sliceInput1Grad, sliceOutputGrad); } @@ -2461,9 +2461,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value); } @@ -2564,11 +2564,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) InvalidArgument("CosDistanceWithNegativeSamples operation only takes gradients on the first two inputs."); - Matrix sliceInput0Value = 
Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceThisGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(inputIndex, m_invNorm0, m_invNorm1, sliceOutputValue, m_temp, m_rightTerm, m_leftTerm, m_invNormSquare, sliceInput0Value, sliceInput1Value, Inputs(2)->FunctionValues(), Inputs(3)->FunctionValues(), sliceInputGrad, sliceThisGrad); } @@ -2681,9 +2681,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(m_invNorm0, m_invNorm1, sliceOutputValue, sliceInput0Value, sliceInput1Value, Inputs(2)->FunctionValues(), Inputs(3)->FunctionValues(), m_leftTerm, m_rightTerm); } @@ -2961,13 +2961,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) InvalidArgument("StrideTimes operation only takes two inputs."); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); if (m_StrideDim == 1) /// column stride { if (inputIndex == 0) //left derivative { - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() 
* GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); // TimesNode::ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); @@ -2995,7 +2995,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else //right derivative { - Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); // TimesNode::ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); @@ -3022,7 +3022,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (inputIndex == 0) //left derivative { - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); for (size_t k = 0; k < GetNumParallelSequences(); k++) { @@ -3047,7 +3047,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else //right derivative { - Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); for (size_t k = 0; k < GetNumParallelSequences(); k++) { @@ -3127,13 +3127,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { { size_t rows0 = Inputs(0)->FunctionValues().GetNumRows(), cols1 = Inputs(1)->FunctionValues().GetNumCols(); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); UpdateStride(sliceInput1Value); if (m_StrideDim == 0) FunctionValues().Resize(rows0 / GetNumParallelSequences(), cols1); if (m_StrideDim == 1) FunctionValues().Resize(rows0, cols1); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value, m_Stride, m_StrideDim); } diff --git a/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h b/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h index eca814210627..07182f8f093a 100644 --- a/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h @@ -63,11 +63,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { // We should also unify these two functions into one that decides 1 frame 
or all frames at runtime... through the slice-extractor function itself. // For now we could define ALL_SAMPLES e.g. as SIZE_MAX. // GetGradientSlice(), GetInputSlice() or something. - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); // why GradientValues() but m_functionValues below and not FunctionValues()? - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialV(m_gradient, sliceInputValue, sliceInputGrad, sliceOutputGrad); } @@ -81,8 +81,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeV(sliceOutputValue, sliceInputValue); } @@ -206,10 +206,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Sigmoid only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(m_gradient, sliceInputGrad, 
sliceOutputGrad, sliceOutputValue); } @@ -265,10 +265,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Tanh only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(m_gradient, sliceInputGrad, sliceOutputGrad, sliceOutputValue); } @@ -326,10 +326,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Log only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(m_gradient, sliceInputGrad, sliceInputValue, sliceOutputGrad); } @@ -386,10 +386,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Exp only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), 
GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(m_gradient, sliceInputGrad, sliceInputValue, sliceOutputGrad); } @@ -445,10 +445,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Cosine only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(m_gradient, sliceInputGrad, sliceInputValue, sliceOutputGrad); } @@ -508,10 +508,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Softmax only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(m_gradient, m_diff, sliceInputGrad, sliceOutputGrad, sliceOutputValue); } @@ -616,10 +616,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Softmax only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix 
sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(m_gradient, m_softmax, sliceInputGrad, sliceOutputGrad, sliceOutputValue); } @@ -727,8 +727,8 @@ virtual const std::wstring OperationName() const { return TypeName(); } //get the right slice const size_t colsPrior = Inputs(0)->FunctionValues().GetNumCols(); - Matrix sliceGradientValue = m_gradientValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix slicePosterior = m_posterior.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceGradientValue = m_gradientValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix slicePosterior = m_posterior.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); switch (inputIndex) { @@ -738,40 +738,40 @@ virtual const std::wstring OperationName() const { return TypeName(); } ComputeInputPartialUnnormedPrior(Inputs(0)->GradientValues(), sliceGradientValue, m_prior, slicePosterior, m_temp); else { - Matrix sliceUnnormedPriorGradient = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix slicePrior = m_prior.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceUnnormedPriorGradient = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix slicePrior = m_prior.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialUnnormedPrior(sliceUnnormedPriorGradient, sliceGradientValue, slicePrior, slicePosterior, m_temp); } } break; case 1: { - Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); if (colsPrior == 1) ComputeInputPartialMean(Inputs(1)->GradientValues(), sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); else 
{ - Matrix sliceMeanGradient = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceMeanGradient = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialMean(sliceMeanGradient, sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); } } break; case 2: { - Matrix sliceNormedDeviation = m_normedDeviation.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceNormedDeviation = m_normedDeviation.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); if (colsPrior == 1) ComputeInputPartialLogStddev(Inputs(2)->GradientValues(), sliceGradientValue, sliceNormedDeviation, slicePosterior, m_temp); else { - Matrix sliceLotStddevGradient = Inputs(2)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceLotStddevGradient = Inputs(2)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLogStddev(sliceLotStddevGradient, sliceGradientValue, sliceNormedDeviation, slicePosterior, m_temp); } } break; case 3: { - Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceFeatureGradient = Inputs(3)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceFeatureGradient = Inputs(3)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialFeature(sliceFeatureGradient, sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); } break; @@ -888,11 +888,11 @@ virtual const std::wstring OperationName() const { return TypeName(); } size_t numSamples = Inputs(3)->FunctionValues().GetNumCols(); //get the right slice - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceFeature = Inputs(3)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceNormedDeviation = m_normedDeviation.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix slicePosterior = m_posterior.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * 
GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceFeature = Inputs(3)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceNormedDeviation = m_normedDeviation.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix slicePosterior = m_posterior.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); if (colsPrior == 1) { @@ -901,12 +901,12 @@ virtual const std::wstring OperationName() const { return TypeName(); } } else if (colsPrior == numSamples) { - Matrix sliceUnnormedPrior = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceMean = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceLogstddev = Inputs(2)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceUnnormedPrior = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceMean = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceLogstddev = Inputs(2)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix slicePrior = m_prior.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceStddev = m_stddev.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix slicePrior = m_prior.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceStddev = m_stddev.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceUnnormedPrior, sliceMean, sliceLogstddev, sliceFeature, slicePrior, sliceStddev, sliceNormedDeviationVectors, sliceNormedDeviation, slicePosterior, m_temp); @@ -1113,13 +1113,13 @@ virtual const std::wstring OperationName() const { return TypeName(); } if (inputIndex > 0) InvalidArgument("Dropout operation only takes one input."); - Matrix sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = 
GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceMask = Matrix(); if (m_dropoutRate > 0) { - sliceMask = m_maskOfDropout.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + sliceMask = m_maskOfDropout.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); } ComputeInputPartialS(m_dropoutRate, sliceInput0Grad, sliceMask, sliceOutputGrad); @@ -1143,7 +1143,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } } virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = Matrix (); Matrix sliceMask = Matrix(); @@ -1151,10 +1151,10 @@ virtual const std::wstring OperationName() const { return TypeName(); } { FunctionValues().Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); m_maskOfDropout.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols()); - sliceMask = m_maskOfDropout.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + sliceMask = m_maskOfDropout.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); } - sliceOutputValue = FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + sliceOutputValue = FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(m_dropoutRate, m_randomSeed, sliceOutputValue, sliceMask, sliceInput0Value); } @@ -1405,8 +1405,9 @@ virtual const std::wstring OperationName() const { return TypeName(); } } size_t outputSamplesInRecurrentStep = GetNumParallelSequences() * rows / m_numRows; - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep); + Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + // BUGBUG: the following will fail since outputSamplesInRecurrentStep will not match 
m_pMBLayout. Need to find out what this means (currently layout is constant throughout the graph), and implement it correctly. + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue, m_numRows); } @@ -1449,8 +1450,9 @@ virtual const std::wstring OperationName() const { return TypeName(); } size_t outputSamplesInRecurrentStep = GetNumParallelSequences() * rows / m_numRows; - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep); + Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + // BUGBUG: the following will fail since outputSamplesInRecurrentStep will not match m_pMBLayout. Need to find out what this means (currently layout is constant throughout the graph), and implement it correctly. + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_numRows); } @@ -1646,8 +1648,8 @@ virtual const std::wstring OperationName() const { return TypeName(); } virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue, m_numRepeat); } @@ -1673,8 +1675,8 @@ virtual const std::wstring OperationName() const { return TypeName(); } if (inputIndex != 0) InvalidArgument("RowRepeat only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); + Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_numRepeat); } 
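The hunks above all thread one transitional idiom through the FrameSlice() call sites; reduced to its core it looks as follows. This is a minimal illustrative sketch, not the actual CNTK classes: FrameRangeSketch is a hypothetical stand-in for FrameRange, it assumes the column layout used throughout this code (the S parallel sequences of time step t occupy columns [t * S, (t + 1) * S) of the minibatch matrix), and LogicError is replaced by a plain exception.

    #include <cstddef>
    #include <stdexcept>

    // Sketch of the FrameSlice()/Check() idiom (illustrative stand-in only).
    struct FrameRangeSketch
    {
        size_t timeIdxInSeq;            // start frame t
        size_t samplesInRecurrentStep;  // number of parallel sequences S
        size_t StartColumn() const { return timeIdxInSeq * samplesInRecurrentStep; }
        size_t NumCols() const { return samplesInRecurrentStep; }
        // Transitional guard: the explicit (startColumn, numCols) pair that the
        // old FrameSlice() overload took must agree with what the FrameRange
        // computes on its own; once all call sites pass, the explicit arguments
        // (and this Check()) can be deleted.
        const FrameRangeSketch & Check(size_t expectedStartColumn, size_t expectedNumCols) const
        {
            if (expectedStartColumn != StartColumn() || expectedNumCols != NumCols())
                throw std::logic_error("Check: explicit column range disagrees with FrameRange");
            return *this; // by const reference, so it nests directly inside a FrameSlice() call
        }
    };

    // Usage, mirroring the call sites above:
    //   auto slice = values.FrameSlice(frameRange.Check(t * S, S), pMBLayout);

The ReshapeNode BUGBUG above is exactly a case where this guard must fire: its expected range is computed with outputSamplesInRecurrentStep rather than the layout's S, so the two sides disagree whenever rows != m_numRows.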
diff --git a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h index d0a73d2bac4e..2936774923d6 100644 --- a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h @@ -606,18 +606,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (int timeIdxInSeq = nT - GetNumParallelSequences(); timeIdxInSeq >= 0; timeIdxInSeq -= GetNumParallelSequences()) { FrameRange frameRange(timeIdxInSeq, GetNumParallelSequences()); - Matrix sliceObs = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()); - Matrix sliceOutput = FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()); - Matrix sliceState = m_State.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()); + Matrix sliceObs = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutput = FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); + Matrix sliceState = m_State.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); - Matrix sliceGi = m_Gi.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()); - Matrix sliceGf = m_Gf.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()); - Matrix sliceGo = m_Go.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()); + Matrix sliceGi = m_Gi.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); + Matrix sliceGf = m_Gf.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); + Matrix sliceGo = m_Go.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); - Matrix sliceTanhState = tanhState.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()); - Matrix sliceTanhObs = tanhObs.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()); + Matrix sliceTanhState = tanhState.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); + Matrix sliceTanhObs = tanhObs.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); - Matrix error = GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()); + Matrix error = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); Matrix grdToObsSlice(this->m_deviceId); @@ -666,7 +666,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { grdToPrevState, m_tempMatrix ); - grdToObs.FrameSlice(frameRange/*TODO: delete the next two parameters*/, timeIdxInSeq, GetNumParallelSequences()).SetValue(grdToObsSlice); + grdToObs.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout).SetValue(grdToObsSlice); PrepareErrors(timeIdxInSeq, grdToPrevOutput, grdToPrevState, 
GetNumParallelSequences(), &m_pMBLayout->GetM()); } @@ -997,16 +997,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t timeIdxInSeq = 0; timeIdxInSeq < nT; timeIdxInSeq += GetNumParallelSequences()) { FrameRange frameRange(timeIdxInSeq, GetNumParallelSequences()); - Matrix sliceObs = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), GetNumParallelSequences()); - Matrix sliceOutput = FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), GetNumParallelSequences()); - Matrix sliceState = m_State.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), GetNumParallelSequences()); + Matrix sliceObs = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutput = FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceState = m_State.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceGi = m_Gi.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), GetNumParallelSequences()); - Matrix sliceGf = m_Gf.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), GetNumParallelSequences()); - Matrix sliceGo = m_Go.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), GetNumParallelSequences()); + Matrix sliceGi = m_Gi.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceGf = m_Gf.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceGo = m_Go.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceTanhState = tanhState.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), GetNumParallelSequences()); - Matrix sliceTanhInput = tanhObs.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t(), GetNumParallelSequences()); + Matrix sliceTanhState = tanhState.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceTanhInput = tanhObs.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); PrepareHistory(timeIdxInSeq, mSlicePrevOutput, mSlicePrevState, FunctionValues(), m_State, m_PastOutput, m_PastState, GetNumParallelSequences(), m_DefaultState, &m_pMBLayout->GetM()); diff --git a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h index 6f608b529d88..c16fc6af5ec6 100644 --- a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h @@ -879,9 +879,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t sz = 0; for (size_t t = 0; t < nT; t++) { - FrameRange frameRange(t, 1); + FrameRange frameRange(t, 1); // TODO: change to frameRange over a whole MB with a sequence index. 
BUGBUG: below code will break until this is fixed /// compute prb - 1 and prb - Matrix lbl_t = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, t, 1); + Matrix lbl_t = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); size_t c_t = (size_t)lbl_t(1, 0); size_t lft_bnd = (size_t)lbl_t(2, 0); size_t rgt_bnd = (size_t)lbl_t(3, 0); @@ -890,14 +890,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { continue; Matrix input_weight_t = Inputs(2)->FunctionValues().ColumnSlice(lft_bnd, nbr_wrd); - Matrix obs = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, t, 1); + Matrix obs = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); Matrix grd_to_soft_max_input = m_grdToSoftMaxInput.ColumnSlice(sz, nbr_wrd); - Matrix grd_to_cls_prob = m_clsLogSoftmax.FrameSlice(frameRange/*TODO: delete the next two parameters*/, t, 1); + Matrix grd_to_cls_prob = m_clsLogSoftmax.FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); switch (inputIndex){ case 1: /// gradient to input - grd_t = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, t, 1); + grd_t = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); ComputeInputPartialRight(input_weight_t, grd_t, grd_to_soft_max_input); break; case 2: @@ -906,8 +906,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputeInputPartialLeft(obs, grd_to_wgt_t, grd_to_soft_max_input); break; case 3: - grd_t = Inputs(3)->GradientValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, t, 1); - grd_t.SetValue(m_clsSoftmax.FrameSlice(frameRange/*TODO: delete the next two parameters*/, t, 1)); + grd_t = Inputs(3)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + grd_t.SetValue(m_clsSoftmax.FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout)); ComputeCEPartialToSoftmaxInputs(grd_t, GradientValues(), c_t); break; default: @@ -947,7 +947,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { FrameRange frameRange(t, 1); /// compute prb - 1 and prb - Matrix lbl_t = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete the next two parameters*/, t, 1); + Matrix lbl_t = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); size_t y_t = (size_t)lbl_t(0, 0); size_t lft_bnd = (size_t)lbl_t(2, 0); size_t rgt_bnd = (size_t)lbl_t(3, 0); diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp index e65ed384a7a0..892f6b784383 100644 --- a/Math/Math/Matrix.cpp +++ b/Math/Math/Matrix.cpp @@ -773,6 +773,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } // special convenience function to apply ColumnSlice() to getting a frame range +#if 0 // It assumes that columns are frames, and returns a sub-range. // TODO: decide whether this belongs here or elsewhere // TODO: remove this one, as it does not take #slices explicitly, which will be needed in the future @@ -788,6 +789,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("FrameSlice: FrameRange object gives different range than original explicit code. 
Logic is borked."); return ColumnSlice(frameRange.StartColumn(), frameRange.NumCols()); } +#endif template Matrix Matrix::FrameSlice(const FrameRange & frameRange, const shared_ptr & pMBLayout) const { diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h index db99e0875956..89548709374a 100644 --- a/Math/Math/Matrix.h +++ b/Math/Math/Matrix.h @@ -146,7 +146,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix ColumnSlice(size_t startColumn, size_t numCols) const; // special convenience function to apply ColumnSlice() to getting a frame range - Matrix FrameSlice(const struct FrameRange & frameRange, size_t expectedStartColumn, size_t expectedNumCols) const; + //Matrix FrameSlice(const struct FrameRange & frameRange, size_t expectedStartColumn, size_t expectedNumCols) const; Matrix FrameSlice(const struct FrameRange & frameRange, const shared_ptr & pMBLayout) const; // difference between AssignColumnSlice and SetColumnSlice @@ -596,6 +596,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { // - RecurrentNodes iterate over individual slices--need a sub-setting constructor from a FrameRange to another? // - RecurrentNodes access boundary info with a similar pattern, but boundary info has a different #streams (namely, 1) // TODO: This will in the future be able to hold sub-ranges for nested loops as well. + // BUGBUG: These are currently broken and will need to be fixed: + // - ClassBasedCrossEntropyWithSoftmaxNode: + // FrameRange frameRange(t, 1); + // using a different #sequences. Solve by treating all frames as one sequence (in FrameRange) + // - ReshapeNode: + // Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep), m_pMBLayout); + // using a different #sequences. Find out what this really means. struct FrameRange { const size_t timeIdxInSeq; // start frame @@ -615,6 +622,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t StartColumn(const shared_ptr & pMBLayout) const { EnsureNotAllFrames(); VerifyMBLayout(pMBLayout); return timeIdxInSeq * pMBLayout->GetNumParallelSequences(); } size_t NumCols(const shared_ptr & pMBLayout) const { EnsureNotAllFrames(); VerifyMBLayout(pMBLayout); return pMBLayout->GetNumParallelSequences(); } bool IsAllFrames() const { return samplesInRecurrentStep == SIZE_MAX; } // if true then above functions may not be called; caller must use entire batch instead + + const FrameRange & Check(size_t expectedStartColumn, size_t expectedNumCols) const + { + if (!IsAllFrames() && (expectedStartColumn != StartColumn() || expectedNumCols != NumCols())) + LogicError("FrameSlice: FrameRange object gives different range than original explicit code. 
Logic is borked."); + return *this; + } private: FrameRange(const FrameRange & other);// : timeIdxInSeq(other.timeIdxInSeq), numFrames(other.numFrames) { } void operator=(const FrameRange &); From 94e354e08dd8ce114e0d122d05423370b780914e Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Mon, 21 Sep 2015 11:46:00 -0700 Subject: [PATCH 23/44] (made gcc happy) --- MachineLearning/CNTKComputationNetworkLib/ComputationNode.h | 1 + 1 file changed, 1 insertion(+) diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index 35427db341a4..390cce90db54 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -1278,6 +1278,7 @@ protected: \ using Base::m_indexInLoop; \ using Base::m_pMBLayout; \ using Base::m_reqMultiSeqHandling; using Base::UseCustomizedMultiSeqHandling; using Base::GetNumParallelSequences; \ + using Base::DataSlice; using Base::OUTPUT; using Base::VALUE; using Base::GRADIENT; \ using Base::m_children; using Base::m_deviceId; using Base::m_evalTimeStamp; using Base::m_functionValues; using Base::m_gradientValues; \ using Base::m_inputChannels; using Base::m_inputHeight; using Base::m_inputWidth; using Base::m_needGradient; using Base::m_nodeName; \ using Base::m_outputChannels; using Base::m_outputHeight; using Base::m_outputWidth; using Base::s_constOnes; using Base::s_timeStampCounter; \ From 23e6f5833fdbbee97dc516a78a9880eb23451955 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Mon, 21 Sep 2015 11:58:35 -0700 Subject: [PATCH 24/44] changed all Inputs(n)...FrameSlice to DataSlice(n, ...) --- .../CompositeComputationNodes.h | 4 +- .../ComputationNode.h | 4 +- .../ConvolutionalNodes.h | 12 +- .../InputAndParamNodes.h | 8 +- .../LinearAlgebraNodes.h | 108 +++++++++--------- .../NonlinearityNodes.h | 54 ++++----- .../RecurrentNodes.h | 6 +- .../TrainingCriterionNodes.h | 10 +- 8 files changed, 104 insertions(+), 102 deletions(-) diff --git a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h index 0ef020705100..6e0c2e07c1da 100644 --- a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h @@ -539,7 +539,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { //only feature (input0) and output needs to be sliced - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues()); @@ -690,7 +690,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { //only feature (input0) and output needs to be sliced - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete 
this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues()); diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index 390cce90db54..3b9a4145608d 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -1003,7 +1003,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { enum ValueOrGradient { VALUE, GRADIENT }; Matrix DataSlice(size_t index/*input index or OUT*/, ValueOrGradient valueOrGradient/*as it says*/, - const FrameRange & frameRange/*select frame or entire batch*/, size_t sequence = SEQUENCE_ALL/*SEQUENCE_ALL is the normal case*/) + const FrameRange & frameRange/*select frame or entire batch*/, + const MBLayoutPtr &, // DELETE THIS after refactoring; it's a dummy left-over + size_t sequence = SEQUENCE_ALL/*SEQUENCE_ALL is the normal case*/) { ComputationNode * node = (index == OUTPUT) ? this : Inputs(index).get(); Matrix & data = (valueOrGradient == VALUE) ? node->FunctionValues() : node->GradientValues(); diff --git a/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h b/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h index a497972dbc28..c3d7dbc31655 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h @@ -112,13 +112,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { InvalidArgument("Convolution operation only takes two inputs."); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); if (inputIndex == 0) //derivative with regard to the weight matrix ComputeInputPartialOverWeight(sliceOutputGrad, Inputs(0)->GradientValues(), Inputs(0)->FunctionValues(), sliceInput1Value, m_tempMatrix, !frameRange.IsAllFrames()); else // derivative with regard to the input feature { - Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialOverInputFeature(sliceOutputGrad, sliceInput1Grad, Inputs(0)->FunctionValues(), sliceInput1Value, m_tempMatrix); } } @@ -215,7 +215,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) 
{ - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value, m_tempMatrix); } @@ -433,10 +433,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 0) InvalidArgument("MaxPooling operation only takes one inputs."); - Matrix sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Grad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialV(sliceOutputGrad, sliceInput0Grad, sliceInput0Value, sliceOutputValue); @@ -447,7 +447,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeV(sliceOutputValue, sliceInput0Value); } diff --git a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h index 02c61c76d794..06e9331e5079 100644 --- a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h @@ -349,13 +349,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex == 0) //left derivative { Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); 
+ Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); } else //right derivative { - Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); @@ -402,7 +402,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); @@ -585,7 +585,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { Matrix mTmp = FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - mTmp.SetValue(Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout)); + mTmp.SetValue(DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout)); } virtual void /*ComputationNodeBase::*/Validate() diff --git a/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h b/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h index 4ea368de7937..aa5c8529e6e4 100644 --- a/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h @@ -53,7 +53,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Negate operation only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad); @@ -71,7 +71,7 @@ namespace Microsoft { namespace MSR { 
namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue); } @@ -138,7 +138,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("SumElements only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad); @@ -156,7 +156,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue); @@ -233,7 +233,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("SumColumnElements only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad); @@ -251,7 +251,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete 
this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue); @@ -370,7 +370,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("RowSlice only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_startIndex, m_numRows); @@ -388,7 +388,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue, m_startIndex, m_numRows); @@ -624,13 +624,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex == 0) //left derivative { Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); } else { - Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); @@ -654,7 +654,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * 
GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); @@ -743,13 +743,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex == 0) //left derivative { Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); } else //right derivative { - Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); @@ -801,7 +801,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t rows0 = Inputs(0)->FunctionValues().GetNumRows(), cols1 = Inputs(1)->FunctionValues().GetNumCols(); FunctionValues().Resize(rows0, cols1); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); @@ -916,13 +916,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex == 0) //left derivative { Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); } else //right derivative { - Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), 
m_pMBLayout); + Matrix sliceInput1Grad = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); @@ -969,7 +969,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); @@ -1100,8 +1100,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value); @@ -1252,8 +1252,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value); @@ -1357,13 +1357,13 @@ 
namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex == 0) { - Matrix sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Grad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeftS(Inputs(1)->FunctionValues(), sliceInput0Grad, sliceOutputGrad, m_tempMatrix); } else { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRightS(sliceInput0Value, Inputs(1)->GradientValues(), sliceOutputGrad, m_tempMatrix); } } @@ -1403,7 +1403,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues()); @@ -1589,20 +1589,20 @@ namespace Microsoft { namespace MSR { namespace CNTK { //only the one with more columns can be sliced, if both have same columns both are sliced if (cols0 == cols1) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value); } else if (cols0 > cols1) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues()); } else //cols0 < cols1) { - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, 
VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); } @@ -1895,20 +1895,20 @@ namespace Microsoft { namespace MSR { namespace CNTK { //only the one with more columns can be sliced, if both have same columns both are sliced if (cols0 == cols1) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value); } else if (cols0 > cols1) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues()); } else //cols0 < cols1) { - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); } @@ -2052,12 +2052,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex == 0) //left derivative { - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(m_innerproduct, sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); } else //right derivative { - Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(m_rightGradient, Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); } } @@ -2083,7 +2083,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * 
GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); @@ -2205,8 +2205,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) InvalidArgument("CosDistance operation only takes two inputs."); - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceInputGrad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = this->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -2280,8 +2280,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(m_invNorm0, m_invNorm1, sliceOutputValue, sliceInput0Value, sliceInput1Value); @@ -2430,15 +2430,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex == 0) //left derivative { - Matrix sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete 
this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Grad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(sliceInput1Value, sliceInput0Grad, sliceOutputGrad); } else //right derivative { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(sliceInput0Value, sliceInput1Grad, sliceOutputGrad); } @@ -2461,8 +2461,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value); @@ -2564,8 +2564,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) InvalidArgument("CosDistanceWithNegativeSamples operation only takes grdients on the first two inputs."); - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * 
GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceInputGrad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceThisGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -2681,8 +2681,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(m_invNorm0, m_invNorm1, sliceOutputValue, sliceInput0Value, sliceInput1Value, Inputs(2)->FunctionValues(), Inputs(3)->FunctionValues(), m_leftTerm, m_rightTerm); @@ -2967,7 +2967,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (inputIndex == 0) //left derivative { - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); // TimesNode::ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); @@ -2995,7 +2995,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else //right derivative { - Matrix sliceInput1Grad = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); // TimesNode::ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); @@ -3022,7 +3022,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (inputIndex == 0) //left derivative { - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); for (size_t k = 0; k < GetNumParallelSequences(); k++) { @@ -3047,7 +3047,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else //right derivative { - Matrix sliceInput1Grad = 
Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); for (size_t k = 0; k < GetNumParallelSequences(); k++) { @@ -3127,7 +3127,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { size_t rows0 = Inputs(0)->FunctionValues().GetNumRows(), cols1 = Inputs(1)->FunctionValues().GetNumCols(); - Matrix sliceInput1Value = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); UpdateStride(sliceInput1Value); if (m_StrideDim == 0) FunctionValues().Resize(rows0 / GetNumParallelSequences(), cols1); diff --git a/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h b/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h index 07182f8f093a..1a2566ffadac 100644 --- a/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h @@ -63,11 +63,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { // We should also unify these two functions into one that decides 1 frame or all frames at runtime... through the slice-extractor function itself. // For now we could define ALL_SAMPLES e.g. as SIZE_MAX. // GetGradientSlice(), GetInputSlice() or something. - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); // why GradientValues() but m_functionValues below and not FunctionValues()? 
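A minimal sketch of the column arithmetic behind all of these slices, assuming the layout implied by the Check() guards above (S parallel sequences x T time steps stored as S*T columns, with the S samples of time step t contiguous); FrameSliceSketch is a hypothetical name for illustration, not the CNTK API:

template <class ElemType>
Matrix<ElemType> FrameSliceSketch(Matrix<ElemType>& data, size_t t, size_t S)
{
    // the S samples of time step t occupy columns [t*S, (t+1)*S)
    return data.ColumnSlice(t * S, S); // column-range slice, no copy of the data
}

This (startColumn, numColumns) pair is exactly what the temporary frameRange.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()) calls verify at every call site.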
- Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialV(m_gradient, sliceInputValue, sliceInputGrad, sliceOutputGrad); } @@ -81,7 +81,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeV(sliceOutputValue, sliceInputValue); @@ -206,7 +206,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Sigmoid only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -265,7 +265,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Tanh only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -326,10 +326,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Log only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * 
GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(m_gradient, sliceInputGrad, sliceInputValue, sliceOutputGrad); } @@ -386,10 +386,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Exp only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(m_gradient, sliceInputGrad, sliceInputValue, sliceOutputGrad); } @@ -445,10 +445,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Cosine only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(m_gradient, sliceInputGrad, sliceInputValue, sliceOutputGrad); } @@ -508,7 +508,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Softmax only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete 
this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -616,7 +616,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Softmax only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -738,7 +738,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } ComputeInputPartialUnnormedPrior(Inputs(0)->GradientValues(), sliceGradientValue, m_prior, slicePosterior, m_temp); else { - Matrix sliceUnnormedPriorGradient = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceUnnormedPriorGradient = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix slicePrior = m_prior.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialUnnormedPrior(sliceUnnormedPriorGradient, sliceGradientValue, slicePrior, slicePosterior, m_temp); } @@ -751,7 +751,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } ComputeInputPartialMean(Inputs(1)->GradientValues(), sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); else { - Matrix sliceMeanGradient = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceMeanGradient = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialMean(sliceMeanGradient, sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); } } @@ -763,7 +763,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } ComputeInputPartialLogStddev(Inputs(2)->GradientValues(), sliceGradientValue, sliceNormedDeviation, slicePosterior, m_temp); else { - Matrix sliceLotStddevGradient = Inputs(2)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceLotStddevGradient = DataSlice(2, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLogStddev(sliceLotStddevGradient, sliceGradientValue, sliceNormedDeviation, slicePosterior, m_temp); } } @@ -771,7 +771,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } case 3: { Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), 
GetNumParallelSequences()), m_pMBLayout); - Matrix sliceFeatureGradient = Inputs(3)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceFeatureGradient = DataSlice(3, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialFeature(sliceFeatureGradient, sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); } break; @@ -889,7 +889,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } //get the right slice Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceFeature = Inputs(3)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceFeature = DataSlice(3, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceNormedDeviation = m_normedDeviation.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix slicePosterior = m_posterior.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -901,9 +901,9 @@ virtual const std::wstring OperationName() const { return TypeName(); } } else if (colsPrior == numSamples) { - Matrix sliceUnnormedPrior = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceMean = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceLogstddev = Inputs(2)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceUnnormedPrior = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceMean = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceLogstddev = DataSlice(2, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix slicePrior = m_prior.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceStddev = m_stddev.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -1113,7 +1113,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } if (inputIndex > 0) InvalidArgument("Dropout operation only takes one input."); - Matrix sliceInput0Grad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: 
delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Grad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceMask = Matrix(); @@ -1143,7 +1143,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } } virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = Matrix (); Matrix sliceMask = Matrix(); @@ -1405,7 +1405,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } } size_t outputSamplesInRecurrentStep = GetNumParallelSequences() * rows / m_numRows; - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); // BUGBUG: the following will fail since outputSamplesInRecurrentStep will not match m_pMBLayout. Need to find out what this means (currently layout is constant throughout the graph), and implement it correctly. Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep), m_pMBLayout); @@ -1450,7 +1450,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } size_t outputSamplesInRecurrentStep = GetNumParallelSequences() * rows / m_numRows; - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); // BUGBUG: the following will fail since outputSamplesInRecurrentStep will not match m_pMBLayout. Need to find out what this means (currently layout is constant throughout the graph), and implement it correctly. 
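// A concrete illustration of the mismatch flagged in the BUGBUG comment above,
// with hypothetical sizes (not taken from any actual model): if m_numRows = 10,
// the input has rows = 20, and GetNumParallelSequences() = 2, then
//   outputSamplesInRecurrentStep = 2 * 20 / 10 = 4,
// i.e. the output slice below spans 4 columns per time step while m_pMBLayout
// still describes only 2 parallel sequences, so the two cannot line up.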
Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep), m_pMBLayout); @@ -1648,7 +1648,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue, m_numRepeat); @@ -1675,7 +1675,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } if (inputIndex != 0) InvalidArgument("RowRepeat only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_numRepeat); diff --git a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h index 2936774923d6..02afa697c3d9 100644 --- a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h @@ -225,7 +225,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { d = (int)functionValues.Mod((float)delayedIndex, (float)delayedActivation.GetNumCols()); // this can point to the past activity of the previous minibatch - Matrix out = DataSlice(OUTPUT, VALUE, frameRange); + Matrix out = DataSlice(OUTPUT, VALUE, frameRange, m_pMBLayout); Matrix inp((DEVICEID_TYPE)functionValues.GetDeviceId()); if (minibatchPackingFlags & SequenceStart_or_End) @@ -606,7 +606,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (int timeIdxInSeq = nT - GetNumParallelSequences(); timeIdxInSeq >= 0; timeIdxInSeq -= GetNumParallelSequences()) { FrameRange frameRange(timeIdxInSeq, GetNumParallelSequences()); - Matrix sliceObs = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); + Matrix sliceObs = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutput = FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); Matrix sliceState = m_State.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); @@ -997,7 +997,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t timeIdxInSeq = 0; timeIdxInSeq < nT; timeIdxInSeq += GetNumParallelSequences()) { FrameRange frameRange(timeIdxInSeq, 
GetNumParallelSequences()); - Matrix sliceObs = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceObs = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutput = FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceState = m_State.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); diff --git a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h index c16fc6af5ec6..bcdbdf0a9135 100644 --- a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h @@ -881,7 +881,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { FrameRange frameRange(t, 1); // TODO: change to frameRange over a whole MB with a sequence index. BUGBUG: below code will break until this is fixed /// compute prb - 1 and prb - Matrix lbl_t = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + Matrix lbl_t = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); size_t c_t = (size_t)lbl_t(1, 0); size_t lft_bnd = (size_t)lbl_t(2, 0); size_t rgt_bnd = (size_t)lbl_t(3, 0); @@ -890,14 +890,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { continue; Matrix input_weight_t = Inputs(2)->FunctionValues().ColumnSlice(lft_bnd, nbr_wrd); - Matrix obs = Inputs(1)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + Matrix obs = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); Matrix grd_to_soft_max_input = m_grdToSoftMaxInput.ColumnSlice(sz, nbr_wrd); Matrix grd_to_cls_prob = m_clsLogSoftmax.FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); switch (inputIndex){ case 1: /// gradient to input - grd_t = Inputs(1)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + grd_t = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); ComputeInputPartialRight(input_weight_t, grd_t, grd_to_soft_max_input); break; case 2: @@ -906,7 +906,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputeInputPartialLeft(obs, grd_to_wgt_t, grd_to_soft_max_input); break; case 3: - grd_t = Inputs(3)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + grd_t = DataSlice(3, GRADIENT, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); grd_t.SetValue(m_clsSoftmax.FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout)); ComputeCEPartialToSoftmaxInputs(grd_t, GradientValues(), c_t); break; @@ -947,7 +947,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { FrameRange frameRange(t, 1); /// compute prb - 1 and prb - Matrix lbl_t = Inputs(0)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + Matrix lbl_t = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); size_t y_t = (size_t)lbl_t(0, 0); size_t lft_bnd = (size_t)lbl_t(2, 0); size_t rgt_bnd = (size_t)lbl_t(3, 0); From 3967ba9e8b2afa20b81e1022f82211418061e957 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Mon, 21 Sep 2015 12:07:34 -0700 Subject: [PATCH 
25/44] removed the input index from DataSlice(), seems more clear to say Inputs(n)->DataSlice(...) --- .../CompositeComputationNodes.h | 4 +- .../ComputationNode.h | 9 +- .../ConvolutionalNodes.h | 12 +- .../InputAndParamNodes.h | 8 +- .../LinearAlgebraNodes.h | 108 +++++++++--------- .../NonlinearityNodes.h | 54 ++++----- .../RecurrentNodes.h | 6 +- .../TrainingCriterionNodes.h | 10 +- 8 files changed, 104 insertions(+), 107 deletions(-) diff --git a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h index 6e0c2e07c1da..13d9576279cd 100644 --- a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h @@ -539,7 +539,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { //only feature (input0) and output needs to be sliced - Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues()); @@ -690,7 +690,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { //only feature (input0) and output needs to be sliced - Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues()); diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index 3b9a4145608d..895294514940 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -998,17 +998,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { // TODO: remove FrameRange::samplesInRecurrentStep from FrameRange, as it belongs into pMBLayout. Hence this function that binds both together. // Note: This is not used anywhere yet, only a sketch how we may further abstract timing. 
// TODO: move sequence into FrameRange object - enum Index : size_t { OUTPUT = SIZE_MAX }; #define SEQUENCE_ALL SIZE_MAX enum ValueOrGradient { VALUE, GRADIENT }; - Matrix DataSlice(size_t index/*input index or OUT*/, - ValueOrGradient valueOrGradient/*as it says*/, + Matrix DataSlice(ValueOrGradient valueOrGradient/*as it says*/, const FrameRange & frameRange/*select frame or entire batch*/, const MBLayoutPtr &, // DELETE THIS after refactoring; it's a dummy left-over size_t sequence = SEQUENCE_ALL/*SEQUENCE_ALL is the normal case*/) { - ComputationNode * node = (index == OUTPUT) ? this : Inputs(index).get(); - Matrix & data = (valueOrGradient == VALUE) ? node->FunctionValues() : node->GradientValues(); + Matrix & data = (valueOrGradient == VALUE) ? FunctionValues() : GradientValues(); if (frameRange.IsAllFrames()) { if (sequence == SEQUENCE_ALL) @@ -1280,7 +1277,7 @@ protected: \ using Base::m_indexInLoop; \ using Base::m_pMBLayout; \ using Base::m_reqMultiSeqHandling; using Base::UseCustomizedMultiSeqHandling; using Base::GetNumParallelSequences; \ - using Base::DataSlice; using Base::OUTPUT; using Base::VALUE; using Base::GRADIENT; \ + using Base::DataSlice; using Base::VALUE; using Base::GRADIENT; \ using Base::m_children; using Base::m_deviceId; using Base::m_evalTimeStamp; using Base::m_functionValues; using Base::m_gradientValues; \ using Base::m_inputChannels; using Base::m_inputHeight; using Base::m_inputWidth; using Base::m_needGradient; using Base::m_nodeName; \ using Base::m_outputChannels; using Base::m_outputHeight; using Base::m_outputWidth; using Base::s_constOnes; using Base::s_timeStampCounter; \ diff --git a/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h b/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h index c3d7dbc31655..3541a258adf7 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h @@ -112,13 +112,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { InvalidArgument("Convolution operation only takes two inputs."); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); if (inputIndex == 0) //derivative with regard to the weight matrix ComputeInputPartialOverWeight(sliceOutputGrad, Inputs(0)->GradientValues(), Inputs(0)->FunctionValues(), sliceInput1Value, m_tempMatrix, !frameRange.IsAllFrames()); else // derivative with regard to the input feature { - Matrix sliceInput1Grad = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialOverInputFeature(sliceOutputGrad, sliceInput1Grad, Inputs(0)->FunctionValues(), sliceInput1Value, m_tempMatrix); } } @@ -215,7 +215,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const 
FrameRange & frameRange) { - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value, m_tempMatrix); } @@ -433,10 +433,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 0) InvalidArgument("MaxPooling operation only takes one inputs."); - Matrix sliceInput0Grad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Grad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialV(sliceOutputGrad, sliceInput0Grad, sliceInput0Value, sliceOutputValue); @@ -447,7 +447,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeV(sliceOutputValue, sliceInput0Value); } diff --git a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h index 06e9331e5079..60e1bf0c1a5a 100644 --- a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h @@ -349,13 +349,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex == 0) //left derivative { Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = 
Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); } else //right derivative { - Matrix sliceInput1Grad = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); @@ -402,7 +402,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); @@ -585,7 +585,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { Matrix mTmp = FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - mTmp.SetValue(DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout)); + mTmp.SetValue(Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout)); } virtual void /*ComputationNodeBase::*/Validate() diff --git a/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h b/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h index aa5c8529e6e4..6af7eab08a5d 100644 --- a/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h @@ -53,7 +53,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Negate operation only has one input."); - Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad); @@ -71,7 +71,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void 
/*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue); } @@ -138,7 +138,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("SumElements only has one input."); - Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad); @@ -156,7 +156,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue); @@ -233,7 +233,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("SumColumnElements only has one input."); - Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad); @@ -251,7 +251,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), 
GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue); @@ -370,7 +370,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("RowSlice only has one input."); - Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_startIndex, m_numRows); @@ -388,7 +388,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue, m_startIndex, m_numRows); @@ -624,13 +624,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex == 0) //left derivative { Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); } else { - Matrix sliceInput1Grad = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); @@ -654,7 +654,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = 
m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); @@ -743,13 +743,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex == 0) //left derivative { Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); } else //right derivative { - Matrix sliceInput1Grad = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); @@ -801,7 +801,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t rows0 = Inputs(0)->FunctionValues().GetNumRows(), cols1 = Inputs(1)->FunctionValues().GetNumCols(); FunctionValues().Resize(rows0, cols1); - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); @@ -916,13 +916,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex == 0) //left derivative { Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); } else //right derivative { - Matrix sliceInput1Grad = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), 
GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); @@ -969,7 +969,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); @@ -1100,8 +1100,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value); @@ -1252,8 +1252,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value); @@ -1357,13 +1357,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex == 0) { - Matrix sliceInput0Grad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * 
GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Grad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeftS(Inputs(1)->FunctionValues(), sliceInput0Grad, sliceOutputGrad, m_tempMatrix); } else { - Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRightS(sliceInput0Value, Inputs(1)->GradientValues(), sliceOutputGrad, m_tempMatrix); } } @@ -1403,7 +1403,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues()); @@ -1589,20 +1589,20 @@ namespace Microsoft { namespace MSR { namespace CNTK { //only the one with more columns can be sliced, if both have same columns both are sliced if (cols0 == cols1) { - Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value); } else if (cols0 > cols1) { - Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues()); } else //cols0 < cols1) { - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); } @@ -1895,20 +1895,20 @@ namespace Microsoft { 
namespace MSR { namespace CNTK { //only the one with more columns can be sliced, if both have same columns both are sliced if (cols0 == cols1) { - Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value); } else if (cols0 > cols1) { - Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues()); } else //cols0 < cols1) { - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); } @@ -2052,12 +2052,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex == 0) //left derivative { - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(m_innerproduct, sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); } else //right derivative { - Matrix sliceInput1Grad = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(m_rightGradient, Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); } } @@ -2083,7 +2083,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete 
this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); @@ -2205,8 +2205,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) InvalidArgument("CosDistance operation only takes two inputs."); - Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceInputGrad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = this->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -2280,8 +2280,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(m_invNorm0, m_invNorm1, sliceOutputValue, sliceInput0Value, sliceInput1Value); @@ -2430,15 +2430,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex == 0) //left derivative { - Matrix sliceInput0Grad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Grad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), 
GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(sliceInput1Value, sliceInput0Grad, sliceOutputGrad); } else //right derivative { - Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Grad = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(sliceInput0Value, sliceInput1Grad, sliceOutputGrad); } @@ -2461,8 +2461,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value); @@ -2564,8 +2564,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) InvalidArgument("CosDistanceWithNegativeSamples operation only takes gradients on the first two inputs."); - Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceInputGrad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceThisGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -2681,8 +2681,8 @@ namespace
Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(m_invNorm0, m_invNorm1, sliceOutputValue, sliceInput0Value, sliceInput1Value, Inputs(2)->FunctionValues(), Inputs(3)->FunctionValues(), m_leftTerm, m_rightTerm); @@ -2967,7 +2967,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (inputIndex == 0) //left derivative { - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); // TimesNode::ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); @@ -2995,7 +2995,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else //right derivative { - Matrix sliceInput1Grad = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); // TimesNode::ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); @@ -3022,7 +3022,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (inputIndex == 0) //left derivative { - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); for (size_t k = 0; k < GetNumParallelSequences(); k++) { @@ -3047,7 +3047,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else //right derivative { - Matrix sliceInput1Grad = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); for (size_t k = 0; k < GetNumParallelSequences(); k++) { @@ -3127,7 +3127,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { size_t rows0 = Inputs(0)->FunctionValues().GetNumRows(), cols1 = 
Inputs(1)->FunctionValues().GetNumCols(); - Matrix sliceInput1Value = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); UpdateStride(sliceInput1Value); if (m_StrideDim == 0) FunctionValues().Resize(rows0 / GetNumParallelSequences(), cols1); diff --git a/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h b/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h index 1a2566ffadac..e363473a2155 100644 --- a/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h @@ -63,11 +63,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { // We should also unify these two functions into one that decides 1 frame or all frames at runtime... through the slice-extractor function itself. // For now we could define ALL_SAMPLES e.g. as SIZE_MAX. // GetGradientSlice(), GetInputSlice() or something. - Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); // why GradientValues() but m_functionValues below and not FunctionValues()? - Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialV(m_gradient, sliceInputValue, sliceInputGrad, sliceOutputGrad); } @@ -81,7 +81,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeV(sliceOutputValue, sliceInputValue); @@ -206,7 +206,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Sigmoid only has one input."); - Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete 
this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -265,7 +265,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Tanh only has one input."); - Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -326,10 +326,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Log only has one input."); - Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(m_gradient, sliceInputGrad, sliceInputValue, sliceOutputGrad); } @@ -386,10 +386,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Exp only has one input."); - Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(m_gradient, sliceInputGrad, sliceInputValue, sliceOutputGrad); } @@ -445,10 +445,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Cosine only has one input."); - Matrix sliceInputGrad = DataSlice(0, GRADIENT, 
frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(m_gradient, sliceInputGrad, sliceInputValue, sliceOutputGrad); } @@ -508,7 +508,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Softmax only has one input."); - Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -616,7 +616,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Softmax only has one input."); - Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -738,7 +738,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } ComputeInputPartialUnnormedPrior(Inputs(0)->GradientValues(), sliceGradientValue, m_prior, slicePosterior, m_temp); else { - Matrix sliceUnnormedPriorGradient = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceUnnormedPriorGradient = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix slicePrior = m_prior.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialUnnormedPrior(sliceUnnormedPriorGradient, sliceGradientValue, slicePrior, slicePosterior, m_temp); } @@ -751,7 +751,7 @@ virtual const 
std::wstring OperationName() const { return TypeName(); } ComputeInputPartialMean(Inputs(1)->GradientValues(), sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); else { - Matrix sliceMeanGradient = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceMeanGradient = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialMean(sliceMeanGradient, sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); } } @@ -763,7 +763,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } ComputeInputPartialLogStddev(Inputs(2)->GradientValues(), sliceGradientValue, sliceNormedDeviation, slicePosterior, m_temp); else { - Matrix sliceLotStddevGradient = DataSlice(2, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceLotStddevGradient = Inputs(2)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLogStddev(sliceLotStddevGradient, sliceGradientValue, sliceNormedDeviation, slicePosterior, m_temp); } } @@ -771,7 +771,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } case 3: { Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceFeatureGradient = DataSlice(3, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceFeatureGradient = Inputs(3)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialFeature(sliceFeatureGradient, sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); } break; @@ -889,7 +889,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } //get the right slice Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceFeature = DataSlice(3, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceFeature = Inputs(3)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceNormedDeviation = m_normedDeviation.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix slicePosterior = m_posterior.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -901,9 +901,9 @@ virtual const std::wstring OperationName() const { return TypeName(); } } else if (colsPrior == numSamples) { - Matrix sliceUnnormedPrior = 
DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceMean = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceLogstddev = DataSlice(2, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceUnnormedPrior = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceMean = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceLogstddev = Inputs(2)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix slicePrior = m_prior.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceStddev = m_stddev.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -1113,7 +1113,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } if (inputIndex > 0) InvalidArgument("Dropout operation only takes one input."); - Matrix sliceInput0Grad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Grad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceMask = Matrix(); @@ -1143,7 +1143,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } } virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = Matrix (); Matrix sliceMask = Matrix(); @@ -1405,7 +1405,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } } size_t outputSamplesInRecurrentStep = GetNumParallelSequences() * rows / m_numRows; - Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); // BUGBUG: the following will fail since outputSamplesInRecurrentStep will not match m_pMBLayout. Need to find out what this means (currently layout is constant throughout the graph), and implement it correctly. 
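// [editor's note, not part of the original patch] To make the BUGBUG above concrete: with
// outputSamplesInRecurrentStep = GetNumParallelSequences() * rows / m_numRows, reshaping from
// 'rows' to 'm_numRows' rows per sample rescales the column count of each time step; e.g.
// rows = 20, m_numRows = 10 and 2 parallel sequences give 2 * 20 / 10 = 4 output columns per
// step while m_pMBLayout still describes 2 -- exactly the mismatch the comment warns about.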
Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep), m_pMBLayout); @@ -1450,7 +1450,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } size_t outputSamplesInRecurrentStep = GetNumParallelSequences() * rows / m_numRows; - Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); // BUGBUG: the following will fail since outputSamplesInRecurrentStep will not match m_pMBLayout. Need to find out what this means (currently layout is constant throughout the graph), and implement it correctly. Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep), m_pMBLayout); @@ -1648,7 +1648,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue, m_numRepeat); @@ -1675,7 +1675,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } if (inputIndex != 0) InvalidArgument("RowRepeat only has one input."); - Matrix sliceInputGrad = DataSlice(0, GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_numRepeat); diff --git a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h index 02afa697c3d9..132296b4daa3 100644 --- a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h @@ -225,7 +225,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { d = (int)functionValues.Mod((float)delayedIndex, (float)delayedActivation.GetNumCols()); // this can point to the past activity of the previous minibatch - Matrix out = DataSlice(OUTPUT, VALUE, frameRange, m_pMBLayout); + Matrix out = DataSlice(VALUE, frameRange, m_pMBLayout); Matrix inp((DEVICEID_TYPE)functionValues.GetDeviceId()); if (minibatchPackingFlags & SequenceStart_or_End) @@ -606,7 +606,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (int timeIdxInSeq = nT - GetNumParallelSequences(); 
timeIdxInSeq >= 0; timeIdxInSeq -= GetNumParallelSequences()) { FrameRange frameRange(timeIdxInSeq, GetNumParallelSequences()); - Matrix sliceObs = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); + Matrix sliceObs = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutput = FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); Matrix sliceState = m_State.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); @@ -997,7 +997,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t timeIdxInSeq = 0; timeIdxInSeq < nT; timeIdxInSeq += GetNumParallelSequences()) { FrameRange frameRange(timeIdxInSeq, GetNumParallelSequences()); - Matrix sliceObs = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceObs = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutput = FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceState = m_State.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); diff --git a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h index bcdbdf0a9135..832419ab2882 100644 --- a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h @@ -881,7 +881,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { FrameRange frameRange(t, 1); // TODO: change to frameRange over a whole MB with a sequence index. 
BUGBUG: below code will break until this is fixed /// compute prb - 1 and prb - Matrix lbl_t = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + Matrix lbl_t = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); size_t c_t = (size_t)lbl_t(1, 0); size_t lft_bnd = (size_t)lbl_t(2, 0); size_t rgt_bnd = (size_t)lbl_t(3, 0); @@ -890,14 +890,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { continue; Matrix input_weight_t = Inputs(2)->FunctionValues().ColumnSlice(lft_bnd, nbr_wrd); - Matrix obs = DataSlice(1, VALUE, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + Matrix obs = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); Matrix grd_to_soft_max_input = m_grdToSoftMaxInput.ColumnSlice(sz, nbr_wrd); Matrix grd_to_cls_prob = m_clsLogSoftmax.FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); switch (inputIndex){ case 1: /// gradient to input - grd_t = DataSlice(1, GRADIENT, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + grd_t = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); ComputeInputPartialRight(input_weight_t, grd_t, grd_to_soft_max_input); break; case 2: @@ -906,7 +906,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputeInputPartialLeft(obs, grd_to_wgt_t, grd_to_soft_max_input); break; case 3: - grd_t = DataSlice(3, GRADIENT, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + grd_t = Inputs(3)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); grd_t.SetValue(m_clsSoftmax.FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout)); ComputeCEPartialToSoftmaxInputs(grd_t, GradientValues(), c_t); break; @@ -947,7 +947,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { FrameRange frameRange(t, 1); /// compute prb - 1 and prb - Matrix lbl_t = DataSlice(0, VALUE, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + Matrix lbl_t = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); size_t y_t = (size_t)lbl_t(0, 0); size_t lft_bnd = (size_t)lbl_t(2, 0); size_t rgt_bnd = (size_t)lbl_t(3, 0); From acfd0108c38bc7c352dde195e2c5db80bb4dcbc7 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Mon, 21 Sep 2015 13:32:07 -0700 Subject: [PATCH 26/44] switched DataSlice(VALUE/GRADIENT...) 
to Value/GradientSlice() --- .../CompositeComputationNodes.h | 4 +- .../ComputationNode.h | 33 +++- .../ConvolutionalNodes.h | 16 +- .../InputAndParamNodes.h | 18 +- .../LinearAlgebraNodes.h | 174 +++++++++--------- .../NonlinearityNodes.h | 78 ++++---- .../RecurrentNodes.h | 12 +- .../TrainingCriterionNodes.h | 10 +- Math/Math/Matrix.h | 2 +- 9 files changed, 180 insertions(+), 167 deletions(-) diff --git a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h index 13d9576279cd..667808ed19aa 100644 --- a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h @@ -539,7 +539,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { //only feature (input0) and output needs to be sliced - Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues()); @@ -690,7 +690,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { //only feature (input0) and output needs to be sliced - Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues()); diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index 895294514940..83b5d75f9062 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -997,18 +997,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Note: This returns an object, not a reference. That object is a column slice, i.e. a small object that just points into another object. // TODO: remove FrameRange::samplesInRecurrentStep from FrameRange, as it belongs into pMBLayout. Hence this function that binds both together. // Note: This is not used anywhere yet, only a sketch how we may further abstract timing. 
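// [editor's note, not part of the original patch] The hunk below is the heart of this commit:
// the slicing arithmetic moves into a core DataSlice(Matrix& data, frameRange, layout) that is
// handed the target matrix explicitly, the old enum-based DataSlice(VALUE/GRADIENT, ...) survives
// as a thin forwarder, and new ValueSlice()/GradientSlice() shorthands are added; the remaining
// files in this patch are the mechanical switch of call sites to those shorthands.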
- // TODO: move sequence into FrameRange object -#define SEQUENCE_ALL SIZE_MAX - enum ValueOrGradient { VALUE, GRADIENT }; - Matrix DataSlice(ValueOrGradient valueOrGradient/*as it says*/, + Matrix DataSlice(Matrix & data, const FrameRange & frameRange/*select frame or entire batch*/, - const MBLayoutPtr &, // DELETE THIS after refactoring; it's a dummy left-over - size_t sequence = SEQUENCE_ALL/*SEQUENCE_ALL is the normal case*/) + const MBLayoutPtr &) // DELETE THIS after refactoring; it's a dummy left-over { - Matrix & data = (valueOrGradient == VALUE) ? FunctionValues() : GradientValues(); + auto sequence = SIZE_MAX; if (frameRange.IsAllFrames()) { - if (sequence == SEQUENCE_ALL) + if (sequence == SIZE_MAX) return data.ColumnSlice(0, data.GetNumCols()); else LogicError("DataSlice: sequence index only supported when accessing individual frame"); // (not needed; doable but more involved, requiring a reshape) @@ -1019,12 +1015,29 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (numParallelSequences != frameRange.samplesInRecurrentStep) LogicError("DataSlice: inconsistent samplesInRecurrentStep"); // TODO: this will go away when we remove this member from FrameRange size_t startColumn = frameRange.t() * numParallelSequences; - if (sequence == SEQUENCE_ALL) + if (sequence == SIZE_MAX) return data.ColumnSlice(startColumn, numParallelSequences); else return data.ColumnSlice(startColumn + sequence, 1); } - // TODO: + } + enum ValueOrGradient { VALUE, GRADIENT }; + Matrix DataSlice(ValueOrGradient valueOrGradient/*as it says*/, + const FrameRange & frameRange/*select frame or entire batch*/, + const MBLayoutPtr &) // DELETE THIS after refactoring; it's a dummy left-over + { + Matrix & data = (valueOrGradient == VALUE) ? FunctionValues() : GradientValues(); + return DataSlice(data, frameRange, m_pMBLayout); + } + Matrix ValueSlice(const FrameRange & frameRange/*select frame or entire batch*/, + const MBLayoutPtr &) // DELETE THIS after refactoring; it's a dummy left-over + { + return DataSlice(FunctionValues(), frameRange, m_pMBLayout); + } + Matrix GradientSlice(const FrameRange & frameRange/*select frame or entire batch*/, + const MBLayoutPtr &) // DELETE THIS after refactoring; it's a dummy left-over + { + return DataSlice(GradientValues(), frameRange, m_pMBLayout); } // this is the entry point from Network; while it will call virtual ComputeInputPartial() into the actual node implementation diff --git a/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h b/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h index 3541a258adf7..c42109f7f062 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h @@ -111,14 +111,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) InvalidArgument("Convolution operation only takes two inputs."); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete
this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); if (inputIndex == 0) //derivative with regard to the weight matrix ComputeInputPartialOverWeight(sliceOutputGrad, Inputs(0)->GradientValues(), Inputs(0)->FunctionValues(), sliceInput1Value, m_tempMatrix, !frameRange.IsAllFrames()); else // derivative with regard to the input feature { - Matrix sliceInput1Grad = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialOverInputFeature(sliceOutputGrad, sliceInput1Grad, Inputs(0)->FunctionValues(), sliceInput1Value, m_tempMatrix); } } @@ -215,7 +215,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value, m_tempMatrix); } @@ -433,10 +433,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 0) InvalidArgument("MaxPooling operation only takes one input."); - Matrix sliceInput0Grad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Grad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialV(sliceOutputGrad, sliceInput0Grad, sliceInput0Value, sliceOutputValue); @@ -447,7 +447,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value =
Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeV(sliceOutputValue, sliceInput0Value); } diff --git a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h index 60e1bf0c1a5a..6e54fdc75220 100644 --- a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h @@ -348,15 +348,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex == 0) //left derivative { - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); } else //right derivative { - Matrix sliceInput1Grad = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); } @@ -402,7 +402,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); @@ -573,8 +573,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { assert(m_functionValues.GetNumRows() == GradientValues().GetNumRows()); // original used m_functionValues.GetNumRows() for loop dimension assert(m_pMBLayout); - Matrix mTmp = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete 
this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix::ScaleAndAdd(1.0, GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout), mTmp); + Matrix mTmp = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix::ScaleAndAdd(1.0, GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout), mTmp); } virtual void EvaluateThisNode() @@ -584,8 +584,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix mTmp = FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - mTmp.SetValue(Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout)); + Matrix mTmp = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + mTmp.SetValue(Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout)); } virtual void /*ComputationNodeBase::*/Validate() diff --git a/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h b/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h index 6af7eab08a5d..322ecce8f238 100644 --- a/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h @@ -53,8 +53,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Negate operation only has one input."); - Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad); } @@ -71,7 +71,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue); } @@ -138,8 +138,8 @@ namespace Microsoft { 
namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("SumElements only has one input."); - Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad); } @@ -156,7 +156,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue); @@ -233,8 +233,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("SumColumnElements only has one input."); - Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad); } @@ -251,7 +251,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue); @@ -370,8 +370,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("RowSlice only has one input."); - Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * 
GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_startIndex, m_numRows); } @@ -388,7 +388,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue, m_startIndex, m_numRows); @@ -486,8 +486,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex >= ChildrenSize()) InvalidArgument("RowStack-ComputeInputPartial: inputIndex out of range."); - Matrix sliceInputGrad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_startRowIndeces[inputIndex], m_startRowIndeces[inputIndex+1] - m_startRowIndeces[inputIndex]); } @@ -504,7 +504,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceFunctionValues = FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceFunctionValues = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceFunctionValues, m_inputMatrices, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()); } @@ -623,15 +623,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { //left Node must be a scalar if (inputIndex == 0) //left derivative { - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: 
delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); } else { - Matrix sliceInput1Grad = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); } @@ -654,7 +654,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); @@ -742,15 +742,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex == 0) //left derivative { - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); } else //right derivative { - Matrix sliceInput1Grad = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * 
GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); } @@ -801,7 +801,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t rows0 = Inputs(0)->FunctionValues().GetNumRows(), cols1 = Inputs(1)->FunctionValues().GetNumCols(); FunctionValues().Resize(rows0, cols1); - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); @@ -915,15 +915,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex == 0) //left derivative { - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); } else //right derivative { - Matrix sliceInput1Grad = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); } @@ -969,7 +969,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), 
GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); @@ -1073,10 +1073,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) InvalidArgument("ElementTimes operation only takes two inputs."); - Matrix sliceInput0Grad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Grad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1-inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1-inputIndex)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad); } @@ -1100,8 +1100,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value); @@ -1202,10 +1202,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) InvalidArgument("RowElementTimes operation only takes two inputs."); - Matrix sliceInput0Grad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Grad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1 - 
inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1 - inputIndex)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); if (inputIndex == 0) { @@ -1252,8 +1252,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value); @@ -1353,17 +1353,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) InvalidArgument("ColumnElementTimes operation only takes two inputs."); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); if (inputIndex == 0) { - Matrix sliceInput0Grad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Grad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeftS(Inputs(1)->FunctionValues(), sliceInput0Grad, sliceOutputGrad, m_tempMatrix); } else { - Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialRightS(sliceInput0Value, Inputs(1)->GradientValues(), sliceOutputGrad, m_tempMatrix); } } @@ -1403,7 +1403,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = 
m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues()); @@ -1509,13 +1509,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { //only the one with more columns can be sliced, if both have same columns both are sliced size_t cols0 = Inputs(inputIndex)->FunctionValues().GetNumCols(), cols1=Inputs(1-inputIndex)->FunctionValues().GetNumCols(); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); if (cols0 >= cols1) { - Matrix sliceInput0Grad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput0Value = Inputs(inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Grad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(inputIndex)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceOutputValue, sliceOutputGrad, sliceInput0Value, sliceInput0Grad); } @@ -1589,20 +1589,20 @@ namespace Microsoft { namespace MSR { namespace CNTK { //only the one with more columns can be sliced, if both have same columns both are sliced if (cols0 == cols1) { - Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value); } else if (cols0 > cols1) { - Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues()); } else //cols0 < cols1) { - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), 
GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); } @@ -1780,11 +1780,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { //only the one with more columns can be sliced, if both have same columns both are sliced size_t cols0 = Inputs(inputIndex)->FunctionValues().GetNumCols(), cols1=Inputs(1-inputIndex)->FunctionValues().GetNumCols(); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput0Grad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput0Value = Inputs(inputIndex)->FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Grad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(inputIndex)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix ones = Matrix(); @@ -1895,20 +1895,20 @@ namespace Microsoft { namespace MSR { namespace CNTK { //only the one with more columns can be sliced, if both have same columns both are sliced if (cols0 == cols1) { - Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value); } else if (cols0 > cols1) { - Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues()); } else //cols0 < cols1) { - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), 
m_pMBLayout);
+                Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);

                EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value);
            }
@@ -2048,16 +2048,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                InvalidArgument("DiagTimes operation only takes two inputs.");

            //left parameter (diag matrix cannot be sliced)
-            Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);

            if (inputIndex == 0)  //left derivative
            {
-                Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
                ComputeInputPartialLeft(m_innerproduct, sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad);
            }
            else  //right derivative
            {
-                Matrix sliceInput1Grad = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
                ComputeInputPartialRight(m_rightGradient, Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad);
            }
        }
@@ -2083,7 +2083,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

        virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
        {
-            Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
            Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);

            EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value);
@@ -2205,11 +2205,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            if (inputIndex > 1)
                InvalidArgument("CosDistance operation only takes two inputs.");

-            Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
            Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete 
this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInputGrad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = this->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = this->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); if (inputIndex == 0) //left derivative { @@ -2280,8 +2280,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(m_invNorm0, m_invNorm1, sliceOutputValue, sliceInput0Value, sliceInput1Value); @@ -2426,19 +2426,19 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) InvalidArgument("KhatriRaoProduct operation only takes two inputs."); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); if (inputIndex == 0) //left derivative { - Matrix sliceInput0Grad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Grad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLeft(sliceInput1Value, sliceInput0Grad, sliceOutputGrad); } else //right derivative { - Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), 
GetNumParallelSequences()), m_pMBLayout);
-            Matrix sliceInput1Grad = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);

            ComputeInputPartialRight(sliceInput0Value, sliceInput1Grad, sliceOutputGrad);
            }
@@ -2461,8 +2461,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {

        virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
        {
-            Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
            Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);

            EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
@@ -2564,11 +2564,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            if (inputIndex > 1)
                InvalidArgument("CosDistanceWithNegativeSamples operation only takes gradients on the first two inputs.");

-            Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
            Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix sliceInputGrad = Inputs(inputIndex)->GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix sliceThisGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix sliceInputGrad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix sliceThisGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * 
GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(inputIndex, m_invNorm0, m_invNorm1, sliceOutputValue, m_temp, m_rightTerm, m_leftTerm, m_invNormSquare, sliceInput0Value, sliceInput1Value, Inputs(2)->FunctionValues(), Inputs(3)->FunctionValues(), sliceInputGrad, sliceThisGrad); } @@ -2681,8 +2681,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(m_invNorm0, m_invNorm1, sliceOutputValue, sliceInput0Value, sliceInput1Value, Inputs(2)->FunctionValues(), Inputs(3)->FunctionValues(), m_leftTerm, m_rightTerm); @@ -2961,13 +2961,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) InvalidArgument("StrideTimes operation only takes two inputs."); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); if (m_StrideDim == 1) /// column stride { if (inputIndex == 0) //left derivative { - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); // TimesNode::ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); @@ -2995,7 +2995,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else //right derivative { - Matrix sliceInput1Grad = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); // TimesNode::ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); @@ -3022,7 +3022,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (inputIndex == 0) //left derivative { - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete 
this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); for (size_t k = 0; k < GetNumParallelSequences(); k++) { @@ -3047,7 +3047,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else //right derivative { - Matrix sliceInput1Grad = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); for (size_t k = 0; k < GetNumParallelSequences(); k++) { @@ -3127,7 +3127,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { size_t rows0 = Inputs(0)->FunctionValues().GetNumRows(), cols1 = Inputs(1)->FunctionValues().GetNumCols(); - Matrix sliceInput1Value = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); UpdateStride(sliceInput1Value); if (m_StrideDim == 0) FunctionValues().Resize(rows0 / GetNumParallelSequences(), cols1); diff --git a/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h b/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h index e363473a2155..efff274b6ff5 100644 --- a/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h @@ -63,11 +63,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { // We should also unify these two functions into one that decides 1 frame or all frames at runtime... through the slice-extractor function itself. // For now we could define ALL_SAMPLES e.g. as SIZE_MAX. // GetGradientSlice(), GetInputSlice() or something. - Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); // why GradientValues() but m_functionValues below and not FunctionValues()? 
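            // One possible shape for the unified slice extractor suggested above -- a
            // sketch only: GetSlice is a hypothetical name, and the MBLayout consistency
            // checks that DataSlice() performs are omitted here:
            //
            //     Matrix GetSlice(ValueOrGradient which, const FrameRange & frameRange)
            //     {
            //         Matrix & data = (which == VALUE) ? FunctionValues() : GradientValues();
            //         if (frameRange.IsAllFrames())
            //             return data.ColumnSlice(0, data.GetNumCols());    // whole minibatch
            //         else
            //             return data.ColumnSlice(frameRange.t() * GetNumParallelSequences(),
            //                                     GetNumParallelSequences());    // single frame
            //         // (ALL_SAMPLES/SIZE_MAX would come in as a third, defaulted argument)
            //     }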
- Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialV(m_gradient, sliceInputValue, sliceInputGrad, sliceOutputGrad); } @@ -81,7 +81,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeV(sliceOutputValue, sliceInputValue); @@ -206,8 +206,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Sigmoid only has one input."); - Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -265,8 +265,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Tanh only has one input."); - Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -326,10 +326,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Log only has one input."); - Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), 
m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(m_gradient, sliceInputGrad, sliceInputValue, sliceOutputGrad); } @@ -386,10 +386,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Exp only has one input."); - Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(m_gradient, sliceInputGrad, sliceInputValue, sliceOutputGrad); } @@ -445,10 +445,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Cosine only has one input."); - Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(m_gradient, sliceInputGrad, sliceInputValue, sliceOutputGrad); } @@ 
-508,8 +508,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Softmax only has one input."); - Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -616,8 +616,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Softmax only has one input."); - Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -738,7 +738,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } ComputeInputPartialUnnormedPrior(Inputs(0)->GradientValues(), sliceGradientValue, m_prior, slicePosterior, m_temp); else { - Matrix sliceUnnormedPriorGradient = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceUnnormedPriorGradient = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix slicePrior = m_prior.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialUnnormedPrior(sliceUnnormedPriorGradient, sliceGradientValue, slicePrior, slicePosterior, m_temp); } @@ -751,7 +751,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } ComputeInputPartialMean(Inputs(1)->GradientValues(), sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); else { - Matrix sliceMeanGradient = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceMeanGradient = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); 
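// The renames in these hunks are mechanical. As a minimal sketch (the surrounding node code is
// hypothetical; only the two call forms are taken from this patch), every site changes from the
// enum-tagged accessor to a dedicated wrapper:
//
//     Matrix sliceGrad = Inputs(1)->DataSlice(GRADIENT, frameRange, m_pMBLayout);   // before
//     Matrix sliceGrad = Inputs(1)->GradientSlice(frameRange, m_pMBLayout);         // after
//
// Both forms return the same column slice of the child's GradientValues(); only the call-site
// spelling changes, which makes value and gradient accesses easy to tell apart when reading.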
ComputeInputPartialMean(sliceMeanGradient, sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); } } @@ -763,7 +763,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } ComputeInputPartialLogStddev(Inputs(2)->GradientValues(), sliceGradientValue, sliceNormedDeviation, slicePosterior, m_temp); else { - Matrix sliceLotStddevGradient = Inputs(2)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceLotStddevGradient = Inputs(2)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialLogStddev(sliceLotStddevGradient, sliceGradientValue, sliceNormedDeviation, slicePosterior, m_temp); } } @@ -771,7 +771,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } case 3: { Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceFeatureGradient = Inputs(3)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceFeatureGradient = Inputs(3)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialFeature(sliceFeatureGradient, sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); } break; @@ -889,7 +889,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } //get the right slice Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceFeature = Inputs(3)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceFeature = Inputs(3)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceNormedDeviation = m_normedDeviation.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix slicePosterior = m_posterior.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -901,9 +901,9 @@ virtual const std::wstring OperationName() const { return TypeName(); } } else if (colsPrior == numSamples) { - Matrix sliceUnnormedPrior = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceMean = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceLogstddev = Inputs(2)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceUnnormedPrior = 
Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceMean = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceLogstddev = Inputs(2)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix slicePrior = m_prior.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceStddev = m_stddev.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); @@ -1113,8 +1113,8 @@ virtual const std::wstring OperationName() const { return TypeName(); } if (inputIndex > 0) InvalidArgument("Dropout operation only takes one input."); - Matrix sliceInput0Grad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Grad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceMask = Matrix(); if (m_dropoutRate > 0) @@ -1143,7 +1143,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } } virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = Matrix (); Matrix sliceMask = Matrix(); @@ -1154,7 +1154,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } sliceMask = m_maskOfDropout.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); } - sliceOutputValue = FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(m_dropoutRate, m_randomSeed, sliceOutputValue, sliceMask, sliceInput0Value); } @@ -1405,7 +1405,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } } size_t outputSamplesInRecurrentStep = GetNumParallelSequences() * rows / m_numRows; - Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), 
GetNumParallelSequences()), m_pMBLayout); // BUGBUG: the following will fail since outputSamplesInRecurrentStep will not match m_pMBLayout. Need to find out what this means (currently layout is constant throughout the graph), and implement it correctly. Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep), m_pMBLayout); @@ -1450,9 +1450,9 @@ virtual const std::wstring OperationName() const { return TypeName(); } size_t outputSamplesInRecurrentStep = GetNumParallelSequences() * rows / m_numRows; - Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); // BUGBUG: the following will fail since outputSamplesInRecurrentStep will not match m_pMBLayout. Need to find out what this means (currently layout is constant throughout the graph), and implement it correctly. - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_numRows); } @@ -1648,7 +1648,7 @@ virtual const std::wstring OperationName() const { return TypeName(); } virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); EvaluateThisNodeS(sliceOutputValue, sliceInputValue, m_numRepeat); @@ -1675,8 +1675,8 @@ virtual const std::wstring OperationName() const { return TypeName(); } if (inputIndex != 0) InvalidArgument("RowRepeat only has one input."); - Matrix sliceInputGrad = Inputs(0)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_numRepeat); } diff --git a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h index 132296b4daa3..e6f45eb57909 100644 --- 
a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h @@ -225,7 +225,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { d = (int)functionValues.Mod((float)delayedIndex, (float)delayedActivation.GetNumCols()); // this can point to the past activity of the previous minibatch - Matrix out = DataSlice(VALUE, frameRange, m_pMBLayout); + Matrix out = ValueSlice(frameRange, m_pMBLayout); Matrix inp((DEVICEID_TYPE)functionValues.GetDeviceId()); if (minibatchPackingFlags & SequenceStart_or_End) @@ -606,8 +606,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (int timeIdxInSeq = nT - GetNumParallelSequences(); timeIdxInSeq >= 0; timeIdxInSeq -= GetNumParallelSequences()) { FrameRange frameRange(timeIdxInSeq, GetNumParallelSequences()); - Matrix sliceObs = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutput = FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); + Matrix sliceObs = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutput = ValueSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); Matrix sliceState = m_State.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); Matrix sliceGi = m_Gi.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); @@ -617,7 +617,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix sliceTanhState = tanhState.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); Matrix sliceTanhObs = tanhObs.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); - Matrix error = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); + Matrix error = GradientSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); Matrix grdToObsSlice(this->m_deviceId); @@ -997,8 +997,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t timeIdxInSeq = 0; timeIdxInSeq < nT; timeIdxInSeq += GetNumParallelSequences()) { FrameRange frameRange(timeIdxInSeq, GetNumParallelSequences()); - Matrix sliceObs = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutput = FunctionValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceObs = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutput = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceState = m_State.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); Matrix sliceGi = m_Gi.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); diff --git a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h index 832419ab2882..e15d802c9250 
100644 --- a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h @@ -881,7 +881,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { FrameRange frameRange(t, 1); // TODO: change to frameRange over a whole MB with a sequence index. BUGBUG: below code will break until this is fixed /// compute prb - 1 and prb - Matrix lbl_t = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + Matrix lbl_t = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); size_t c_t = (size_t)lbl_t(1, 0); size_t lft_bnd = (size_t)lbl_t(2, 0); size_t rgt_bnd = (size_t)lbl_t(3, 0); @@ -890,14 +890,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { continue; Matrix input_weight_t = Inputs(2)->FunctionValues().ColumnSlice(lft_bnd, nbr_wrd); - Matrix obs = Inputs(1)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + Matrix obs = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); Matrix grd_to_soft_max_input = m_grdToSoftMaxInput.ColumnSlice(sz, nbr_wrd); Matrix grd_to_cls_prob = m_clsLogSoftmax.FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); switch (inputIndex){ case 1: /// gradient to input - grd_t = Inputs(1)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + grd_t = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); ComputeInputPartialRight(input_weight_t, grd_t, grd_to_soft_max_input); break; case 2: @@ -906,7 +906,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputeInputPartialLeft(obs, grd_to_wgt_t, grd_to_soft_max_input); break; case 3: - grd_t = Inputs(3)->DataSlice(GRADIENT, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + grd_t = Inputs(3)->GradientSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); grd_t.SetValue(m_clsSoftmax.FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout)); ComputeCEPartialToSoftmaxInputs(grd_t, GradientValues(), c_t); break; @@ -947,7 +947,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { FrameRange frameRange(t, 1); /// compute prb - 1 and prb - Matrix lbl_t = Inputs(0)->DataSlice(VALUE, frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + Matrix lbl_t = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); size_t y_t = (size_t)lbl_t(0, 0); size_t lft_bnd = (size_t)lbl_t(2, 0); size_t rgt_bnd = (size_t)lbl_t(3, 0); diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h index 89548709374a..e5a57c058552 100644 --- a/Math/Math/Matrix.h +++ b/Math/Math/Matrix.h @@ -601,7 +601,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // FrameRange frameRange(t, 1); // using a different #sequences. Solve by treating all frames as one sequence (in FrameRange) // - ReshapeNode: - // Matrix sliceOutputGrad = GradientValues().FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep), m_pMBLayout); + // Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep), m_pMBLayout); // using a different #sequences. Find out what this really means.
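// To make the slicing arithmetic above concrete: a minimal sketch (member names as used in the
// surrounding patches; the layout assumption is noted inline) of how a FrameRange selects columns
// of a minibatch matrix. With S parallel sequences stored frame-major, time step t is assumed to
// occupy columns [t*S, t*S + S):
//
//     size_t S = GetNumParallelSequences();
//     if (frameRange.IsAllFrames())
//         return data.ColumnSlice(0, data.GetNumCols());       // whole minibatch
//     else
//         return data.ColumnSlice(frameRange.t() * S, S);      // one time step, all sequences
//
// This is also why the FrameRange::Check() calls marked "TODO: delete this" pass
// (frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()): they restate the same
// start column and width, apparently so the old and new slicing can be cross-checked while the
// refactoring is in flight.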
struct FrameRange { From 025174b9a132a852de51294395d077959251f1c4 Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Mon, 21 Sep 2015 13:44:15 -0700 Subject: [PATCH 27/44] Fixed some linker errors in the CNTKMathTest unit test project --- Math/Math/GPUSparseMatrix.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Math/Math/GPUSparseMatrix.cu b/Math/Math/GPUSparseMatrix.cu index c53ade5dd445..5a2e5da61957 100644 --- a/Math/Math/GPUSparseMatrix.cu +++ b/Math/Math/GPUSparseMatrix.cu @@ -2401,8 +2401,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { #pragma endregion Helper Functions - template class GPUSparseMatrix; - template class GPUSparseMatrix; + template class MATH_API GPUSparseMatrix; + template class MATH_API GPUSparseMatrix; // We use Matrix as the backing store for QuantizedMatrix // Let's explicitly instantiate the methods we need for that purpose From dc92c384cd6e6e637f5a24d11b408e34dcf5cb10 Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Mon, 21 Sep 2015 13:59:05 -0700 Subject: [PATCH 28/44] Include CNTKMathTest in the VS build to prevent build issues in the project from getting in undetected --- CNTK.sln | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/CNTK.sln b/CNTK.sln index b68c8721c7ff..956df91c529a 100644 --- a/CNTK.sln +++ b/CNTK.sln @@ -1,7 +1,7 @@  Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 2013 -VisualStudioVersion = 12.0.21005.1 +VisualStudioVersion = 12.0.31101.0 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKMathDll", "Math\Math\Math.vcxproj", "{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}" ProjectSection(ProjectDependencies) = postProject @@ -351,7 +351,9 @@ Global {E6F26F9A-FF64-4F0A-B749-CD309EE357EE}.Release|x64.ActiveCfg = Release|x64 {E6F26F9A-FF64-4F0A-B749-CD309EE357EE}.Release|x64.Build.0 = Release|x64 {6CEE834A-8104-46A8-8902-64C81BD7928F}.Debug|x64.ActiveCfg = Debug|x64 + {6CEE834A-8104-46A8-8902-64C81BD7928F}.Debug|x64.Build.0 = Debug|x64 {6CEE834A-8104-46A8-8902-64C81BD7928F}.Release|x64.ActiveCfg = Release|x64 + {6CEE834A-8104-46A8-8902-64C81BD7928F}.Release|x64.Build.0 = Release|x64 {33D2FD22-DEF2-4507-A58A-368F641AEBE5}.Debug|x64.ActiveCfg = Debug|x64 {33D2FD22-DEF2-4507-A58A-368F641AEBE5}.Debug|x64.Build.0 = Debug|x64 {33D2FD22-DEF2-4507-A58A-368F641AEBE5}.Release|x64.ActiveCfg = Release|x64 @@ -377,6 +379,7 @@ Global {0F30EBCF-09F3-4EED-BF54-4214BCE53FEC}.Debug|x64.ActiveCfg = Debug|x64 {0F30EBCF-09F3-4EED-BF54-4214BCE53FEC}.Debug|x64.Build.0 = Debug|x64 {0F30EBCF-09F3-4EED-BF54-4214BCE53FEC}.Release|x64.ActiveCfg = Release|x64 + {0F30EBCF-09F3-4EED-BF54-4214BCE53FEC}.Release|x64.Build.0 = Release|x64 {B3DD765E-694E-4494-BAD7-37BBF2942517}.Debug|x64.ActiveCfg = Debug|x64 {B3DD765E-694E-4494-BAD7-37BBF2942517}.Debug|x64.Build.0 = Debug|x64 {B3DD765E-694E-4494-BAD7-37BBF2942517}.Release|x64.ActiveCfg = Release|x64 @@ -416,38 +419,38 @@ Global HideSolutionNode = FALSE EndGlobalSection GlobalSection(NestedProjects) = preSolution - {E6F26F9A-FF64-4F0A-B749-CD309EE357EE} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} - {482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} - {B3DD765E-694E-4494-BAD7-37BBF2942517} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} - {928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} - {DE3C54E5-D7D0-47AF-A783-DFDCE59E7937} =
{DD043083-71A4-409A-AA91-F9C548DCF7EC} + {E6F26F9A-FF64-4F0A-B749-CD309EE357EE} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} {6CEE834A-8104-46A8-8902-64C81BD7928F} = {D45DF403-6781-444E-B654-A96868C5BE68} + {33D2FD22-DEF2-4507-A58A-368F641AEBE5} = {33EBFE78-A1A8-4961-8938-92A271941F94} {668BEED5-AC07-4F35-B3AE-EE65A7F9C976} = {D45DF403-6781-444E-B654-A96868C5BE68} - {0F30EBCF-09F3-4EED-BF54-4214BCE53FEC} = {D45DF403-6781-444E-B654-A96868C5BE68} - {DBB3C106-B0B4-4059-8477-C89528CEC1B0} = {D45DF403-6781-444E-B654-A96868C5BE68} - {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8} = {D45DF403-6781-444E-B654-A96868C5BE68} - {7C4E77C9-6B17-4B02-82C1-DB62EEE2635B} = {D45DF403-6781-444E-B654-A96868C5BE68} - {5E666C53-2D82-49C9-9127-3FDDC321C741} = {D45DF403-6781-444E-B654-A96868C5BE68} {E6646FFE-3588-4276-8A15-8D65C22711C1} = {33EBFE78-A1A8-4961-8938-92A271941F94} {1D5787D4-52E4-45DB-951B-82F220EE0C6A} = {33EBFE78-A1A8-4961-8938-92A271941F94} {62836DC1-DF77-4B98-BF2D-45C943B7DDC6} = {33EBFE78-A1A8-4961-8938-92A271941F94} - {33D2FD22-DEF2-4507-A58A-368F641AEBE5} = {33EBFE78-A1A8-4961-8938-92A271941F94} + {482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {0F30EBCF-09F3-4EED-BF54-4214BCE53FEC} = {D45DF403-6781-444E-B654-A96868C5BE68} + {B3DD765E-694E-4494-BAD7-37BBF2942517} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} {9A2F2441-5972-4EA8-9215-4119FCE0FB68} = {33EBFE78-A1A8-4961-8938-92A271941F94} {014DA766-B37B-4581-BC26-963EA5507931} = {33EBFE78-A1A8-4961-8938-92A271941F94} {D667AF32-028A-4A5D-BE19-F46776F0F6B2} = {33EBFE78-A1A8-4961-8938-92A271941F94} - {CE429AA2-3778-4619-8FD1-49BA3B81197B} = {33EBFE78-A1A8-4961-8938-92A271941F94} - {065AF55D-AF02-448B-BFCD-52619FDA4BD0} = {39E42C4B-A078-4CA4-9D92-B883D8129601} {3ED0465D-23E7-4855-9694-F788717B6533} = {39E42C4B-A078-4CA4-9D92-B883D8129601} + {065AF55D-AF02-448B-BFCD-52619FDA4BD0} = {39E42C4B-A078-4CA4-9D92-B883D8129601} {98D2C32B-0C1F-4E19-A626-65F7BA4600CF} = {065AF55D-AF02-448B-BFCD-52619FDA4BD0} {EA67F51F-1FE8-462D-9F3E-01161685AD59} = {065AF55D-AF02-448B-BFCD-52619FDA4BD0} {DE1A06BA-EC5C-4E0D-BCA8-3EA555310C58} = {065AF55D-AF02-448B-BFCD-52619FDA4BD0} {63024704-A2D7-497E-AD4B-5C10C6AA1374} = {065AF55D-AF02-448B-BFCD-52619FDA4BD0} {F9BEB27E-8AF5-464E-8D45-0000D5AFA2D3} = {EA67F51F-1FE8-462D-9F3E-01161685AD59} {889C1CCF-92B3-450B-B00D-FC9A9D5BE464} = {EA67F51F-1FE8-462D-9F3E-01161685AD59} + {DBB3C106-B0B4-4059-8477-C89528CEC1B0} = {D45DF403-6781-444E-B654-A96868C5BE68} + {CE429AA2-3778-4619-8FD1-49BA3B81197B} = {33EBFE78-A1A8-4961-8938-92A271941F94} + {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8} = {D45DF403-6781-444E-B654-A96868C5BE68} {4BBF2950-3DBD-469A-AD57-6CACBEBAF541} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8} {5F733BBA-FE83-4668-8F83-8B0E78A36619} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8} {19EE975B-232D-49F0-94C7-6F1C6424FB53} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8} + {7C4E77C9-6B17-4B02-82C1-DB62EEE2635B} = {D45DF403-6781-444E-B654-A96868C5BE68} + {928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {DE3C54E5-D7D0-47AF-A783-DFDCE59E7937} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {5E666C53-2D82-49C9-9127-3FDDC321C741} = {D45DF403-6781-444E-B654-A96868C5BE68} {6D1353D6-F196-466F-B886-F16D48759B20} = {5E666C53-2D82-49C9-9127-3FDDC321C741} {B6725C9F-A6D2-4269-9B74-7888A90F7884} = {5E666C53-2D82-49C9-9127-3FDDC321C741} {B27DD434-EECD-4EE0-A03B-1150EB87258E} = {B6725C9F-A6D2-4269-9B74-7888A90F7884} From 207bfec369da60e23a72466e070f55c5ad527948 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Mon, 21 Sep 2015 
14:05:02 -0700 Subject: [PATCH 29/44] changed X.FrameSlice(...) to DataSlice(X, ...); discovered that many LinearAlgebraNodes and NonlinearityNodes accessed m_functionValues instead of going through the virtual FunctionValues(); I hope that was a bug, and that changing it (to DataSlice(), which calls FunctionValues()) did not break anything; moved pMBLayout in DataSlice calls to be an argument to FrameRange::Check() instead --- .../CompositeComputationNodes.h | 12 +- .../ComputationNode.h | 21 +- .../ConvolutionalNodes.h | 22 +- .../InputAndParamNodes.h | 19 +- .../LinearAlgebraNodes.h | 216 +++++++++--------- .../NonlinearityNodes.h | 120 +++++----- .../RecurrentNodes.h | 42 ++-- .../TrainingCriterionNodes.h | 14 +- Math/Math/Matrix.h | 6 +- 9 files changed, 234 insertions(+), 238 deletions(-) diff --git a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h index 667808ed19aa..077daa841cbe 100644 --- a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h @@ -539,8 +539,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { //only feature (input0) and output needs to be sliced - Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues()); } @@ -690,8 +690,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { //only feature (input0) and output needs to be sliced - Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues()); } @@ -840,9 +840,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { //FunctionValues().Resize(m_memory.GetNumRows(), GetNumParallelSequences()); FunctionValues().Resize(m_memory.GetNumRows(), frameRange.NumCols()); // extra space for one time step if (frameRange.t() == 0) // for first frame, check that we got all in memory --TODO: is
this comment correct? How about going backwards? - assert(FunctionValues().FrameSlice(FrameRange(0, GetNumParallelSequences()), m_pMBLayout).FrobeniusNorm() == m_memory.FrameSlice(FrameRange(0, GetNumParallelSequences()), m_pMBLayout).FrobeniusNorm()); + assert(ValueSlice(FrameRange(0, GetNumParallelSequences())).FrobeniusNorm() == DataSlice(m_memory, FrameRange(0, GetNumParallelSequences())).FrobeniusNorm()); //assert(FunctionValues().ColumnSlice(0, GetNumParallelSequences()), m_pMBLayout).FrobeniusNorm() == m_memory.ColumnSlice(0, GetNumParallelSequences()), m_pMBLayout).FrobeniusNorm()); - FunctionValues().SetValue(m_memory.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout)); + FunctionValues().SetValue(DataSlice(m_memory, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout))); assert(FunctionValues().GetNumCols() == GetNumParallelSequences()); } diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index 83b5d75f9062..71e06618c46e 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -986,7 +986,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_children[childIndex] = node; } - //making them virtual so that nodes that only copy values from it's children (e.g., dropout) can be efficient in evaluation + // these are overridden by DropoutNode, ReshapeNode, and RowRepeatNode to optimize for the trivial case that those don't do anything + // TODO: lots of nodes read out m_functionValues directly--was that a bug or intentional? They have now been changed to ValueSlice(), i.e. would pick it up virtual const Matrix& FunctionValues() const { return m_functionValues; } virtual Matrix& FunctionValues() { return m_functionValues; } @@ -998,8 +999,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // TODO: remove FrameRange::samplesInRecurrentStep from FrameRange, as it belongs into pMBLayout. Hence this function that binds both together. // Note: This is not used anywhere yet, only a sketch how we may further abstract timing. Matrix DataSlice(Matrix & data, - const FrameRange & frameRange/*select frame or entire batch*/, - const MBLayoutPtr &) // DELETE THIS after refactoring; it's a dummy left-over) + const FrameRange & frameRange/*select frame or entire batch*/) { auto sequence = SIZE_MAX; if (frameRange.IsAllFrames()) @@ -1023,21 +1023,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { } enum ValueOrGradient { VALUE, GRADIENT }; Matrix DataSlice(ValueOrGradient valueOrGradient/*as it says*/, - const FrameRange & frameRange/*select frame or entire batch*/, - const MBLayoutPtr &) // DELETE THIS after refactoring; it's a dummy left-over) + const FrameRange & frameRange/*select frame or entire batch*/) { Matrix & data = (valueOrGradient == VALUE) ? 
FunctionValues() : GradientValues(); - return DataSlice(data, frameRange, m_pMBLayout); + return DataSlice(data, frameRange); } - Matrix ValueSlice(const FrameRange & frameRange/*select frame or entire batch*/, - const MBLayoutPtr &) // DELETE THIS after refactoring; it's a dummy left-over) + Matrix ValueSlice(const FrameRange & frameRange/*select frame or entire batch*/) { - return DataSlice(FunctionValues(), frameRange, m_pMBLayout); + return DataSlice(FunctionValues(), frameRange); } - Matrix GradientSlice(const FrameRange & frameRange/*select frame or entire batch*/, - const MBLayoutPtr &) // DELETE THIS after refactoring; it's a dummy left-over) + Matrix GradientSlice(const FrameRange & frameRange/*select frame or entire batch*/) { - return DataSlice(GradientValues(), frameRange, m_pMBLayout); + return DataSlice(GradientValues(), frameRange); } // this is the entry point from Network; while it will call virtual ComputeInputPartial() into the actual node implementation diff --git a/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h b/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h index c42109f7f062..2da3ba29cfbf 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/ConvolutionalNodes.h @@ -111,14 +111,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) InvalidArgument("Convolution operation only takes two inputs."); - Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); if (inputIndex == 0) //derivative with regard to the weight matrix ComputeInputPartialOverWeight(sliceOutputGrad, Inputs(0)->GradientValues(), Inputs(0)->FunctionValues(), sliceInput1Value, m_tempMatrix, !frameRange.IsAllFrames()); else // derivative with regard to the input feature { - Matrix sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); ComputeInputPartialOverInputFeature(sliceOutputGrad, sliceInput1Grad, Inputs(0)->FunctionValues(), sliceInput1Value, m_tempMatrix); } } @@ -215,8 +215,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * 
GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value, m_tempMatrix); } @@ -433,11 +433,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 0) InvalidArgument("MaxPooling operation only takes one inputs."); - Matrix sliceInput0Grad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Grad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); - Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); ComputeInputPartialV(sliceOutputGrad, sliceInput0Grad, sliceInput0Value, sliceOutputValue); } @@ -447,8 +447,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); EvaluateThisNodeV(sliceOutputValue, sliceInput0Value); } diff --git a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h index 6e54fdc75220..aa29b045369a 100644 --- a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h @@ -346,17 +346,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex > 1) InvalidArgument("LookupTable operation only takes two inputs."); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); if (inputIndex == 0) //left 
derivative { - Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad); } else //right derivative { - Matrix sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad); } @@ -402,8 +401,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value); } @@ -573,8 +572,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { assert(m_functionValues.GetNumRows() == GradientValues().GetNumRows()); // original used m_functionValues.GetNumRows() for loop dimension assert(m_pMBLayout); - Matrix mTmp = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix::ScaleAndAdd(1.0, GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout), mTmp); + Matrix mTmp = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); + Matrix::ScaleAndAdd(1.0, GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)), mTmp); } virtual void EvaluateThisNode() @@ -584,8 +583,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix mTmp = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - mTmp.SetValue(Inputs(0)->ValueSlice(frameRange/*TODO: delete 
this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout)); + Matrix mTmp = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); + mTmp.SetValue(Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout))); } virtual void /*ComputationNodeBase::*/Validate() diff --git a/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h b/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h index 322ecce8f238..a91ed0d81ff6 100644 --- a/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h @@ -53,8 +53,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("Negate operation only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad); } @@ -71,8 +71,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); EvaluateThisNodeS(sliceOutputValue, sliceInputValue); } @@ -138,8 +138,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("SumElements only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad); } @@ -156,8 +156,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { 
virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); EvaluateThisNodeS(sliceOutputValue, sliceInputValue); } @@ -233,8 +233,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("SumColumnElements only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad); } @@ -251,8 +251,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); EvaluateThisNodeS(sliceOutputValue, sliceInputValue); } @@ -370,8 +370,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (inputIndex != 0) InvalidArgument("RowSlice only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_startIndex, m_numRows); } @@ 
-388,8 +388,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
         {
-            Matrix<ElemType> sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             EvaluateThisNodeS(sliceOutputValue, sliceInputValue, m_startIndex, m_numRows);
         }
@@ -486,8 +486,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (inputIndex >= ChildrenSize())
                 InvalidArgument("RowStack-ComputeInputPartial: inputIndex out of range.");
 
-            Matrix<ElemType> sliceInputGrad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInputGrad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_startRowIndeces[inputIndex], m_startRowIndeces[inputIndex+1] - m_startRowIndeces[inputIndex]);
         }
@@ -504,7 +504,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
        {
-            Matrix<ElemType> sliceFunctionValues = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceFunctionValues = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             EvaluateThisNodeS(sliceFunctionValues, m_inputMatrices, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
         }
@@ -623,15 +623,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             //left Node must be a scalar
             if (inputIndex == 0) //left derivative
             {
-                Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-                Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+                Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                 ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad);
             }
             else
             {
-                Matrix<ElemType> sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-                Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+                Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                 ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad);
             }
@@ -654,8 +654,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
         {
-            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value);
         }
@@ -742,15 +742,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
             if (inputIndex == 0) //left derivative
             {
-                Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-                Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+                Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                 ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad);
             }
             else //right derivative
             {
-                Matrix<ElemType> sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-                Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+                Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                 ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad);
             }
@@ -801,8 +801,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             size_t rows0 = Inputs(0)->FunctionValues().GetNumRows(), cols1 = Inputs(1)->FunctionValues().GetNumCols();
             FunctionValues().Resize(rows0, cols1);
 
-            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value);
         }
@@ -915,15 +915,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
             if (inputIndex == 0) //left derivative
             {
-                Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-                Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+                Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                 ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad);
             }
             else //right derivative
             {
-                Matrix<ElemType> sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-                Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+                Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                 ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad);
             }
@@ -969,8 +969,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
        {
-            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value);
         }
@@ -1073,10 +1073,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (inputIndex > 1)
                 InvalidArgument("ElementTimes operation only takes two inputs.");
 
-            Matrix<ElemType> sliceInput0Grad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput0Grad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
-            Matrix<ElemType> sliceInput1Value = Inputs(1-inputIndex)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput1Value = Inputs(1-inputIndex)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             ComputeInputPartialS(sliceInput1Value, sliceInput0Grad, sliceOutputGrad);
         }
@@ -1100,9 +1100,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
        {
-            Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
         }
@@ -1202,10 +1202,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (inputIndex > 1)
                 InvalidArgument("RowElementTimes operation only takes two inputs.");
 
-            Matrix<ElemType> sliceInput0Grad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput0Grad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
-            Matrix<ElemType> sliceInput1Value = Inputs(1 - inputIndex)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput1Value = Inputs(1 - inputIndex)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             if (inputIndex == 0)
             {
@@ -1252,9 +1252,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
        {
-            Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
         }
@@ -1353,17 +1353,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (inputIndex > 1)
                 InvalidArgument("ColumnElementTimes operation only takes two inputs.");
 
-            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             if (inputIndex == 0)
             {
-                Matrix<ElemType> sliceInput0Grad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceInput0Grad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                 ComputeInputPartialLeftS(Inputs(1)->FunctionValues(), sliceInput0Grad, sliceOutputGrad, m_tempMatrix);
             }
             else
             {
-                Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                 ComputeInputPartialRightS(sliceInput0Value, Inputs(1)->GradientValues(), sliceOutputGrad, m_tempMatrix);
             }
         }
@@ -1403,8 +1403,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
        {
-            Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues());
         }
@@ -1509,13 +1509,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             //only the one with more columns can be sliced, if both have same columns both are sliced
             size_t cols0 = Inputs(inputIndex)->FunctionValues().GetNumCols(), cols1=Inputs(1-inputIndex)->FunctionValues().GetNumCols();
 
-            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             if (cols0 >= cols1)
             {
-                Matrix<ElemType> sliceInput0Grad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-                Matrix<ElemType> sliceInput0Value = Inputs(inputIndex)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceInput0Grad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+                Matrix<ElemType> sliceInput0Value = Inputs(inputIndex)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                 ComputeInputPartialS(sliceOutputValue, sliceOutputGrad, sliceInput0Value, sliceInput0Grad);
             }
@@ -1584,25 +1584,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         {
             size_t cols0 = Inputs(0)->FunctionValues().GetNumCols(), cols1=Inputs(1)->FunctionValues().GetNumCols();
 
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             //only the one with more columns can be sliced, if both have same columns both are sliced
             if (cols0 == cols1)
             {
-                Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-                Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+                Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                 EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
             }
             else if (cols0 > cols1)
            {
-                Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                 EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues());
             }
             else //cols0 < cols1)
             {
-                Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                 EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value);
             }
@@ -1780,11 +1780,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             //only the one with more columns can be sliced, if both have same columns both are sliced
             size_t cols0 = Inputs(inputIndex)->FunctionValues().GetNumCols(), cols1=Inputs(1-inputIndex)->FunctionValues().GetNumCols();
 
-            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
-            Matrix<ElemType> sliceInput0Grad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceInput0Value = Inputs(inputIndex)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput0Grad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceInput0Value = Inputs(inputIndex)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             Matrix<ElemType> ones = Matrix<ElemType>();
@@ -1890,25 +1890,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        {
             size_t cols0 = Inputs(0)->FunctionValues().GetNumCols(), cols1=Inputs(1)->FunctionValues().GetNumCols();
 
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             //only the one with more columns can be sliced, if both have same columns both are sliced
             if (cols0 == cols1)
             {
-                Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-                Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+                Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                 EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
             }
             else if (cols0 > cols1)
             {
-                Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                 EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, Inputs(1)->FunctionValues());
             }
             else //cols0 < cols1)
             {
-                Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                 EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value);
             }
@@ -2048,16 +2048,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 InvalidArgument("DiagTimes operation only takes two inputs.");
 
             //left parameter (diag matix cannot be sliced)
-            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             if (inputIndex == 0) //left derivative
             {
-                Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
                 ComputeInputPartialLeft(m_innerproduct, sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad);
             }
             else //right derivative
             {
-                Matrix<ElemType> sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
                 ComputeInputPartialRight(m_rightGradient, Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad);
             }
         }
@@ -2083,8 +2083,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
        {
-            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value);
         }
@@ -2205,11 +2205,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (inputIndex > 1)
                 InvalidArgument("CosDistance operation only takes two inputs.");
 
-            Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceInputGrad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputGrad = this->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceInputGrad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputGrad = this->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             if (inputIndex == 0) //left derivative
             {
@@ -2280,9 +2280,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
        {
-            Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             EvaluateThisNodeS(m_invNorm0, m_invNorm1, sliceOutputValue, sliceInput0Value, sliceInput1Value);
         }
@@ -2426,19 +2426,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (inputIndex > 1)
                 InvalidArgument("KhatriRaoProduct operation only takes two inputs.");
 
-            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             if (inputIndex == 0) //left derivative
             {
-                Matrix<ElemType> sliceInput0Grad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-                Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceInput0Grad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+                Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                 ComputeInputPartialLeft(sliceInput1Value, sliceInput0Grad, sliceOutputGrad);
             }
             else //right derivative
             {
-                Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-                Matrix<ElemType> sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+                Matrix<ElemType> sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                 ComputeInputPartialRight(sliceInput0Value, sliceInput1Grad, sliceOutputGrad);
             }
@@ -2461,9 +2461,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
        {
-            Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             EvaluateThisNodeS(sliceOutputValue, sliceInput0Value, sliceInput1Value);
         }
@@ -2564,11 +2564,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (inputIndex > 1)
                 InvalidArgument("CosDistanceWithNegativeSamples operation only takes grdients on the first two inputs.");
 
-            Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceInputGrad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceThisGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceInputGrad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceThisGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             ComputeInputPartialS(inputIndex, m_invNorm0, m_invNorm1, sliceOutputValue, m_temp, m_rightTerm, m_leftTerm, m_invNormSquare, sliceInput0Value, sliceInput1Value, Inputs(2)->FunctionValues(), Inputs(3)->FunctionValues(), sliceInputGrad, sliceThisGrad);
         }
@@ -2681,9 +2681,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
        {
-            Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             EvaluateThisNodeS(m_invNorm0, m_invNorm1, sliceOutputValue, sliceInput0Value, sliceInput1Value, Inputs(2)->FunctionValues(), Inputs(3)->FunctionValues(), m_leftTerm, m_rightTerm);
         }
@@ -2961,13 +2961,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (inputIndex > 1)
                 InvalidArgument("StrideTimes operation only takes two inputs.");
 
-            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             if (m_StrideDim == 1) /// column stride
             {
                 if (inputIndex == 0) //left derivative
                 {
-                    Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                    Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                     // TimesNode::ComputeInputPartialLeft(sliceInput1Value, Inputs(0)->GradientValues(), sliceOutputGrad);
@@ -2995,7 +2995,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 }
                 else //right derivative
                 {
-                    Matrix<ElemType> sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                    Matrix<ElemType> sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                     // TimesNode::ComputeInputPartialRight(Inputs(0)->FunctionValues(), sliceInput1Grad, sliceOutputGrad);
@@ -3022,7 +3022,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             {
                 if (inputIndex == 0) //left derivative
                 {
-                    Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                    Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                     for (size_t k = 0; k < GetNumParallelSequences(); k++)
                     {
@@ -3047,7 +3047,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 }
                 else //right derivative
                 {
-                    Matrix<ElemType> sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                    Matrix<ElemType> sliceInput1Grad = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                     for (size_t k = 0; k < GetNumParallelSequences(); k++)
                     {
@@ -3127,13 +3127,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        {
             size_t rows0 = Inputs(0)->FunctionValues().GetNumRows(), cols1 = Inputs(1)->FunctionValues().GetNumCols();
 
-            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput1Value = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
             UpdateStride(sliceInput1Value);
 
             if (m_StrideDim == 0)
                 FunctionValues().Resize(rows0 / GetNumParallelSequences(), cols1);
             if (m_StrideDim == 1)
                 FunctionValues().Resize(rows0, cols1);
 
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), sliceInput1Value, m_Stride, m_StrideDim);
         }
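The change applied throughout this file (and in the two files below) is mechanical: the MBLayout pointer moves from being a trailing argument of the slice helpers into FrameRange::Check(), so that the checked FrameRange is validated against the layout itself, and output slices go through the node's own ValueSlice()/DataSlice() accessors instead of reaching into m_functionValues.FrameSlice() directly. A minimal before/after sketch, with t and s as shorthand for frameRange.t() and GetNumParallelSequences() (shorthand only, not names used in the tree):

    // before: layout passed alongside the already-checked FrameRange
    Matrix<ElemType> slice = Inputs(0)->ValueSlice(frameRange.Check(t * s, s), m_pMBLayout);
    // after: the layout takes part in the range check itself
    Matrix<ElemType> slice = Inputs(0)->ValueSlice(frameRange.Check(t * s, s, m_pMBLayout));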
diff --git a/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h b/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h
index efff274b6ff5..7a5a715f76c2 100644
--- a/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/NonlinearityNodes.h
@@ -63,11 +63,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             // We should also unify these two functions into one that decides 1 frame or all frames at runtime... through the slice-extractor function itself.
             // For now we could define ALL_SAMPLES e.g. as SIZE_MAX.
             // GetGradientSlice(), GetInputSlice() or something.
-            Matrix<ElemType> sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             // why GradientValues() but m_functionValues below and not FunctionValues()?
-            Matrix<ElemType> sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             ComputeInputPartialV(m_gradient, sliceInputValue, sliceInputGrad, sliceOutputGrad);
         }
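The TODO at the top of the hunk above suggests folding the one-frame and whole-minibatch paths into a single slice accessor chosen at runtime. A possible shape for that idea, assuming ALL_SAMPLES == SIZE_MAX as the comment proposes (a hypothetical sketch, not code in the tree at this point):

    static const size_t ALL_SAMPLES = SIZE_MAX;   // sentinel: "the whole minibatch"

    Matrix<ElemType> DataSlice(Matrix<ElemType> & data, const FrameRange & frameRange)
    {
        if (frameRange.t() == ALL_SAMPLES)        // no time index given: return all columns
            return data.ColumnSlice(0, data.GetNumCols());
        size_t s = GetNumParallelSequences();     // one frame = s columns, one per parallel sequence
        return data.ColumnSlice(frameRange.t() * s, s);
    }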
@@ -81,8 +81,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
        {
-            Matrix<ElemType> sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             EvaluateThisNodeV(sliceOutputValue, sliceInputValue);
         }
@@ -206,10 +206,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (inputIndex != 0)
                 InvalidArgument("Sigmoid only has one input.");
 
-            Matrix<ElemType> sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             ComputeInputPartialS(m_gradient, sliceInputGrad, sliceOutputGrad, sliceOutputValue);
         }
@@ -265,10 +265,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (inputIndex != 0)
                 InvalidArgument("Tanh only has one input.");
 
-            Matrix<ElemType> sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             ComputeInputPartialS(m_gradient, sliceInputGrad, sliceOutputGrad, sliceOutputValue);
         }
@@ -326,10 +326,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (inputIndex != 0)
                 InvalidArgument("Log only has one input.");
 
-            Matrix<ElemType> sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
-            Matrix<ElemType> sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             ComputeInputPartialS(m_gradient, sliceInputGrad, sliceInputValue, sliceOutputGrad);
         }
@@ -386,10 +386,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (inputIndex != 0)
                 InvalidArgument("Exp only has one input.");
 
-            Matrix<ElemType> sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
-            Matrix<ElemType> sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             ComputeInputPartialS(m_gradient, sliceInputGrad, sliceInputValue, sliceOutputGrad);
         }
@@ -445,10 +445,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (inputIndex != 0)
                 InvalidArgument("Cosine only has one input.");
 
-            Matrix<ElemType> sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
-            Matrix<ElemType> sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             ComputeInputPartialS(m_gradient, sliceInputGrad, sliceInputValue, sliceOutputGrad);
         }
@@ -508,10 +508,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (inputIndex != 0)
                 InvalidArgument("Softmax only has one input.");
 
-            Matrix<ElemType> sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             ComputeInputPartialS(m_gradient, m_diff, sliceInputGrad, sliceOutputGrad, sliceOutputValue);
         }
@@ -616,10 +616,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (inputIndex != 0)
                 InvalidArgument("Softmax only has one input.");
 
-            Matrix<ElemType> sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             ComputeInputPartialS(m_gradient, m_softmax, sliceInputGrad, sliceOutputGrad, sliceOutputValue);
         }
@@ -727,8 +727,8 @@ virtual const std::wstring OperationName() const { return TypeName(); }
             //get the right slice
             const size_t colsPrior = Inputs(0)->FunctionValues().GetNumCols();
 
-            Matrix<ElemType> sliceGradientValue = m_gradientValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> slicePosterior = m_posterior.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceGradientValue = DataSlice(m_gradientValues, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> slicePosterior = DataSlice(m_posterior, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             switch (inputIndex)
             {
@@ -738,40 +738,40 @@ virtual const std::wstring OperationName() const { return TypeName(); }
             case 0:
             {
                 if (colsPrior == 1)
                     ComputeInputPartialUnnormedPrior(Inputs(0)->GradientValues(), sliceGradientValue, m_prior, slicePosterior, m_temp);
                 else
                 {
-                    Matrix<ElemType> sliceUnnormedPriorGradient = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-                    Matrix<ElemType> slicePrior = m_prior.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                    Matrix<ElemType> sliceUnnormedPriorGradient = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+                    Matrix<ElemType> slicePrior = DataSlice(m_prior, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
                     ComputeInputPartialUnnormedPrior(sliceUnnormedPriorGradient, sliceGradientValue, slicePrior, slicePosterior, m_temp);
                 }
             }
             break;
             case 1:
             {
-                Matrix<ElemType> sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceNormedDeviationVectors = DataSlice(m_normedDeviationVectors, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
                 if (colsPrior == 1)
                     ComputeInputPartialMean(Inputs(1)->GradientValues(), sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp);
                 else
                 {
-                    Matrix<ElemType> sliceMeanGradient = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                    Matrix<ElemType> sliceMeanGradient = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
                     ComputeInputPartialMean(sliceMeanGradient, sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp);
                 }
             }
             break;
             case 2:
             {
-                Matrix<ElemType> sliceNormedDeviation = m_normedDeviation.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceNormedDeviation = DataSlice(m_normedDeviation, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
                 if (colsPrior == 1)
                     ComputeInputPartialLogStddev(Inputs(2)->GradientValues(), sliceGradientValue, sliceNormedDeviation, slicePosterior, m_temp);
                 else
                 {
-                    Matrix<ElemType> sliceLotStddevGradient = Inputs(2)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                    Matrix<ElemType> sliceLotStddevGradient = Inputs(2)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
                     ComputeInputPartialLogStddev(sliceLotStddevGradient, sliceGradientValue, sliceNormedDeviation, slicePosterior, m_temp);
                 }
            }
            break;
            case 3:
            {
-                Matrix<ElemType> sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-                Matrix<ElemType> sliceFeatureGradient = Inputs(3)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceNormedDeviationVectors = DataSlice(m_normedDeviationVectors, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+                Matrix<ElemType> sliceFeatureGradient = Inputs(3)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
                 ComputeInputPartialFeature(sliceFeatureGradient, sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp);
             }
             break;
@@ -888,11 +888,11 @@ virtual const std::wstring OperationName() const { return TypeName(); }
             size_t numSamples = Inputs(3)->FunctionValues().GetNumCols();
 
             //get the right slice
-            Matrix<ElemType> sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceFeature = Inputs(3)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceNormedDeviation = m_normedDeviation.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceNormedDeviationVectors = m_normedDeviationVectors.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> slicePosterior = m_posterior.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceFeature = Inputs(3)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceNormedDeviation = DataSlice(m_normedDeviation, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceNormedDeviationVectors = DataSlice(m_normedDeviationVectors, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> slicePosterior = DataSlice(m_posterior, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             if (colsPrior == 1)
             {
@@ -901,12 +901,12 @@ virtual const std::wstring OperationName() const { return TypeName(); }
             }
             else if (colsPrior == numSamples)
             {
-                Matrix<ElemType> sliceUnnormedPrior = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-                Matrix<ElemType> sliceMean = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-                Matrix<ElemType> sliceLogstddev = Inputs(2)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> sliceUnnormedPrior = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+                Matrix<ElemType> sliceMean = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+                Matrix<ElemType> sliceLogstddev = Inputs(2)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
-                Matrix<ElemType> slicePrior = m_prior.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-                Matrix<ElemType> sliceStddev = m_stddev.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                Matrix<ElemType> slicePrior = DataSlice(m_prior, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+                Matrix<ElemType> sliceStddev = DataSlice(m_stddev, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
                 EvaluateThisNodeS(sliceOutputValue, sliceUnnormedPrior, sliceMean, sliceLogstddev, sliceFeature,
                     slicePrior, sliceStddev, sliceNormedDeviationVectors, sliceNormedDeviation, slicePosterior, m_temp);
@@ -1113,13 +1113,13 @@ virtual const std::wstring OperationName() const { return TypeName(); }
             if (inputIndex > 0)
                 InvalidArgument("Dropout operation only takes one input.");
 
-            Matrix<ElemType> sliceInput0Grad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
-            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput0Grad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
+            Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             Matrix<ElemType> sliceMask = Matrix<ElemType>();
             if (m_dropoutRate > 0)
             {
-                sliceMask = m_maskOfDropout.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                sliceMask = DataSlice(m_maskOfDropout, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
             }
 
             ComputeInputPartialS(m_dropoutRate, sliceInput0Grad, sliceMask, sliceOutputGrad);
@@ -1143,7 +1143,7 @@ virtual const std::wstring OperationName() const { return TypeName(); }
         }
         virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange)
        {
-            Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInput0Value = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
             Matrix<ElemType> sliceOutputValue = Matrix<ElemType>();
             Matrix<ElemType> sliceMask = Matrix<ElemType>();
@@ -1151,10 +1151,10 @@ virtual const std::wstring OperationName() const { return TypeName(); }
             {
                 FunctionValues().Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols());
                 m_maskOfDropout.Resize(Inputs(0)->FunctionValues().GetNumRows(), Inputs(0)->FunctionValues().GetNumCols());
-                sliceMask = m_maskOfDropout.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+                sliceMask = DataSlice(m_maskOfDropout, frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
             }
 
-            sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
 
             EvaluateThisNodeS(m_dropoutRate, m_randomSeed, sliceOutputValue, sliceMask, sliceInput0Value);
         }
@@ -1405,9 +1405,9 @@ virtual const std::wstring OperationName() const { return TypeName(); }
             }
 
             size_t outputSamplesInRecurrentStep = GetNumParallelSequences() * rows / m_numRows;
-            Matrix<ElemType> sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout);
+            Matrix<ElemType> sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
             // BUGBUG: the following will fail since outputSamplesInRecurrentStep will not match m_pMBLayout. Need to find out what this means (currently layout is constant throughout the graph), and implement it correctly.
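             // E.g. (hypothetical numbers): with 2 parallel sequences and a 4-row input reshaped to
             // m_numRows = 8, outputSamplesInRecurrentStep = 2 * 4 / 8 = 1, while m_pMBLayout still
             // reports 2 parallel sequences, so a layout-checked slice of the output cannot line up.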
- Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep), m_pMBLayout); + Matrix sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep, m_pMBLayout)); EvaluateThisNodeS(sliceOutputValue, sliceInputValue, m_numRows); } @@ -1450,9 +1450,9 @@ virtual const std::wstring OperationName() const { return TypeName(); } size_t outputSamplesInRecurrentStep = GetNumParallelSequences() * rows / m_numRows; - Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); // BUGBUG: the following will fail since outputSamplesInRecurrentStep will not match m_pMBLayout. Need to find out what this means (currently layout is constant throughout the graph), and implement it correctly. - Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep), m_pMBLayout); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep, m_pMBLayout)); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_numRows); } @@ -1648,8 +1648,8 @@ virtual const std::wstring OperationName() const { return TypeName(); } virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) { - Matrix sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputValue = m_functionValues.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputValue = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceOutputValue = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); EvaluateThisNodeS(sliceOutputValue, sliceInputValue, m_numRepeat); } @@ -1675,8 +1675,8 @@ virtual const std::wstring OperationName() const { return TypeName(); } if (inputIndex != 0) InvalidArgument("RowRepeat only has one input."); - Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceInputGrad = Inputs(0)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout)); ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_numRepeat); } diff --git a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h 
b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h index e6f45eb57909..9cdf2f4accd4 100644 --- a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h @@ -225,7 +225,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { d = (int)functionValues.Mod((float)delayedIndex, (float)delayedActivation.GetNumCols()); // this can point to the past activity of the previous minibatch - Matrix out = ValueSlice(frameRange, m_pMBLayout); + Matrix out = ValueSlice(frameRange); Matrix inp((DEVICEID_TYPE)functionValues.GetDeviceId()); if (minibatchPackingFlags & SequenceStart_or_End) @@ -606,18 +606,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (int timeIdxInSeq = nT - GetNumParallelSequences(); timeIdxInSeq >= 0; timeIdxInSeq -= GetNumParallelSequences()) { FrameRange frameRange(timeIdxInSeq, GetNumParallelSequences()); - Matrix sliceObs = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutput = ValueSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); - Matrix sliceState = m_State.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); + Matrix sliceObs = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceOutput = ValueSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceState = DataSlice(m_State, frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences(), m_pMBLayout)); - Matrix sliceGi = m_Gi.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); - Matrix sliceGf = m_Gf.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); - Matrix sliceGo = m_Go.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); + Matrix sliceGi = DataSlice(m_Gi, frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceGf = DataSlice(m_Gf, frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceGo = DataSlice(m_Go, frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences(), m_pMBLayout)); - Matrix sliceTanhState = tanhState.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); - Matrix sliceTanhObs = tanhObs.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); + Matrix sliceTanhState = DataSlice(tanhState, frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceTanhObs = DataSlice(tanhObs, frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences(), m_pMBLayout)); - Matrix error = GradientSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences()), m_pMBLayout); + Matrix error = GradientSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences(), m_pMBLayout)); Matrix grdToObsSlice(this->m_deviceId); @@ -666,7 +666,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { grdToPrevState, m_tempMatrix ); - grdToObs.FrameSlice(frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, 
GetNumParallelSequences()), m_pMBLayout).SetValue(grdToObsSlice); + DataSlice(grdToObs, frameRange/*TODO: delete this:*/.Check(timeIdxInSeq, GetNumParallelSequences(), m_pMBLayout)).SetValue(grdToObsSlice); PrepareErrors(timeIdxInSeq, grdToPrevOutput, grdToPrevState, GetNumParallelSequences(), &m_pMBLayout->GetM()); } @@ -997,16 +997,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t timeIdxInSeq = 0; timeIdxInSeq < nT; timeIdxInSeq += GetNumParallelSequences()) { FrameRange frameRange(timeIdxInSeq, GetNumParallelSequences()); - Matrix sliceObs = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceOutput = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceState = m_State.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceObs = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceOutput = ValueSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceState = DataSlice(m_State, frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences(), m_pMBLayout)); - Matrix sliceGi = m_Gi.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceGf = m_Gf.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceGo = m_Go.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceGi = DataSlice(m_Gi, frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceGf = DataSlice(m_Gf, frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceGo = DataSlice(m_Go, frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences(), m_pMBLayout)); - Matrix sliceTanhState = tanhState.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); - Matrix sliceTanhInput = tanhObs.FrameSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences()), m_pMBLayout); + Matrix sliceTanhState = DataSlice(tanhState, frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences(), m_pMBLayout)); + Matrix sliceTanhInput = DataSlice(tanhObs, frameRange/*TODO: delete this:*/.Check(frameRange.t(), GetNumParallelSequences(), m_pMBLayout)); PrepareHistory(timeIdxInSeq, mSlicePrevOutput, mSlicePrevState, FunctionValues(), m_State, m_PastOutput, m_PastState, GetNumParallelSequences(), m_DefaultState, &m_pMBLayout->GetM()); @@ -1101,8 +1101,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { // this is in the minibatch FrameRange frameRange(timeIdxInSeq, nsamples); - Matrix::Multiply(output.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() - nsamples, nsamples), false, colSeg, false, newPrevOutput); - Matrix::Multiply(state.FrameSlice(frameRange/*TODO: delete the next two parameters*/, frameRange.t() - nsamples, nsamples), false, colSeg, false, newPrevState); + Matrix::Multiply(DataSlice(output, frameRange/*TODO: delete the next two parameters*/, frameRange.t() - nsamples, nsamples), false, colSeg, false, 
newPrevOutput); + Matrix::Multiply(DataSlice(state, frameRange/*TODO: delete the next two parameters*/, frameRange.t() - nsamples, nsamples), false, colSeg, false, newPrevState); } Base::SetToInitStateValueForResetSeg(sentenceBegin->ColumnSlice(utt_t, 1), nStream, initStateValue, newPrevState); diff --git a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h index e15d802c9250..ab04316ca667 100644 --- a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h @@ -881,7 +881,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { FrameRange frameRange(t, 1); // TODO: change to frameRange over a whole MB with a sequence index. BUGBUG: below code will break until this is fixed /// compute prb - 1 and prb - Matrix lbl_t = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + Matrix lbl_t = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(t, 1, m_pMBLayout)); size_t c_t = (size_t)lbl_t(1, 0); size_t lft_bnd = (size_t)lbl_t(2, 0); size_t rgt_bnd = (size_t)lbl_t(3, 0); @@ -890,14 +890,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { continue; Matrix input_weight_t = Inputs(2)->FunctionValues().ColumnSlice(lft_bnd, nbr_wrd); - Matrix obs = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + Matrix obs = Inputs(1)->ValueSlice(frameRange/*TODO: delete this:*/.Check(t, 1, m_pMBLayout)); Matrix grd_to_soft_max_input = m_grdToSoftMaxInput.ColumnSlice(sz, nbr_wrd); - Matrix grd_to_cls_prob = m_clsLogSoftmax.FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + Matrix grd_to_cls_prob = DataSlice(m_clsLogSoftmax, frameRange/*TODO: delete this:*/.Check(t, 1, m_pMBLayout)); switch (inputIndex){ case 1: /// gradient to input - grd_t = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + grd_t = Inputs(1)->GradientSlice(frameRange/*TODO: delete this:*/.Check(t, 1, m_pMBLayout)); ComputeInputPartialRight(input_weight_t, grd_t, grd_to_soft_max_input); break; case 2: @@ -906,8 +906,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputeInputPartialLeft(obs, grd_to_wgt_t, grd_to_soft_max_input); break; case 3: - grd_t = Inputs(3)->GradientSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); - grd_t.SetValue(m_clsSoftmax.FrameSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout)); + grd_t = Inputs(3)->GradientSlice(frameRange/*TODO: delete this:*/.Check(t, 1, m_pMBLayout)); + grd_t.SetValue(DataSlice(m_clsSoftmax, frameRange/*TODO: delete this:*/.Check(t, 1, m_pMBLayout))); ComputeCEPartialToSoftmaxInputs(grd_t, GradientValues(), c_t); break; default: @@ -947,7 +947,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { FrameRange frameRange(t, 1); /// compute prb - 1 and prb - Matrix lbl_t = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(t, 1), m_pMBLayout); + Matrix lbl_t = Inputs(0)->ValueSlice(frameRange/*TODO: delete this:*/.Check(t, 1, m_pMBLayout)); size_t y_t = (size_t)lbl_t(0, 0); size_t lft_bnd = (size_t)lbl_t(2, 0); size_t rgt_bnd = (size_t)lbl_t(3, 0); diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h index e5a57c058552..75c4798eb181 100644 --- a/Math/Math/Matrix.h +++ b/Math/Math/Matrix.h @@ -601,7 +601,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // FrameRange frameRange(t, 1); // using a different #sequences. 
Solve by treating all frames as one sequence (in FrameRange)
 //  - ReshapeNode:
-//      Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep), m_pMBLayout);
+//      Matrix sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * outputSamplesInRecurrentStep, outputSamplesInRecurrentStep, m_pMBLayout));
 //    using a different #sequences. Find out what this really means.
 struct FrameRange
 {
@@ -623,9 +623,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         size_t NumCols(const shared_ptr<MBLayout> & pMBLayout) const { EnsureNotAllFrames(); VerifyMBLayout(pMBLayout); return pMBLayout->GetNumParallelSequences(); }
         bool IsAllFrames() const { return samplesInRecurrentStep == SIZE_MAX; }    // if true then above functions may not be called; caller must use entire batch instead
-        const FrameRange & Check(size_t expectedStartColumn, size_t expectedNumCols) const
+        const FrameRange & Check(size_t expectedStartColumn, size_t expectedNumCols, const shared_ptr<MBLayout> & pMBLayout) const
         {
-            if (!IsAllFrames() && expectedStartColumn != StartColumn() || expectedNumCols != NumCols())
+            if (!IsAllFrames() && (samplesInRecurrentStep != pMBLayout->GetNumParallelSequences() || expectedStartColumn != StartColumn(pMBLayout) || expectedNumCols != NumCols(pMBLayout)))
                 LogicError("FrameSlice: FrameRange object gives different range than original explicit code. Logic is borked.");
             return *this;
         }

From 01a06b656b9d8797641be88a6a6fa62f9d646600 Mon Sep 17 00:00:00 2001
From: Alexey Kamenev
Date: Fri, 4 Sep 2015 10:57:12 -0700
Subject: [PATCH 30/44] Add FSAdaGrad

---
 MachineLearning/CNTKSGDLib/SGD.cpp | 5439 ++++++++++++++--------------
 Math/Math/GPUMatrix.cu             |   20 +
 Math/Math/GPUMatrix.h              |    2 +
 Math/Math/GPUMatrixCUDAKernels.cu  |   30 +
 Math/Math/Matrix.cpp               |   21 +
 Math/Math/Matrix.h                 |  277 +-
 6 files changed, 2935 insertions(+), 2854 deletions(-)

diff --git a/MachineLearning/CNTKSGDLib/SGD.cpp b/MachineLearning/CNTKSGDLib/SGD.cpp
index 0c1ddeaf9d52..6bcaa2b9bd61 100644
--- a/MachineLearning/CNTKSGDLib/SGD.cpp
+++ b/MachineLearning/CNTKSGDLib/SGD.cpp
@@ -1,2716 +1,2723 @@
-// SGD.cpp -- implements SGD with all bells and whistles, parallelization, randomization, etc.
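The headline of this commit is the FSAdaGrad updater (new kernels in GPUMatrixCUDAKernels.cu with entry points on GPUMatrix and Matrix). FSAdaGrad combines AdaGrad-style gradient scaling with momentum; the sketch below shows the general shape of such an update and is illustrative only -- it is not the kernel this patch adds, and gamma/eps are assumed hyperparameter names:

    #include <cmath>

    // Illustrative FSAdaGrad-style step for one scalar parameter.
    void FsAdagradStepSketch(double& w, double& smoothSqr, double& smoothStep, double g,
                             double learnRate, double momentum, double gamma, double eps)
    {
        smoothSqr  = gamma * smoothSqr + (1.0 - gamma) * g * g;      // running average of g^2
        smoothStep = momentum * smoothStep
                   + learnRate * g / std::sqrt(smoothSqr + eps);     // momentum on the scaled step
        w -= smoothStep;
    }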
-
-#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms  --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
-
-#include "Basics.h"
-#include "SGD.h"
-#include "AllReduceDistGradAggregator.h"
-
-#include
-
-namespace Microsoft { namespace MSR { namespace CNTK {
-
-    using namespace std;
-
-    template<class ElemType>
-    void DecimateMinibatch(std::map<std::wstring, MSR::CNTK::Matrix<ElemType>*>& mb, int numProcessor, int myID)
-    {
-        int rank = myID;
-        int procs = numProcessor;
-
-        size_t rv = 0;
-        if (procs > 1)
-        {
-            for (auto it = mb.begin(); it != mb.end(); ++it)
-            {
-                MSR::CNTK::Matrix<ElemType> &mat = *(it->second);
-                size_t nCols = mat.GetNumCols();
-                size_t col_start = (nCols * rank) / procs;
-                size_t col_end = (nCols * (rank + 1)) / procs;
-                if (col_end > nCols)
-                {
-                    // this shouldn't happen
-                    col_end = nCols;
-                }
-
-                if (col_end == col_start)
-                {
-                    MSR::CNTK::Matrix<ElemType> tmp(mat.GetNumRows(), 0, AUTOPLACEMATRIX, DENSE);
-                    mat.SetValue(tmp);
-                }
-                else
-                {
-                    MSR::CNTK::Matrix<ElemType> tmp = mat.ColumnSlice(col_start, col_end - col_start);
-                    mat.SetValue(tmp);
-                }
-
-                if (rv == 0)
-                {
-                    rv = mat.GetNumCols();
-                }
-                else
-                {
-                    if (rv != mat.GetNumCols())
-                    {
-                        throw std::logic_error("Uneven number of columns among inputs.");
-                    }
-                }
-            }
-        }
-    }
-
-    template<class ElemType>
-    size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<ElemType>*> &mb, /* (input) matrix to be decimated */
-                                          int rank, int numprocs,                                   /* (input) rank info */
-                                          size_t& nSlices,                                          /* (input/output): on input, # parallel sentences in total; on output, # parallel sentences on this node */
-                                          MBLayoutPtr pMBLayout,                                    // gets filled in
-                                          IDataReader<ElemType>* trainDataReader)                   /* (input) to have access to reader */
-    {
-        // For RNN, an input Matrix is organized in the following way:
-        //   | x_t^1  x_t^2 ... x_t^N |  .... | x_{t+T-1}^1 ... x_{t+T-1}^N |
-        //   |<----   block 1    ---->|  .... |<------  block T       ----->|
-        // N is the nSlice (input)
-        // The decimation here is to split each block across individual GPUs
-        // So after decimation,
-        //   | x_t^{st} ... x_t^{en-1}|  .... | x_{t+T-1}^{st} ... x_{t+T-1}^{en-1} |
-        // Each block now has nSlice/nProcs
-        //
-        // Correspondingly, the SentenceBoundary and PackingFlags will be revised
-        trainDataReader->CopyMBLayoutTo(pMBLayout);   // fill this
-
-        size_t rv = 0;
-        size_t nOrigParallelUtts = nSlices;
-        static bool warned = false;
-        if (numprocs > 1)
-        {
-            // decide new parallel utterances
-            size_t sent_start = 0;
-            size_t sent_end = 0;
-            if (nOrigParallelUtts % numprocs != 0)
-            {
-                if (!warned)
-                {
-                    /* warn about potential bandwidth waste */
-                    fprintf(stderr, "WARNING: %d GPUs are used in model averaging, but the number of parallel utterances is %d, which may degrade training speed.\n",
-                            (int)g_mpi->NumNodesInUse(), (int)nOrigParallelUtts);
-                    warned = true;
-                }
-                if (rank == numprocs - 1)
-                {
-                    nSlices = nOrigParallelUtts - (nOrigParallelUtts / numprocs + 1) * (numprocs - 1);
-                    sent_start = (nOrigParallelUtts / numprocs + 1) * (numprocs - 1);
-                    sent_end = nOrigParallelUtts;
-                }
-                else
-                {
-                    nSlices = nOrigParallelUtts / numprocs + 1;
-                    sent_start = nSlices * rank;
-                    sent_end = nSlices * (rank + 1);
-                    if (sent_end > nOrigParallelUtts) sent_end = nOrigParallelUtts;
-                }
-            }
-            else
-            {
-                nSlices = nOrigParallelUtts / numprocs;
-                sent_start = rank*nSlices;
-                sent_end = (rank + 1)*nSlices;
-                if (sent_end > nOrigParallelUtts) sent_end = nOrigParallelUtts;
-            }
-            // decimate data
-            for (auto it = mb.begin(); it != mb.end(); ++it)
-            {
-                MSR::CNTK::Matrix<ElemType> &mat = *(it->second);
-                size_t nCols = mat.GetNumCols();
-
-                if (nCols % nOrigParallelUtts != 0)
-                {
-                    // this should not happen for DNN, RNN with truncated BPTT, not sure about other special stuff ...
-                    RuntimeError("ERROR: minibatch size %d, but with %d parallel utterances\n", (int)nCols, (int)nOrigParallelUtts);
-                }
-                size_t nBlocks = nCols / nOrigParallelUtts;
-                // for RNN, nBlocks is the size of truncated BPTT
-                if (sent_end == sent_start)
-                {
-                    // should never happen, print debug info
-                    RuntimeError("ERROR: in DecimateMinibatch, col_st=col_en=%d, nCol=%d, nBlock=%d, nParaUtts=%d, nGPU=%d\n",
-                                 (int)sent_start, (int)nCols, (int)nBlocks, (int)nOrigParallelUtts, (int)numprocs);
-                }
-
-                MSR::CNTK::Matrix<ElemType> tmp(mat.GetNumRows(), nSlices*nBlocks, mat.GetPreferredDeviceId(), mat.GetMatrixType());
-
-                // do the column slice for each block
-                for (size_t iblock = 0; iblock < nBlocks; iblock++)
-                {
-                    tmp.SetColumnSlice(mat.ColumnSlice(nOrigParallelUtts*iblock + sent_start, nSlices),
-                                       iblock*nSlices, nSlices);
-                }
-                mat.SetValue(tmp);
-
-                // assert the cols are even among nodes
-                if (0 == rv)
-                {
-                    rv = mat.GetNumCols();
-                }
-                else
-                {
-                    if (rv != mat.GetNumCols())
-                        throw std::logic_error("Uneven number of columns among inputs.");
-                }
-            }
-            // revise sentence boundary and packing flags
-            // TODO: get rid of this explicit matrix, this can be done directly with MBLayout types.
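The rank partitioning above is easiest to check with concrete numbers. A hypothetical helper that mirrors the same arithmetic, including the uneven case where the last rank takes the remainder:

    #include <algorithm>
    #include <cstddef>
    #include <utility>

    // Returns [sent_start, sent_end) of parallel utterances owned by this rank.
    std::pair<size_t, size_t> UtteranceRangeSketch(size_t nUtts, size_t rank, size_t procs)
    {
        if (nUtts % procs == 0)                        // even split
            return { rank * (nUtts / procs), (rank + 1) * (nUtts / procs) };
        const size_t chunk = nUtts / procs + 1;        // every rank but the last
        if (rank == procs - 1)
            return { chunk * (procs - 1), nUtts };     // last rank takes the remainder
        return { chunk * rank, std::min(chunk * (rank + 1), nUtts) };
    }
    // e.g. nUtts = 10, procs = 4  ->  [0,3), [3,6), [6,9), [9,10)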
- size_t nMBSize = pMBLayout->GetSize(); - Matrix newBoundary(CPUDEVICE); - newBoundary.Resize(nSlices, nMBSize); - newBoundary.AssignRowSliceValuesOf(pMBLayout->GetM(), sent_start, nSlices); - fill(pMBLayout->GetV().begin(), pMBLayout->GetV().end(), MinibatchPackingFlags::None); - for (size_t nt = 0; nt < nMBSize; nt++) - { - for (size_t ns = 0; ns < nSlices; ns++) - { - if (newBoundary(ns, nt) == ((int)MinibatchPackingFlags::SequenceStart)) - pMBLayout->GetV()[nt] |= MinibatchPackingFlags::SequenceStart; - if (newBoundary(ns, nt) == ((int)MinibatchPackingFlags::SequenceEnd)) - pMBLayout->GetV()[nt] |= MinibatchPackingFlags::SequenceEnd; - } - } - } - - return rv; - } - - static AdaptationRegType ParseAdaptationRegType(wstring s) - { - msra::strfun::tolower_ascii(s); - if (s == L"" || s == L"none") - return AdaptationRegType::None; - else if (s == L"kl" || s == L"klreg") - return AdaptationRegType::KL; - else - throw std::invalid_argument("ParseAdaptationRegType: Invalid Adaptation Regularization Type. Valid values are (None | KL)"); - } - - static GradientsUpdateType ParseGradUpdateType(wstring s) - { - msra::strfun::tolower_ascii(s); - if (s == L"" || s == L"none" || s == L"normal" || s == L"simple") - return GradientsUpdateType::None; - else if (s == L"adagrad") - return GradientsUpdateType::AdaGrad; - else if (s == L"rmsprop") - return GradientsUpdateType::RmsProp; - else - throw std::invalid_argument("ParseGradUpdateType: Invalid Gradient Updating Type. Valid values are (None | AdaGrad | RmsProp )"); - } - - static ParallelizationMethod ParseParallelizationMethod(wstring s) - { - msra::strfun::tolower_ascii(s); - if ((s == L"") || (s == L"none")) - return ParallelizationMethod::None; - else if (s == L"dataparallelsgd") - return ParallelizationMethod::DataParallelSGD; - else if (s == L"modelaveragingsgd") - return ParallelizationMethod::ModelAveragingSGD; - else - throw std::invalid_argument("ParseParallelizationMethod: Invalid Parallelization Method. Valid values are (None | DataParallelSGD | ModelAveragingSGD)"); - } - - static LearningRateSearchAlgorithm ParseLearningRateSearchType(wstring s) - { - // TODO: why allow so many variants? - msra::strfun::tolower_ascii(s); - if (s == L"false" || s == L"none") - return LearningRateSearchAlgorithm::None; - else if (s == L"searchbeforeepoch" || s == L"beforeepoch" || s == L"before") - return LearningRateSearchAlgorithm::SearchBeforeEpoch; - else if (s == L"adjustafterepoch" || s == L"afterepoch" || s == L"after") - return LearningRateSearchAlgorithm::AdjustAfterEpoch; - else - throw std::invalid_argument("autoAdjustLR: Invalid learning rate search type. 
Valid values are (None | SearchBeforeEpoch | AdjustAfterEpoch)"); - } - -template - SGD::SGD(const ConfigParameters& configSGD) - { - ConfigArray learningRatesPerMBStr = configSGD("learningRatesPerMB", ""); - m_needToNormalizeLRByParallUtterance = false; - m_needToNormalizeMomentumByParallUtterance = false; - floatargvector learningRatesPerMB = learningRatesPerMBStr; - - ConfigArray learningRatesPerSampleStr = configSGD("learningRatesPerSample", ""); - floatargvector learningRatesPerSample = learningRatesPerSampleStr; - - std::string executionEngineValue = configSGD("executionEngine", "synchronous"); - - // AutoAdjust Parameters - ConfigParameters configAALR(configSGD("AutoAdjust", "")); - LearningRateSearchAlgorithm autoAdjustLRType = ParseLearningRateSearchType(configAALR("autoAdjustLR", "None")); - double reduceLearnRateIfImproveLessThan = configAALR("reduceLearnRateIfImproveLessThan", "0"); - bool continueReduce = (bool) configAALR("continueReduce", "false"); - size_t learnRateAdjustInterval = (size_t) configAALR("learnRateAdjustInterval", "1"); - double learnRateDecreaseFactor = configAALR("learnRateDecreaseFactor", "0.618"); - double increaseLearnRateIfImproveMoreThan = configAALR("increaseLearnRateIfImproveMoreThan", "1#INF"); - double learnRateIncreaseFactor = configAALR("learnRateIncreaseFactor", "1.382"); - - // AutoAdjust Auto Adjust Minibatch Parameters - bool autoAdjustMinibatch = (bool) configAALR("autoAdjustMinibatch", "false"); - size_t minibatchSizeTuningFrequency = configAALR("minibatchSizeTuningFrequency", "1"); - size_t minibatchSizeTuningMax = configAALR("minibatchSizeTuningMax", "1048576"); - size_t minibatchSearchCriterionErrorMargin = configAALR("minibatchSearchCriterionErrorMargin", "1"); - - // the number of minibatches used to search - // the learning rate. It’s typically set to 10-20% of - // the total minibatches in an epoch. - ConfigArray minibatch4LRSearch = configAALR("numMiniBatch4LRSearch", "500"); - intargvector numMiniBatch4LRSearch = minibatch4LRSearch; - - size_t numPrevLearnRates = configAALR("numPrevLearnRates", "5"); - size_t numBestSearchEpoch = configAALR("numBestSearchEpoch", "1"); - bool loadBestModel = configAALR("loadBestModel", "true"); - bool useCVSetControlLRIfCVExists = configAALR("UseCVSetControlLRIfCVExists", "true"); - bool useEvalCriterionControlLR = configAALR("UseEvalCriterionControlLR", "false"); - - - ConfigArray minibatchSize = configSGD("minibatchSize", "256"); - intargvector mbSize = minibatchSize; - - // the number of samples in each epoch (0 means, use all the samples in each epoch). - size_t epochSize = configSGD("epochSize", "0"); - - // the total number of epochs to run. 
- size_t maxEpochs = configSGD("maxEpochs"); - - ConfigArray momentumPerMBStr = configSGD("momentumPerMB", ""); - floatargvector momentumPerMB = momentumPerMBStr; - - ConfigArray momentumPerSampleStr = configSGD("momentumPerSample", ""); - floatargvector momentumPerSample = momentumPerSampleStr; - - wstring modelPath = configSGD("modelPath"); - wstring trainCriterionNodeName = configSGD("trainCriterionNodeName", ""); - wstring evalCriterionNodeName = configSGD("evalCriterionNodeName", ""); - - size_t maxTempMemSizeInSamplesForCNN = configSGD("maxTempMemSizeInSamplesForCNN", "0"); - - int traceLevel = configSGD("traceLevel", "0"); - size_t numMBsToShowResult = configSGD("numMBsToShowResult", "10"); - size_t numMBsToCUDAProfile = configSGD("numMBsToCUDAProfile", "0"); - - bool keepCheckPointFiles = configSGD("keepCheckPointFiles", "false"); - - bool gradientClippingWithTruncation = configSGD("gradientClippingWithTruncation", "true"); - double clippingThresholdPerSample = configSGD("clippingThresholdPerSample", "1#INF"); - - ConfigArray dropoutRatesStr = configSGD("dropoutRate", "0.0"); - floatargvector dropoutRates = dropoutRatesStr; - - GradientUpdateInfo gUpdateInfo; - GradientsUpdateType gradUpdateType = ParseGradUpdateType(configSGD("gradUpdateType", "None")); - double gaussianNoiseInjecStd = configSGD("gaussianNoiseInjectStd", "0"); - gUpdateInfo.mType = gradUpdateType; - gUpdateInfo.mGaussianNoiseInjectStd = (float) gaussianNoiseInjecStd; - - // extract RMSProp parameters from config, if they exist. Default to reasonable values. - RMSPropInfo rpi; - rpi.dec = (double) configSGD("rms_wgt_dec", "0.75"); - rpi.inc = (double) configSGD("rms_wgt_inc", "1.2"); - rpi.min = (double) configSGD("rms_wgt_min", "0.1"); - rpi.max = (double) configSGD("rms_wgt_max", "10.0"); - rpi.gamma = (double) configSGD("rms_gamma", "0.99"); - - bool needAveMultiplier = (bool) configSGD("normWithAveMultiplier", "true"); - double L2RegWeight = (double) configSGD("L2RegWeight", "0"); - double L1RegWeight = (double) configSGD("L1RegWeight", "0"); - - /// for backward support. 
future setup should use gradUpdateType=AdaGrad, instead of
-        /// useAdagrad=true
-        bool useAdagrad = configSGD("useAdagrad", "false");
-        if (useAdagrad)
-        {
-            gradUpdateType = GradientsUpdateType::AdaGrad;
-            gUpdateInfo.mType = gradUpdateType;
-        }
-
-        AdaptationRegType adaptationRegType = ParseAdaptationRegType(configSGD("adaptationRegType", "None"));
-        double adaptationRegWeight = configSGD("adaptationRegWeight", "0");
-
-        /// gradient check setup
-        bool doGradientCheck = configSGD("gradientcheck", "false");
-        double gradientCheckSigDigit = configSGD("sigFigs", "6");
-
-        if (doGradientCheck && sizeof(ElemType) != sizeof(double))
-            LogicError("Gradient check needs to use precision = double");
-        m_doUnitTest = configSGD("unittest", "false");
-
-        bool validateAfterModelReloading = configSGD("validateAfterModelReloading", "true");
-
-        bool UsingAllDataForPreComputedNode = configSGD("UseAllDataForPreComputedNode", "true");
-
-        // Parallel training
-        m_parallelizationMethod = ParallelizationMethod::None;
-        m_distGradAgg = nullptr;
-        m_gradHeader = nullptr;
-        m_numGradientBits = 32;
-        m_zeroThresholdFor1Bit = true;
-        m_enableDistributedMBReading = false;
-        m_parallelizationStartEpochNum = 0;
-        m_nFramesBetweenMASync = 40000; // default 40k frames
-
-        if ((g_mpi != nullptr) && configSGD.ExistsCurrent("ParallelTrain"))
-        {
-            ConfigParameters configParallelTrain(configSGD("ParallelTrain", ""));
-            m_parallelizationMethod = ParseParallelizationMethod(configParallelTrain("parallelizationMethod", "None"));
-            m_parallelizationStartEpochNum = configParallelTrain("parallelizationStartEpoch", "1");
-            m_parallelizationStartEpochNum -= 1; // Epoch numbers internally are 0 based
-            m_enableDistributedMBReading = configParallelTrain("distributedMBReading", "false");
-
-            if (configParallelTrain.ExistsCurrent("DataParallelSGD"))
-            {
-                ConfigParameters configDataParallelSGD(configParallelTrain("DataParallelSGD", ""));
-                const char* defaultGradientBitsStr = (sizeof(ElemType) == sizeof(float)) ? "32" : "64";
-                m_numGradientBits = configDataParallelSGD("gradientBits", defaultGradientBitsStr);
-                m_zeroThresholdFor1Bit = configDataParallelSGD("useZeroThresholdFor1BitQuantization", "true");
-                if ((m_numGradientBits < 1) || (m_numGradientBits > (8 * sizeof(ElemType))))
-                {
-                    throw std::invalid_argument("gradientBits must be in the range [1, 32] when using precision=float and in range [1, 64] when using precision=double!");
-                }
-            }
-
-            if (configParallelTrain.ExistsCurrent("ModelAveragingSGD"))
-            {
-                ConfigParameters configMASGD(configParallelTrain("ModelAveragingSGD", ""));
-                m_nFramesBetweenMASync = configMASGD("SyncFrequencyInFrames", "40000");
-                m_iMASyncStatsTrace = configMASGD("MAPerfStats", "0");
-            }
-
-        }
-
-        // TODO: the number of parameters of this function is waaay too little!
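One way to act on that TODO (a sketch only, with illustrative field names, not how this code is organized) is to gather the settings read above into a single options struct so that Init() takes one argument:

    // Sketch: bundle the SGD settings into one struct instead of passing
    // dozens of positional arguments to Init().
    struct SGDOptionsSketch
    {
        intargvector        mbSize;
        size_t              epochSize = 0;            // 0 = use all samples
        size_t              maxEpochs = 0;
        floatargvector      learningRatesPerSample;
        floatargvector      momentumPerSample;
        double              L1RegWeight = 0.0;
        double              L2RegWeight = 0.0;
        GradientsUpdateType gradUpdateType = GradientsUpdateType::None;
    };
    // void SGD<ElemType>::Init(const SGDOptionsSketch& options);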
- Init(learningRatesPerMB, - learningRatesPerSample, - mbSize, - epochSize, - maxEpochs, - modelPath, - momentumPerMB, - momentumPerSample, - gradientClippingWithTruncation, - clippingThresholdPerSample, - autoAdjustLRType, - increaseLearnRateIfImproveMoreThan, - learnRateIncreaseFactor, - reduceLearnRateIfImproveLessThan, - continueReduce, - learnRateDecreaseFactor, - dropoutRates, - loadBestModel, - numMiniBatch4LRSearch, - numPrevLearnRates, - numBestSearchEpoch, - traceLevel, - numMBsToShowResult, - numMBsToCUDAProfile, - maxTempMemSizeInSamplesForCNN, - gUpdateInfo, - keepCheckPointFiles, - adaptationRegType, - adaptationRegWeight, - trainCriterionNodeName, - evalCriterionNodeName, - doGradientCheck, - gradientCheckSigDigit, - validateAfterModelReloading, - rpi, - learnRateAdjustInterval, - UsingAllDataForPreComputedNode, - needAveMultiplier, - L2RegWeight, - L1RegWeight, - autoAdjustMinibatch, - minibatchSizeTuningFrequency, - minibatchSizeTuningMax, - useCVSetControlLRIfCVExists, - useEvalCriterionControlLR, - minibatchSearchCriterionErrorMargin); - } - - //autoLearnRateSearchType is applied only if the learning rate for the epoch is not specified in learningRatesPerMB and learningRatesPerSample - template - void SGD::Init(const floatargvector& learningRatesPerMB, - const floatargvector& learningRatesPerSample, - const intargvector& mbSize, - const size_t epochSize, - const size_t maxEpochs, - const wstring& modelPath, - const floatargvector& momentumPerMB, - const floatargvector& momentumPerSample, - const bool gradientClippingWithTruncation, - const double clippingThresholdPerSample, - const LearningRateSearchAlgorithm autoLearnRateSearchType, - const double increaseLearnRateIfImproveMoreThan, - const double learnRateIncreaseFactor, - const double reduceLearnRateIfImproveLessThan, - const bool continueReduce, - const double learnRateDecreaseFactor, - floatargvector dropoutRates, - const bool loadBestModel, - const intargvector& numMiniBatch4LRSearch, - const size_t numPrevLearnRates, - const size_t numBestSearchEpoch, - const int traceLevel, - const size_t numMBsToShowResult, - const size_t numMBsToCUDAProfile, - const size_t maxTempMemSizeInSamplesForCNN, - const GradientUpdateInfo gradUpdateType, - const bool keepCheckPointFiles, - const AdaptationRegType adaptationRegType, - const double adaptationRegWeight, - const wstring trainCriterionNodeName, - const wstring evalCriterionNodeName, - const bool doGradientCheck, - const double gradientCheckSigDigit, - const bool validateAfterModelReloading, - RMSPropInfo rpi, - size_t learnRateAdjustInterval, - const bool UsingAllDataForPreComputed, - const bool needAveMultiplier, - const double L2RegWeight, - const double L1RegWeight, - const bool autoAdjustMinibatch, - const size_t minibatchSizeTuningFrequency, - const size_t minibatchSizeTuningMax, - const bool useCVSetControlLRIfCVExists, - const bool useEvalCriterionControlLR, - const size_t minibatchSearchCriterionErrorMargin) - { - m_numPrevLearnRates = numPrevLearnRates; - m_prevChosenMinibatchSize = 0; - m_autoAdjustMinibatch = autoAdjustMinibatch; - m_minibatchSizeTuningMax = minibatchSizeTuningMax; - m_minibatchSizeTuningFrequency = minibatchSizeTuningFrequency; - m_minibatchSearchCriterionErrorMargin = minibatchSearchCriterionErrorMargin; - - m_mbSize = mbSize; - - // the number of samples in each epoch (0 means, use all the samples in each epoch). - m_epochSize = epochSize; - if (m_epochSize == 0) - { - m_epochSize = requestDataSize; - } - - // the total number of epochs to run. 
- m_maxEpochs = maxEpochs; - - m_gradientClippingWithTruncation = gradientClippingWithTruncation; - m_modelPath = modelPath; - m_autoLearnRateSearchType = autoLearnRateSearchType; - m_traceLevel = traceLevel; - m_loadBestModel = loadBestModel; - m_increaseLearnRateIfImproveMoreThan = increaseLearnRateIfImproveMoreThan; - m_learnRateIncreaseFactor = learnRateIncreaseFactor; - m_reduceLearnRateIfImproveLessThan = reduceLearnRateIfImproveLessThan; - m_continueReduce = continueReduce; - - //minimum interval is 1 epoch - m_learnRateAdjustInterval = max((size_t) 1, learnRateAdjustInterval); - - m_learnRateDecreaseFactor = learnRateDecreaseFactor; - m_clippingThresholdPerSample = abs(clippingThresholdPerSample); - m_numMiniBatch4LRSearch = numMiniBatch4LRSearch; - m_dropoutRates = dropoutRates; - m_numMBsToShowResult = int(numMBsToShowResult); - m_numMBsToCUDAProfile = int(numMBsToCUDAProfile); - m_numBestSearchEpoch = numBestSearchEpoch; - m_maxTempMemSizeInSamplesForCNN = maxTempMemSizeInSamplesForCNN; - m_gradType = gradUpdateType; - m_rpi = rpi; - m_keepCheckPointFiles = keepCheckPointFiles; - - m_adaptationRegType = adaptationRegType; - m_adaptationRegWeight = adaptationRegWeight; - - m_trainCriterionNodeName = trainCriterionNodeName; - m_evalCriterionNodeName = evalCriterionNodeName; - m_useAllDataForPreComputedNode = UsingAllDataForPreComputed; - - m_needAveMultiplier = needAveMultiplier; - m_L2RegWeight = L2RegWeight; - m_L1RegWeight = L1RegWeight; - - for (size_t i = 0; i < m_mbSize.size(); i++) - { - if (m_epochSize != requestDataSize && m_epochSize < m_mbSize[i]) - { - throw std::invalid_argument("epoch size must be larger than mbsize."); - } - } - - if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None && - (learningRatesPerSample.size() == 0 && learningRatesPerMB.size() == 0)) - { - throw std::invalid_argument("If autoLearnRateSearchType is false " - "you must specify the learningRatesPerSample " - "or learningRatesPerMB parameter."); - } - - if (learningRatesPerSample.size() > 0 && learningRatesPerMB.size() > 0) - { - throw std::invalid_argument("You specified both learningRatesPerSample " - "and learningRatesPerMB. Please comment " - "out one of them."); - } - else if (learningRatesPerSample.size() > 0) - { - m_learningRatesPerSample = learningRatesPerSample; - } - else if (learningRatesPerMB.size() > 0) - { - int LRSize = (int) max(learningRatesPerMB.size(), m_mbSize.size()); - m_learningRatesPerSample.resize(LRSize); - for (int i = 0; i < LRSize; i++) - { - m_learningRatesPerSample[i] = learningRatesPerMB[i] / m_mbSize[i]; - } - m_needToNormalizeLRByParallUtterance = true; - } - - if (momentumPerSample.size() > 0 && momentumPerMB.size() > 0) - { - throw std::invalid_argument("You specified both momentumPerSample " - "and momentumPerMB. 
Please comment " - "out one of them."); - } - else if (momentumPerSample.size() > 0) - { - m_momentumPerSample = momentumPerSample; - int momentumVectorSize = m_momentumPerSample.size(); - for (int i = 0; i < momentumVectorSize; i++) - { - if ((m_momentumPerSample[i] >= 1) || (m_momentumPerSample[i] < 0)) - { - throw std::invalid_argument("momentumPerSample must be in [0, 1)."); - } - } - } - else if (momentumPerMB.size() > 0) - { - int momentumVectorSize = (int)max(momentumPerMB.size(), m_mbSize.size()); - m_momentumPerSample.resize(momentumVectorSize); - for (int i = 0; i < momentumVectorSize; i++) - { - if ((momentumPerMB[i] >= 1) || (momentumPerMB[i] < 0)) - InvalidArgument("momentumPerMB must be in [0, 1)."); - m_momentumPerSample[i] = (float)pow(momentumPerMB[i], 1.0 / m_mbSize[i]); - } - - m_needToNormalizeMomentumByParallUtterance = true; - } - else - { - int momentumVectorSize = m_mbSize.size(); - m_momentumPerSample.resize(momentumVectorSize); - for (int i = 0; i < momentumVectorSize; i++) - m_momentumPerSample[i] = (float)pow(0.9f, 1.0 / m_mbSize[i]); - } - - if (m_learnRateDecreaseFactor > 1 || m_learnRateIncreaseFactor < 1) - InvalidArgument("learnRateIncreaseFactor must be >= 1 and learnRateDecreaseFactor must be <= 1."); - - for (size_t i = 0; i < m_dropoutRates.size(); i++) - if (m_dropoutRates[i] >= 1 || m_dropoutRates[i] < 0) - InvalidArgument("dropoutRate must be >= 0 and < 1."); - - if (m_adaptationRegWeight > 1 || m_adaptationRegWeight < 0) - InvalidArgument("adaptationRegWeight must be in [0 1]"); - - m_minLearnRate = 1e-9f; - - m_needAdaptRegularization = false; - - m_doGradientCheck = doGradientCheck; - m_gradientCheckSigDigit = gradientCheckSigDigit; - m_validateAfterModelReloading = validateAfterModelReloading; - - m_useCVSetControlLRIfCVExists = useCVSetControlLRIfCVExists; - m_useEvalCriterionControlLR = useEvalCriterionControlLR; - - msra::files::make_intermediate_dirs(m_modelPath); - } - - template - void SGD::Adapt(wstring origModelFileName, wstring refNodeName, - IDataReader* trainSetDataReader, - IDataReader* validationSetDataReader, - const DEVICEID_TYPE deviceID, const bool makeMode) - { - if (origModelFileName == L"" || trainSetDataReader == nullptr) - InvalidArgument("origModel and trainSetDataReader should not be null."); - - int startEpoch = DetermineStartEpoch(makeMode); - if (startEpoch == m_maxEpochs) - { - fprintf(stderr, "Final model exists. No further training is necessary.\n"); - return; - } - - ComputationNetwork net(deviceID); - if (startEpoch >= 0) - { - wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1); - fprintf(stderr, "Starting from checkpoint. 
Load Network From File %ls.\n", modelFileName.c_str()); - net.LoadFromFile(modelFileName); - } - else - { - fprintf(stderr, "Load Network From the original model file %ls.\n", origModelFileName.c_str()); - net.LoadFromFile(origModelFileName); - } - - startEpoch = max(startEpoch, 0); - - ComputationNetwork refNet(deviceID); - m_needAdaptRegularization = m_adaptationRegType != AdaptationRegType::None && m_adaptationRegWeight > 0; - if (m_needAdaptRegularization) - { - fprintf(stderr, "Load reference Network From the original model file %ls.\n", origModelFileName.c_str()); - refNet.LoadFromFile(origModelFileName); - } - - ComputationNodeBasePtr refNode; - if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL) - { - fprintf(stderr, "Checking refNodeName %ls.\n", origModelFileName.c_str()); - if (refNodeName == L"") - InvalidArgument("refNodeName does not exist and is needed when adaptationRegType is KL."); - refNode = refNet.GetNodeFromName(refNodeName); - } - - TrainOrAdaptModel(startEpoch, net, refNet, refNode, trainSetDataReader, validationSetDataReader); - } - - template - void SGD::SequenceTrain(IComputationNetBuilder* netBuilder, wstring origModelFileName, - IDataReader* trainSetDataReader, IDataReader* validationSetDataReader, - const DEVICEID_TYPE deviceID, const bool makeMode) - { - if (netBuilder == nullptr || origModelFileName == L"" || trainSetDataReader == nullptr) - InvalidArgument("netBuilder, origModel and trainSetDataReader should not be null."); - - int startEpoch = DetermineStartEpoch(makeMode); - if (startEpoch == m_maxEpochs) - { - fprintf(stderr, "Final model exists. No further training is necessary.\n"); - return; - } - - // Initializes the model from original model. - ComputationNetwork origNet(deviceID); - ComputationNetwork* sequenceNet = - (startEpoch < 0) ? netBuilder->BuildNetworkFromDescription() : &origNet; - std::vector addedFeatureNodes; - std::vector replacedCriterionNodes; - if (startEpoch < 0) - { - // Loads models. - origNet.LoadFromFile(origModelFileName); - - // Processes feature nodes. - std::vector & sequenceFeatureNodes = sequenceNet->FeatureNodes(); - for (size_t i = 0; i < sequenceFeatureNodes.size(); ++i) - { - if (!origNet.NodeNameExist(sequenceFeatureNodes[i]->NodeName())) - { - addedFeatureNodes.push_back(sequenceFeatureNodes[i]); - origNet.AddFeatureNode(sequenceFeatureNodes[i]); - } - } - - // Processes criterion nodes. - auto & origCriterionNodes = GetTrainCriterionNodes(origNet); - auto & sequenceCriterionNodes = GetTrainCriterionNodes(*sequenceNet); - if (origCriterionNodes.size() == 0 || sequenceCriterionNodes.size() == 0) - { - throw std::runtime_error("Training criterion node does not exist."); - } - replacedCriterionNodes.push_back(origCriterionNodes[0]); - origNet.ReplaceFinalCriterionNode(origCriterionNodes[0]->NodeName(), sequenceCriterionNodes[0]); - origNet.ResetEvalTimeStamp(); - } - - wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1); - if (startEpoch >= 0) - fprintf(stderr, "Starting from checkpoint. Load Network From File %ls.\n", modelFileName.c_str()); - else - fprintf(stderr, "Load Network From the original model file %ls.\n", origModelFileName.c_str()); - ComputationNetwork *net = (startEpoch < 0) ? &origNet : netBuilder->LoadNetworkFromFile(modelFileName); - - startEpoch = max(startEpoch, 0); - - TrainOrAdaptModel(startEpoch, *net, *net, nullptr, trainSetDataReader, validationSetDataReader); - - // Handles deletions carefully here. 
- if (startEpoch < 0) - { - for (size_t i = 0; i < addedFeatureNodes.size(); ++i) - origNet.RemoveFeatureNode(addedFeatureNodes[i]); - auto & origCriterionNodes = GetTrainCriterionNodes(origNet); - origNet.ReplaceFinalCriterionNode(origCriterionNodes[0]->NodeName(), replacedCriterionNodes[0]); - } - } - - static double MomentumPerMB(double momentumPerSample, size_t minibatchSize) - { - return pow(momentumPerSample, minibatchSize); - } - - template - void SGD::Train(IComputationNetBuilder* netBuilder, - IDataReader* trainSetDataReader, - IDataReader* validationSetDataReader, - const bool makeMode) - { - if (netBuilder == nullptr || trainSetDataReader == nullptr) - InvalidArgument("netBuilder and trainSetDataReader should not be null.\n"); - int startEpoch = DetermineStartEpoch(makeMode); - if (startEpoch == m_maxEpochs) - { - fprintf(stderr, "Final model exists. No further training is necessary.\n"); - return; - } - - wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1); - if (startEpoch >= 0) - fprintf(stderr, "Starting from checkpoint. Load Network From File %ls.\n", modelFileName.c_str()); - - ComputationNetwork* net = startEpoch < 0 ? netBuilder->BuildNetworkFromDescription() : - netBuilder->LoadNetworkFromFile(modelFileName); - // TODO: BUGBUG: if not starting from checkpoint, need to synchronize initial model - // strategy should be to run the initializer above on mpiRank==0, and then broadcast parameters. - - /* if (m_doUnitTest) - { - if (net.UnitTest() == false) - LogicError("unit test on decoder network not passed"); - - return; - }*/ - - startEpoch = max(startEpoch, 0); - m_needAdaptRegularization = false; - - TrainOrAdaptModel(startEpoch, *net, *net, nullptr, trainSetDataReader, validationSetDataReader); - } - -// protected: - - // Get{Train,Eval}CriterionNodes() return a reference that is, unfortunately, dependent on the network. - // So we hold those inside here. Not very nice. Also not thread-safe. This may go away once we fix sequence-to-sequence models properly. 
- static map> tmpCriterionNodeSets; - // TODO: test this, then remove this comment - - template - std::vector & SGD::GetTrainCriterionNodes(ComputationNetwork& net) - { - fprintf(stderr, "GetTrainCriterionNodes %ls ...\n", m_trainCriterionNodeName.c_str()); - if (!m_trainCriterionNodeName.empty()) - { - tmpCriterionNodeSets[&net] = net.CriterionNodesFrom(m_trainCriterionNodeName); - return tmpCriterionNodeSets[&net]; - } - else - return net.FinalCriterionNodes(); - } - - template - std::vector & SGD::GetEvalCriterionNodes(ComputationNetwork& net) - { - fprintf(stderr, "GetEvalCriterionNodes %ls ...\n", m_evalCriterionNodeName.c_str()); - if (!m_evalCriterionNodeName.empty()) - { - tmpCriterionNodeSets[&net] = net.CriterionNodesFrom(m_evalCriterionNodeName); - return tmpCriterionNodeSets[&net]; - } - else - return net.EvaluationNodes(); - } - - template - void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetwork& net, - ComputationNetwork& refNet, - ComputationNodeBasePtr refNode, - IDataReader* trainSetDataReader, - IDataReader* validationSetDataReader) - { - auto & featureNodes = net.FeatureNodes(); - auto & labelNodes = net.LabelNodes(); - auto & criterionNodes = GetTrainCriterionNodes(net); - auto & evaluationNodes = GetEvalCriterionNodes(net); - - std::map*>* inputMatrices = new std::map*>(); - for (size_t i = 0; i < featureNodes.size(); i++) - { - // TODO: instead, remember the nodes directly, to be able to handle both float and double nodes; current version will crash for mixed networks - (*inputMatrices)[featureNodes[i]->NodeName()] = &dynamic_pointer_cast>(featureNodes[i])->FunctionValues(); - } - - for (size_t i = 0; i < labelNodes.size(); i++) - { - (*inputMatrices)[labelNodes[i]->NodeName()] = &dynamic_pointer_cast>(labelNodes[i])->FunctionValues(); - } - - // used for KLD regularized adaptation. For all other adaptation techniques - // use MEL to edit the model and using normal training algorithm - std::vector refFeatureNodes; - if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) - { - refFeatureNodes.resize(featureNodes.size()); - for (size_t i = 0; i < featureNodes.size(); i++) - { - //we need to keep this info to handle deletion - refFeatureNodes[i] = refNet.GetNodeFromName(featureNodes[i]->NodeName()); - refNet.ChangeNode(featureNodes[i]->NodeName(), featureNodes[i]); - } - - refNet.RebuildNetwork(refNode); - } - - //initializing weights and gradient holder - //only one criterion so far TODO: support multiple ones? 
-        auto & learnableNodes = net.LearnableNodes(criterionNodes[0]);
-        std::list<Matrix<ElemType>> smoothedGradients;
-
-        for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
-        {
-            ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
-            smoothedGradients.push_back(Matrix<ElemType>(node->FunctionValues().GetNumRows(),
-                                                         node->FunctionValues().GetNumCols(),
-                                                         net.GetDeviceId()));
-        }
-
-        double epochCriterion, avgCriterion, prevCriterion, lrControlCriterion;
-        lrControlCriterion = epochCriterion = avgCriterion = prevCriterion = std::numeric_limits<double>::infinity();
-        size_t epochsNotCountedInAvgCriterion = startEpoch % m_learnRateAdjustInterval;
-
-        std::vector<double> epochEvalErrors(evaluationNodes.size(), std::numeric_limits<double>::infinity());
-
-        std::vector<wstring> evalNodeNames;
-        for (size_t i = 0; i < evaluationNodes.size(); i++)
-            evalNodeNames.push_back(evaluationNodes[i]->NodeName());
-
-        size_t totalSamplesSeen = 0;
-        double learnRatePerSample = 0.5f / m_mbSize[startEpoch];
-
-        double learningRateAdjustmentFactor = 1.0f;
-        vector<double> prevLearnRates;
-        prevLearnRates.resize(m_numPrevLearnRates);
-        for (int i = 0; i < m_numPrevLearnRates; i++)
-            prevLearnRates[i] = -1.0;
-
-        //precompute mean and invStdDev nodes and save initial model
-        if (PreCompute(net, trainSetDataReader, featureNodes, labelNodes, inputMatrices) || startEpoch == 0)
-        {
-            // Synchronize all ranks before writing the model to ensure that
-            // everyone is done loading the model
-            if (g_mpi != nullptr)
-                g_mpi->WaitAll();
-
-            net.SaveToFile(GetModelNameForEpoch(int(startEpoch) - 1));
-        }
-
-        // first, we need to normalize the effect of nbruttsineachrecurrentiter
-        if (trainSetDataReader->GetNumParallelSequences() > 1 && m_needToNormalizeLRByParallUtterance)
-        {
-            for (auto& x : m_learningRatesPerSample)
-                x /= (float)trainSetDataReader->GetNumParallelSequences();
-        }
-
-        // likewise, we need to normalize the effect of nbruttsineachrecurrentiter for momentum
-        if (trainSetDataReader->GetNumParallelSequences() > 1 && m_needToNormalizeMomentumByParallUtterance)
-        {
-            for (auto& x : m_momentumPerSample)
-                x = (float)pow(x, 1.0 / trainSetDataReader->GetNumParallelSequences());
-        }
-
-        bool learnRateInitialized = false;
-        if (startEpoch > 0)
-        {
-            learnRateInitialized = LoadCheckPointInfo(startEpoch - 1,
-                                                      /*out*/ totalSamplesSeen,
-                                                      /*out*/ learnRatePerSample,
-                                                      smoothedGradients,
-                                                      /*out*/ prevCriterion,
-                                                      /*out*/ m_prevChosenMinibatchSize);
-            if (learnRateInitialized)
-                prevLearnRates[startEpoch % m_numPrevLearnRates] = learnRatePerSample;
-        }
-
-        if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch &&
-            !learnRateInitialized && m_learningRatesPerSample.size() <= startEpoch)
-        {
-            InvalidArgument(
-                "When using \"AdjustAfterEpoch\", there must either exist a checkpoint file, "
-                "or an explicit learning rate must be specified in config for the starting epoch.");
-        }
-
-        unsigned long dropOutSeed = 1;
-        double prevDropoutRate = 0;
-
-        bool learnRateReduced = false;
-
-        ComputationNetwork::SetMaxTempMemSizeForCNN(net, criterionNodes[0], m_maxTempMemSizeInSamplesForCNN);
-        if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr)
-            ComputationNetwork::SetMaxTempMemSizeForCNN(refNet, refNode, m_maxTempMemSizeInSamplesForCNN);
-
-        // --- MAIN EPOCH LOOP
-
-        for (int i = startEpoch; i < (int)m_maxEpochs; i++)
-        {
-            // Synchronize all ranks before proceeding to ensure that
-            // rank 0 has finished writing the previous model file
-            if (g_mpi != nullptr)
-                g_mpi->WaitAll();
-
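The two normalizations a few lines up keep per-frame behavior stable when the reader delivers several parallel sequences per time step: the per-sample learning rate is divided by the sequence count, and the per-sample momentum is taken to the corresponding root. With hypothetical numbers:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const double numParallelSequences = 4;        // illustrative values throughout
        double learnRatePerSample = 0.002;
        double momentumPerSample  = 0.9999;

        // each step now consumes numParallelSequences frames at once
        learnRatePerSample /= numParallelSequences;                        // -> 0.0005
        momentumPerSample   = std::pow(momentumPerSample,
                                       1.0 / numParallelSequences);        // -> ~0.999975
        std::printf("%g %g\n", learnRatePerSample, momentumPerSample);
    }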
Timer timer; - timer.Start(); - - // set dropout rate - ComputationNetwork::SetDropoutRate(net, criterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed); - - // learning rate adjustment - if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || - (m_learningRatesPerSample.size() > 0 && m_learningRatesPerSample.size() > i)) - { - learnRatePerSample = m_learningRatesPerSample[i]; - } - else if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch) - { - double largestPrevLearnRatePerSample = prevLearnRates[0]; - for (int j = 1; j < m_numPrevLearnRates; j++) - largestPrevLearnRatePerSample = max(largestPrevLearnRatePerSample, prevLearnRates[j]); - - // return a reasonable learning rate based on the initial minibatchSize - double newLearningRatePerSample = SearchForBestLearnRate(net, refNet, refNode, i, learnRatePerSample, - trainSetDataReader, featureNodes, labelNodes, - criterionNodes, evaluationNodes, inputMatrices, - learnableNodes, smoothedGradients, - learnRateInitialized, largestPrevLearnRatePerSample); - learningRateAdjustmentFactor = newLearningRatePerSample / learnRatePerSample; - learnRatePerSample = newLearningRatePerSample; - - // save per sample learn rate to support changeable minibatchSize - prevLearnRates[i % m_numPrevLearnRates] = learnRatePerSample; - } - - learnRateInitialized = true; - - if (learnRatePerSample < m_minLearnRate) - { - fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training stops.\n", - i + 1, learnRatePerSample, m_minLearnRate); - if (m_autoLearnRateSearchType != LearningRateSearchAlgorithm::None) - net.SaveToFile(m_modelPath); - break; - } - - size_t chosenMinibatchSize; - size_t actualMinibatchSize; - - // Through the command line or config file the user can set minibatch sizes on a per epoch - // basis for a set number of epochs. For epochs after that point, m_mbSize.size(), either - // we just keep using - // the last minibatch size, or we use tuning to try and find a better one. 
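When tuning kicks in below, the search budget is numMiniBatch4LRSearch[i] * mbSize[i] frames, clamped to the epoch size; with the defaults read earlier (500 minibatches of 256 samples) and a hypothetical 100000-frame epoch:

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    int main()
    {
        const size_t numMiniBatch4LRSearch = 500, mbSize = 256;   // defaults from the config
        const size_t epochSize = 100000;                          // hypothetical epoch
        size_t numFramesToUseInSearch = numMiniBatch4LRSearch * mbSize;          // 128000
        numFramesToUseInSearch = std::min(numFramesToUseInSearch, epochSize);    // -> 100000
        std::printf("%lu\n", (unsigned long)numFramesToUseInSearch);
    }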
- if (m_autoAdjustMinibatch && i >= m_mbSize.size()) - { - size_t numFramesToUseInSearch = m_numMiniBatch4LRSearch[i] * m_mbSize[i]; - if (m_epochSize != requestDataSize) - { - // ensure the numFramesToUseInSearch does not exceed the total number of frames in the epoch - numFramesToUseInSearch = min(numFramesToUseInSearch, m_epochSize); - } - - // Use tuning to try and find a better minibatch size - chosenMinibatchSize = AdaptiveMinibatchSizing(net, refNet, refNode, i, - numFramesToUseInSearch, - trainSetDataReader, learnRatePerSample, - m_mbSize[i], featureNodes, labelNodes, - criterionNodes, evaluationNodes, - inputMatrices, learnableNodes, - smoothedGradients, learningRateAdjustmentFactor); - m_prevChosenMinibatchSize = chosenMinibatchSize; - } - else - { - // use the explicitly set minibatch size - chosenMinibatchSize = m_mbSize[i]; - } - - actualMinibatchSize = chosenMinibatchSize; - if (trainSetDataReader->GetNumParallelSequences() > 1 && m_needToNormalizeMomentumByParallUtterance) - actualMinibatchSize = chosenMinibatchSize * trainSetDataReader->GetNumParallelSequences(); - - fprintf(stderr, "Starting Epoch %d: learning rate per sample = %f momentum = %f \n", - i + 1, learnRatePerSample, MomentumPerMB(m_momentumPerSample[i], actualMinibatchSize)); - - TrainOneEpoch(net, - refNet, - refNode, - i, - m_epochSize, - trainSetDataReader, - learnRatePerSample, - chosenMinibatchSize, - featureNodes, - labelNodes, - criterionNodes, - evaluationNodes, - inputMatrices, - learnableNodes, smoothedGradients, - epochCriterion, epochEvalErrors, totalSamplesSeen); - - timer.Stop(); - double epochTime = timer.ElapsedSeconds(); - - if (m_useEvalCriterionControlLR) - lrControlCriterion = epochEvalErrors[0]; - else - lrControlCriterion = epochCriterion; - - fprintf(stderr, - "Finished Epoch[%d]: [Training Set] TrainLossPerSample = %.8g; ", - i + 1, epochCriterion); - if (epochEvalErrors.size() == 1) - { - fprintf(stderr, - "EvalErrPerSample = %.8g; Ave LearnRatePerSample = %.10g; EpochTime=%.8g\n", - epochEvalErrors[0], learnRatePerSample, epochTime); - } - else - { - fprintf(stderr, "EvalErrPerSample "); - for (size_t j = 0; j < epochEvalErrors.size(); j++) - fprintf(stderr, "[%lu]=%.8g; ", j, epochEvalErrors[j]); - - fprintf(stderr, "Ave LearnRatePerSample = %.10g; Epoch Time=%.8g\n", - learnRatePerSample, epochTime); - - fprintf(stderr, "Finished Epoch[%d]: Criterion Node [%ls] Per Sample = %.8g\n", - i + 1, criterionNodes[0]->NodeName().c_str(), epochCriterion); - - for (size_t j = 0; j < epochEvalErrors.size(); j++) - { - fprintf(stderr, "Finished Epoch[%d]: Evaluation Node [%ls] Per Sample = %.8g\n", - i + 1, evalNodeNames[j].c_str(), epochEvalErrors[j]); - } - } - - if ((g_mpi == nullptr) || g_mpi->IsMainNode()) - { - if (validationSetDataReader != trainSetDataReader && validationSetDataReader != nullptr) - { - SimpleEvaluator evalforvalidation(net); - vector cvSetTrainAndEvalNodes; - cvSetTrainAndEvalNodes.push_back(criterionNodes[0]->NodeName()); - cvSetTrainAndEvalNodes.push_back(evaluationNodes[0]->NodeName()); - - vector vScore = evalforvalidation.Evaluate(validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i]); - fprintf(stderr, "Finished Epoch[%d]: [Validation Set] TrainLossPerSample = %.8g; EvalErrPerSample = %.8g\n", - i + 1, vScore[0], vScore[1]); - - if (m_useCVSetControlLRIfCVExists) - { - if (m_useEvalCriterionControlLR) - lrControlCriterion = vScore[1]; - else - lrControlCriterion = vScore[0]; //the first one is the training criterion. 
- } - } - } - - // broadcast epochCriterion to make sure each processor will have the same learning rate schedule - if ((m_parallelizationMethod == ParallelizationMethod::ModelAveragingSGD) && (g_mpi->NumNodesInUse() > 1)) - g_mpi->Bcast(&epochCriterion, 1, g_mpi->MainNodeRank()); - - bool loadedPrevModel = false; - size_t epochsSinceLastLearnRateAdjust = i % m_learnRateAdjustInterval + 1; - if (avgCriterion == std::numeric_limits::infinity()) - { - avgCriterion = lrControlCriterion; - } - else - { - avgCriterion = ((epochsSinceLastLearnRateAdjust - 1 - epochsNotCountedInAvgCriterion) * - avgCriterion + lrControlCriterion) / - (epochsSinceLastLearnRateAdjust - epochsNotCountedInAvgCriterion); - } - - if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && - m_learningRatesPerSample.size() <= i && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval) - { - if (std::isnan(avgCriterion) || (prevCriterion - avgCriterion < 0 && prevCriterion != std::numeric_limits::infinity())) - { - if (m_loadBestModel) - { - net.LoadPersistableParametersFromFile(GetModelNameForEpoch(i - 1), - m_validateAfterModelReloading); - net.ResetEvalTimeStamp(); - LoadCheckPointInfo(i - 1, - /*out*/ totalSamplesSeen, - /*out*/ learnRatePerSample, - smoothedGradients, - /*out*/ prevCriterion, - /*out*/ m_prevChosenMinibatchSize); - fprintf(stderr, "Loaded the previous model which has better training criterion.\n"); - loadedPrevModel = true; - } - } - - if (m_continueReduce) - { - if (std::isnan(avgCriterion) || - (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && - prevCriterion != std::numeric_limits::infinity())) - { - if (learnRateReduced == false) - learnRateReduced = true; - else - { - net.SaveToFile(GetModelNameForEpoch(i, true)); - - fprintf(stderr, "Finished training and saved final model\n\n"); - break; - } - } - - if (learnRateReduced) - { - learnRatePerSample *= m_learnRateDecreaseFactor; - fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample); - } - } - else - { - if (std::isnan(avgCriterion) || - (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && - prevCriterion != std::numeric_limits::infinity())) - { - - learnRatePerSample *= m_learnRateDecreaseFactor; - fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample); - } - else if (prevCriterion - avgCriterion > m_increaseLearnRateIfImproveMoreThan * prevCriterion && - prevCriterion != std::numeric_limits::infinity()) - { - learnRatePerSample *= m_learnRateIncreaseFactor; - fprintf(stderr, "learnRatePerSample increased to %.8g\n", learnRatePerSample); - } - } - } - else - { - if (std::isnan(avgCriterion)) - RuntimeError("The training criterion is not a number (NAN). 
Stop\n"); - } - - // not loading previous values then set them - if (!loadedPrevModel && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval) - { - prevCriterion = avgCriterion; - epochsNotCountedInAvgCriterion = 0; - } - - // Synchronize all ranks before proceeding to ensure that - // nobody tries reading the checkpoint file at the same time - // as rank 0 deleting it below - if (g_mpi != nullptr) - g_mpi->WaitAll(); - - // persist model and check-point info - if ((g_mpi == nullptr) || g_mpi->IsMainNode()) - { - net.SaveToFile(GetModelNameForEpoch(i)); - SaveCheckPointInfo(i, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion, chosenMinibatchSize); - if (!m_keepCheckPointFiles) - { - // delete previous checkpoint file to save space - _wunlink(GetCheckPointFileNameForEpoch(i - 1).c_str()); - } - } - - if (learnRatePerSample < 1e-12) - { - fprintf(stderr, "learnRate per sample is reduced to %.8g which is below 1e-12. stop training.\n", - learnRatePerSample); - } - } - - // --- END OF MAIN EPOCH LOOP - - // since we linked feature nodes. we need to remove it from the deletion - if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) - { - for (size_t i = 0; i < refFeatureNodes.size(); i++) - { - // note we need to handle deletion carefully - refNet.ChangeNode(refFeatureNodes[i]->NodeName(), refFeatureNodes[i]); - } - } - - delete inputMatrices; - } - -// protected: - - // return true if precomputation is executed. - template - bool SGD::PreCompute(ComputationNetwork& net, - IDataReader* trainSetDataReader, - std::vector & featureNodes, - std::vector & labelNodes, - std::map*>* inputMatrices) - { - std::list nodes = net.GetNodesRequiringPreComputation(); - - if (nodes.size() == 0) - { - fprintf(stderr, "No PreCompute nodes found, skipping PreCompute step\n"); - return false; - } - - fprintf(stderr, "Found %lu PreCompute nodes\n", nodes.size()); - for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++) - { - auto node = static_pointer_cast>(*nodeIter); - fprintf(stderr, "\tNodeName: %ls\n", (node->NodeName()).c_str()); - } - - //compute - //trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , requestDataSize); - // trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , m_epochSize); // only based on one epoch - // [1/12/2015 erw] to support large dataset, we usually partition whole dataset into several epoch's, - // so we need to use all the data to do precomputing - if (m_useAllDataForPreComputedNode) - { - // using all the data - trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0); - } - else - { - // using all the data - trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, m_epochSize); - } - - while (trainSetDataReader->GetMinibatch(*inputMatrices)) - { - ComputationNetwork::UpdateEvalTimeStamps(featureNodes); - ComputationNetwork::UpdateEvalTimeStamps(labelNodes); - - net.SetActualMiniBatchSizeFromFeatures(); - trainSetDataReader->CopyMBLayoutTo(net.GetMBLayoutPtr()); - net.VerifyActualNumParallelSequences(trainSetDataReader->GetNumParallelSequences()); - - // TODO: Exactly this loop should be INSIDE ComputationNetwork--pass the nodes array instead! 
- for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++) - net.Evaluate(*nodeIter); - } - - // mark done - for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++) - { - auto node = static_pointer_cast>(*nodeIter); - node->MarkComputed(true); - } - - return true; - } - - // return a reasonable initial learning rate based on the initial mbsize - template - double SGD::SearchForBestLearnRate(ComputationNetwork& net, - ComputationNetwork& refNet, - const ComputationNodeBasePtr refNode, const int epochNumber, - const double curLearnRate, - IDataReader* trainSetDataReader, - const std::vector & featureNodes, - const std::vector & labelNodes, - const std::vector & criterionNodes, - const std::vector & evaluationNodes, - std::map*>* inputMatrices, - const std::list & learnableNodes, - std::list>& smoothedGradients, - const bool learnRateInitialized, - const double largestPrevLearnRatePerSample) - { - double epochCriterion = std::numeric_limits::infinity(); - double prevCriterion = std::numeric_limits::infinity(); - vector epochEvalErrors(evaluationNodes.size(), std::numeric_limits::infinity()); - - size_t totalSamplesSeen = 0; - double bestLearnRatePerSample = curLearnRate; - - size_t numFramesToUseInSearch = m_numMiniBatch4LRSearch[epochNumber] * m_mbSize[epochNumber]; - if (m_epochSize != requestDataSize) - { - // ensure the numFramesToUseInSearch does not exceed the total number of frames in the epoch - numFramesToUseInSearch = min(numFramesToUseInSearch, m_epochSize); - } - - double baseCriterion; - - double minLearnRate = m_minLearnRate * 0.3f; - double learnRatePerSample = 1.0f / 8.0f / 0.618f / sqrt((double)m_mbSize[epochNumber]); - - if (learnRateInitialized && largestPrevLearnRatePerSample > 0) - { - //largestPrevLearnRatePerSample is per sample, first 0.618f is for compensation, second one is for safety - learnRatePerSample = largestPrevLearnRatePerSample / 0.618f / 0.618f; - } - - int baseModelEpoch = epochNumber - 1; - net.LoadPersistableParametersFromFile(GetModelNameForEpoch(baseModelEpoch), m_validateAfterModelReloading); - net.ResetEvalTimeStamp(); - - double learnRate = learnRatePerSample; - size_t dummyMinibatchSize = 0; - LoadCheckPointInfo(baseModelEpoch, - /*out*/ totalSamplesSeen, - /*out*/ learnRate, - smoothedGradients, - /*out*/ prevCriterion, - /*out*/ dummyMinibatchSize); - - // if model is not changed this is what we will get - TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, - numFramesToUseInSearch, trainSetDataReader, 0, m_mbSize[epochNumber], - featureNodes, labelNodes, - criterionNodes, evaluationNodes, - inputMatrices, learnableNodes, - smoothedGradients, /*out*/ baseCriterion, - /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen, - "BaseAdaptiveLearnRateSearch:"); - - if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch) - { - if (prevCriterion == std::numeric_limits::infinity()) - prevCriterion = baseCriterion; - - double ratio = 0.3; - - if (m_epochSize != requestDataSize) - ratio = pow(((double)numFramesToUseInSearch) / m_epochSize, 1.0f / 2); - - baseCriterion = max(ratio * prevCriterion + (1 - ratio) * baseCriterion, baseCriterion); - } - - do - { - learnRatePerSample *= 0.618; - TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, - numFramesToUseInSearch, trainSetDataReader, - learnRatePerSample, m_mbSize[epochNumber], featureNodes, - labelNodes, criterionNodes, - evaluationNodes, inputMatrices, - learnableNodes, smoothedGradients, - /*out*/ epochCriterion, /*out*/ 
epochEvalErrors, - /*out*/ totalSamplesSeen, "AdaptiveLearnRateSearch:"); - - } while (std::isnan(epochCriterion) || (epochCriterion > baseCriterion && learnRatePerSample > minLearnRate)); - - bestLearnRatePerSample = learnRatePerSample; - - //grid search for the first m_numBestSearchEpoch epochs - if (epochNumber < m_numBestSearchEpoch) - { - double leftLearnRatePerSample = 0.01 / m_mbSize[epochNumber]; - double rightLearnRatePerSample = learnRatePerSample; - double leftCriterion, rightCriterion = epochCriterion; - - TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, - numFramesToUseInSearch, trainSetDataReader, - leftLearnRatePerSample, m_mbSize[epochNumber], - featureNodes, labelNodes, - criterionNodes, evaluationNodes, - inputMatrices, learnableNodes, - smoothedGradients, /*out*/ leftCriterion, - /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen, - "DetailBaseAdaptiveLearnRateSearch:"); - - while (rightLearnRatePerSample > leftLearnRatePerSample * 1.2) - { - if (rightCriterion > leftCriterion) - { - rightLearnRatePerSample *= 0.618; - - TrainOneMiniEpochAndReloadModel(net, refNet, refNode, - epochNumber, numFramesToUseInSearch, - trainSetDataReader, - rightLearnRatePerSample, m_mbSize[epochNumber], - featureNodes, labelNodes, - criterionNodes, - evaluationNodes, - inputMatrices, - learnableNodes, - smoothedGradients, - /*out*/ rightCriterion, - /*out*/ epochEvalErrors, - /*out*/ totalSamplesSeen, - "DetailRightAdaptiveLearnRateSearch:"); - } - else - { - leftLearnRatePerSample /= 0.618; - - TrainOneMiniEpochAndReloadModel(net, refNet, refNode, - epochNumber, numFramesToUseInSearch, - trainSetDataReader, - leftLearnRatePerSample, m_mbSize[epochNumber], - featureNodes, labelNodes, - criterionNodes, - evaluationNodes, - inputMatrices, - learnableNodes, - smoothedGradients, - /*out*/ leftCriterion, - /*out*/ epochEvalErrors, - /*out*/ totalSamplesSeen, - "DetailLeftAdaptiveLearnRateSearch:"); - } - } - - bestLearnRatePerSample = (leftCriterion < rightCriterion) ? 
leftLearnRatePerSample :
- rightLearnRatePerSample;
- }
-
- fprintf(stderr, "Best Learn Rate Per Sample for Epoch[%d] = %.10g baseCriterion=%.10g\n",
- epochNumber + 1, bestLearnRatePerSample, baseCriterion);
-
- return bestLearnRatePerSample;
- }
-
- template <class ElemType>
- void SGD<ElemType>::TrainOneMiniEpochAndReloadModel(ComputationNetwork& net,
- ComputationNetwork& refNet,
- const ComputationNodeBasePtr refNode, const int epochNumber,
- const size_t epochSize, IDataReader<ElemType>* trainSetDataReader,
- const double learnRatePerSample,
- const size_t minibatchSize,
- const std::vector<ComputationNodeBasePtr>& featureNodes,
- const std::vector<ComputationNodeBasePtr>& labelNodes,
- const std::vector<ComputationNodeBasePtr>& criterionNodes,
- const std::vector<ComputationNodeBasePtr>& evaluationNodes,
- std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
- const std::list<ComputationNodeBasePtr>& learnableNodes,
- std::list<Matrix<ElemType>>& smoothedGradients,
- /*out*/ double& epochCriterion,
- /*out*/ std::vector<double>& epochEvalErrors,
- /*out*/ size_t& totalSamplesSeen,
- std::string prefixMsg)
- {
- TrainOneEpoch(net, refNet, refNode, epochNumber, epochSize,
- trainSetDataReader, learnRatePerSample, minibatchSize, featureNodes,
- labelNodes, criterionNodes, evaluationNodes,
- inputMatrices, learnableNodes, smoothedGradients,
- /*out*/ epochCriterion, /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen,
- prefixMsg);
-
- fprintf(stderr, "Finished Mini-Epoch For LearnRate Selection: TrainLossPerSample = %.8g;", epochCriterion);
-
- if (epochEvalErrors.size() == 1)
- fprintf(stderr, "EvalErrPerSample = %.8g; Ave LearnRatePerSample = %.10g\n", epochEvalErrors[0], learnRatePerSample);
- else
- {
- fprintf(stderr, "EvalErrPerSample ");
- for (size_t i = 0; i < epochEvalErrors.size(); i++)
- fprintf(stderr, "[%lu] = %.8g; ", i, epochEvalErrors[i]);
- fprintf(stderr, "Ave LearnRatePerSample = %.10g\n", learnRatePerSample);
- }
-
- int baseModelEpoch = epochNumber - 1;
- net.LoadPersistableParametersFromFile(GetModelNameForEpoch(baseModelEpoch), m_validateAfterModelReloading);
- net.ResetEvalTimeStamp();
-
- double dummyLearnRate;
- double dummyPrevCriterion;
- size_t dummyMinibatchSize = 0;
- LoadCheckPointInfo(baseModelEpoch,
- /*out*/ totalSamplesSeen,
- /*out*/ dummyLearnRate,
- smoothedGradients,
- /*out*/ dummyPrevCriterion,
- /*out*/ dummyMinibatchSize);
- }
-
- template <class ElemType>
- size_t SGD<ElemType>::AdaptiveMinibatchSizing(ComputationNetwork& net,
- ComputationNetwork& refNet,
- const ComputationNodeBasePtr refNode,
- const int epochNumber,
- const size_t numFramesToUseInSearch,
- IDataReader<ElemType>* trainSetDataReader,
- const double learnRatePerSample,
- const size_t initialMinibatchSize,
- const std::vector<ComputationNodeBasePtr>& featureNodes,
- const std::vector<ComputationNodeBasePtr>& labelNodes,
- const std::vector<ComputationNodeBasePtr>& criterionNodes,
- const std::vector<ComputationNodeBasePtr>& evaluationNodes,
- std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
- const std::list<ComputationNodeBasePtr>& learnableNodes,
- std::list<Matrix<ElemType>>& smoothedGradients,
- const double learningRateAdjustmentFactor)
- {
- size_t minMinibatchSize = initialMinibatchSize;
- size_t chosenMinibatchSize = initialMinibatchSize;
-
- // do some pre-adjustment based on LR
- // Basically we assume that the LR for epoch 1 is safe for mbsize.
- // If LR control led to a smaller LR, then we can safely increase the lower bound of the MB size.
- double learningRateChangeSoFar = m_learningRatesPerSample[epochNumber] / m_learningRatesPerSample[0]; - learningRateChangeSoFar *= learningRateAdjustmentFactor; - - // increasing by the full factor is found to be too aggressive; sqrt() seems more robust - learningRateChangeSoFar = sqrt(learningRateChangeSoFar); - - // LR was indeed reduced - if (learningRateChangeSoFar < 1.0f) - { - // we can safely increase MB size (note: this may be bigger than our max) - minMinibatchSize = (size_t)(minMinibatchSize / learningRateChangeSoFar); - } - - if (epochNumber < 2 && m_prevChosenMinibatchSize != 0) - { - // newly started training: any previous MB size stored in the model is to be ignored - fprintf(stderr, "before epoch .2, previous minibatchSize %zd is " - "considered invalid -> resetting\n", m_prevChosenMinibatchSize); - m_prevChosenMinibatchSize = 0; - } - - // check if we need to skip - if (m_prevChosenMinibatchSize != 0 && - (epochNumber + 1) > m_minibatchSizeTuningFrequency && - (epochNumber + 1) % m_minibatchSizeTuningFrequency != 0) - { - fprintf(stderr, "AdaptiveMinibatchSearch: Search for a better minibatchSize " - "in epoch %d skipped, keeping minibatchSize of %zd\n", - epochNumber + 1, m_prevChosenMinibatchSize); - chosenMinibatchSize = m_prevChosenMinibatchSize; - } - else - { - if (m_prevChosenMinibatchSize != 0) - { - // if m_prevChosenMinibatchSize (the chosen minibatch size for the previous epoch) div 2 - // is higher than initialMinibatchSize (the minibatch size we start with for this epoch), - // then start the search with m_prevChosenMinibatchSize/2 instead of initialMinibatchSize. - fprintf(stderr, "AdaptiveMinibatchSearch: Limiting minMinibatchSize to " - "largest of previous minibatchSize = (%d / 2) or %d\n", - (int) m_prevChosenMinibatchSize, (int) minMinibatchSize); - minMinibatchSize = max(minMinibatchSize, m_prevChosenMinibatchSize / 2); - } - - size_t maxMinibatchSize = m_minibatchSizeTuningMax; - - // only grow at most 2 x compared to previous step - if (m_prevChosenMinibatchSize != 0.0f) - { - assert(m_prevChosenMinibatchSize >= chosenMinibatchSize); - - fprintf(stderr, "AdaptiveMinibatchSearch: Limiting maxMinibatchSize to " - "previous minibatchSize %zd*2\n", m_prevChosenMinibatchSize); - maxMinibatchSize = min(maxMinibatchSize, m_prevChosenMinibatchSize * 2); - } - - chosenMinibatchSize = SearchForBestMinibatchSize(net, refNet, refNode, epochNumber, - numFramesToUseInSearch, trainSetDataReader, - learnRatePerSample, featureNodes, - labelNodes, criterionNodes, - evaluationNodes, inputMatrices, - learnableNodes, smoothedGradients, - minMinibatchSize, maxMinibatchSize); - } - - return chosenMinibatchSize; - } - - static size_t RoundToMultipleOf64(float val) - { - return 64 * (size_t)((val + 32) / 64); - } - - static size_t RoundToMultipleOf64(size_t val) - { - return 64 * ((val + 32) / 64); - } - - // uses a small percentage of training data of minibatch to - // speculatively train with various MB sizes; then picks the best - template - size_t SGD::SearchForBestMinibatchSize(ComputationNetwork& net, - ComputationNetwork& refNet, - const ComputationNodeBasePtr refNode, - const int epochNumber, - const size_t numFramesToUseInSearch, - IDataReader* trainSetDataReader, - const double learnRatePerSample, - const std::vector & featureNodes, - const std::vector & labelNodes, - const std::vector & criterionNodes, - const std::vector & evaluationNodes, - std::map*>* inputMatrices, - const std::list & learnableNodes, - std::list>& smoothedGradients, - const size_t 
minMinibatchSize, const size_t maxMinibatchSize) - { - // may happen for automatically reduced learning rates - if (minMinibatchSize > maxMinibatchSize) - { - return maxMinibatchSize; - } - - size_t trialMinibatchSize = 0; - bool isFirstIteration = true; - double baseCriterion = 0; - - // increase the minibatch size by a factor of sqrt(2) in each step. - const float minibatchSizeTuningFactor = sqrtf(2.0f); - - size_t lastTriedTrialMinibatchSize = 0; - double lastTriedTrialEpochCriterion = 0; - for (float trialMinibatchSizeFloat = (float)minMinibatchSize; - trialMinibatchSizeFloat <= maxMinibatchSize; - trialMinibatchSizeFloat *= minibatchSizeTuningFactor) - { - // round mbsize to something meaningful - trialMinibatchSize = RoundToMultipleOf64(trialMinibatchSizeFloat); - - fprintf(stderr, "\nAdaptiveMinibatchSearch: Evaluating trial minibatchSize=%zd out of range %zd..%zd ...\n\n", - trialMinibatchSize, RoundToMultipleOf64(minMinibatchSize), RoundToMultipleOf64(maxMinibatchSize)); - - size_t totalSamplesSeen; - std::vector epochEvalErrors(evaluationNodes.size(), std::numeric_limits::infinity()); - double epochCriterion = std::numeric_limits::infinity(); - - // Train on a few minibatches and so we can observe the epochCriterion as we try increasing - // minibatches with iteration of this loop. - TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, - numFramesToUseInSearch, trainSetDataReader, - learnRatePerSample, trialMinibatchSize, featureNodes, - labelNodes, criterionNodes, - evaluationNodes, inputMatrices, - learnableNodes, smoothedGradients, - /*out*/ epochCriterion, /*out*/ epochEvalErrors, - /*out*/ totalSamplesSeen, - isFirstIteration ? "BaseAdaptiveMinibatchSearch:" : - "AdaptiveMinibatchSearch:"); - - if (isFirstIteration) - { - // for the first iteration of the loop only, set baseCriterion - // to the result we got from TrainOneMiniEpochAndReloadModel(). - baseCriterion = epochCriterion; - lastTriedTrialMinibatchSize = trialMinibatchSize; - lastTriedTrialEpochCriterion = baseCriterion; - isFirstIteration = false; - - fprintf(stderr, "AdaptiveMinibatchSearch: Computed BaseCriterion %.10g\n", baseCriterion); - } - else if (!std::isnan(epochCriterion) && - (epochCriterion > (baseCriterion * (1.0 + ( m_minibatchSearchCriterionErrorMargin / 100.0))))) - { - // As soon as we see the Criterion (a measure of error) start to get larger than the - // Criterion we started with, we stop. - // TODO: if this is too sensitive, we can add a margin on the bases of percentage of - // baseCriterion. - break; - } - else - { - lastTriedTrialMinibatchSize = trialMinibatchSize; - lastTriedTrialEpochCriterion = epochCriterion; - if (trialMinibatchSizeFloat * minibatchSizeTuningFactor <= maxMinibatchSize) - { - fprintf(stderr, "AdaptiveMinibatchSearch: Keep searching... " - "EpochCriterion = %.10g vs BaseCriterion = %.10g\n", - epochCriterion, baseCriterion); - } - } - } - fprintf(stderr, "AdaptiveMinibatchSearch: Search successful!!! Chose new minibatchSize of %d. " - "EpochCriterion = %.10g vs BaseCriterion = %.10g\n\n", - (int) lastTriedTrialMinibatchSize, lastTriedTrialEpochCriterion, baseCriterion); - - - return lastTriedTrialMinibatchSize; - } - - // Tries to compute derivatives for the whole utterances, which will be - // fed to the neural network as features. 
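
SearchForBestMinibatchSize() above grows the trial size by a factor of sqrt(2) per step and snaps every trial to a multiple of 64 via RoundToMultipleOf64(); a standalone sketch of the trial sequence that produces:

    #include <cmath>
    #include <cstdio>

    static size_t RoundToMultipleOf64(float val)
    {
        return 64 * (size_t)((val + 32) / 64); // round to the nearest multiple of 64
    }

    int main()
    {
        const float factor = sqrtf(2.0f);
        // e.g. minMinibatchSize = 256, maxMinibatchSize = 2048:
        for (float mb = 256.0f; mb <= 2048.0f; mb *= factor)
            printf("trial minibatchSize = %zu\n", RoundToMultipleOf64(mb));
        // prints 256, 384, 512, 704, 1024, 1472 (and 2048 if float rounding allows)
        return 0;
    }
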
- template - void SGD::AttemptUtteranceDerivativeFeatures(ComputationNetwork& net, - IDataReader* trainSetDataReader, - const std::vector & featureNodes, - std::map*>* inputMatrices) - { - // Tries to read an utterance and run forward computation on the - // whole utterance. - assert(trainSetDataReader != NULL); - std::vector>> uttInfo; - auto pMBLayout = make_shared(); - while (trainSetDataReader->GetMinibatchCopy(uttInfo, *inputMatrices, pMBLayout)) - { - ComputationNetwork::UpdateEvalTimeStamps(featureNodes); - - auto & outputNodes = net.OutputNodes(); - if (outputNodes.empty()) - LogicError("no output node was found."); - - net.SetActualMiniBatchSizeFromFeatures(); - trainSetDataReader->CopyMBLayoutTo(net.GetMBLayoutPtr()); - net.VerifyActualNumParallelSequences(trainSetDataReader->GetNumParallelSequences()); - net.Evaluate(outputNodes[0]); // Only evaluate the first output - trainSetDataReader->SetNetOutput(uttInfo, - dynamic_pointer_cast>(outputNodes[0])->FunctionValues(), - pMBLayout); - } - } - - static string GeneratePaddedFloatOrExpFormat(int padSize, int precision, double value) - { - char format[16]; - char buffer[512]; - - sprintf(format, "%%.%dg", precision); - sprintf(buffer, format, value); - - for (int i = 0; i < strlen(buffer); i++) - { - if (buffer[i] == 'e' || buffer[i] == 'E') - { - sprintf(format, "%%%d.%de", padSize, precision); - return format; - } - } - sprintf(format, "%%%d.%df", padSize, precision); - return format; - } - - template - size_t SGD::TrainOneEpoch(ComputationNetwork& net, - ComputationNetwork& refNet, - const ComputationNodeBasePtr refNode, - const int epochNumber, - const size_t epochSize, - IDataReader* trainSetDataReader, - const double learnRatePerSample, - size_t tunedMBSize, - const std::vector & featureNodes, - const std::vector & labelNodes, - const std::vector & criterionNodes, - const std::vector & evaluationNodes, - std::map*>* inputMatrices, - const std::list & learnableNodes, - std::list>& smoothedGradients, - /*out*/ double& epochCriterion, - /*out*/ std::vector& epochEvalErrors, - /*out*/ size_t& totalSamplesSeen, - std::string prefixMsg) - { - // Since we are getting timing resolution of under microsecond we use double precision - // to ensure that we have enough digits to represent small time measurements. 
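
GeneratePaddedFloatOrExpFormat() above builds a printf format at runtime, falling back from %f to %e when the value only prints in exponent notation at the requested precision. A quick standalone check of that behavior (a simplified mirror that only tests for a lowercase 'e'):

    #include <cstdio>
    #include <cstring>

    static const char* FormatFor(double value, char (&format)[16], int padSize, int precision)
    {
        char buffer[512];
        sprintf(format, "%%.%dg", precision);
        sprintf(buffer, format, value);                 // render once to see which notation %g picks
        sprintf(format, strchr(buffer, 'e') ? "%%%d.%de" : "%%%d.%df", padSize, precision);
        return format;
    }

    int main()
    {
        char fmt[16];
        printf(FormatFor(0.00123456, fmt, 11, 8), 0.00123456);   // fixed notation
        printf("\n");
        printf(FormatFor(1.23456e-9, fmt, 11, 8), 1.23456e-9);   // exponent notation
        printf("\n");
        return 0;
    }
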
- double totalTimeInMBs = 0; - double epochCriterionLastMBs = 0; - - int numSamplesLastMBs = 0; - std::vector epochEvalErrorsLastMBs(epochEvalErrors.size(), 0); - - // initialize statistics - size_t totalEpochSamples = 0; - - int numMBsRun = 0; - - size_t numEvalNodes = epochEvalErrors.size(); - - // NOTE: the following two local matrices are not used in distGradAgg path - // assume only one training criterion node for each epoch - - Matrix localEpochCriterion(1, 1, net.GetDeviceId()); - Matrix localEpochEvalErrors(1, numEvalNodes, net.GetDeviceId()); - - localEpochCriterion.SetValue(0); - localEpochEvalErrors.SetValue(0); - - bool useGradientAggregation = ((m_parallelizationMethod == ParallelizationMethod::DataParallelSGD) && - (epochNumber >= m_parallelizationStartEpochNum)); - bool useModelAveraging = ((m_parallelizationMethod == ParallelizationMethod::ModelAveragingSGD) && - (epochNumber >= m_parallelizationStartEpochNum)); - bool useParallelTrain = useGradientAggregation || useModelAveraging; - - // MA-related variables - size_t nSamplesSinceLastModelSync = 0; - size_t nSynced = 0; - float nSecondsOnMASync = 0; - float nSecondsSinceLastMAPerfReport = 0; - - if (useGradientAggregation) - { - epochCriterion = double(0.0); - epochEvalErrors.assign(numEvalNodes, double(0.0)); - } - - Profiler profiler(m_numMBsToCUDAProfile); - - // resetting this, so profiling is performed for one epoch only - m_numMBsToCUDAProfile = 0; - - bool useDistributedMBReading = useParallelTrain && - m_enableDistributedMBReading && - trainSetDataReader->SupportsDistributedMBRead(); - if (useDistributedMBReading) - { - trainSetDataReader->StartDistributedMinibatchLoop(tunedMBSize, epochNumber, g_mpi->CurrentNodeRank(), g_mpi->NumNodesInUse(), m_epochSize); - } - else - { - trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, m_epochSize); - } - - AttemptUtteranceDerivativeFeatures(net, trainSetDataReader, featureNodes, inputMatrices); - - fprintf(stderr, "\nStarting minibatch loop"); - if (useGradientAggregation) - { - fprintf(stderr, ", DataParallelSGD training (MyRank = %d, NumNodes = %d, NumGradientBits = %d)", (int)g_mpi->CurrentNodeRank(), (int)g_mpi->NumNodesInUse(), (int)m_numGradientBits); - } - - if (useDistributedMBReading) - { - fprintf(stderr, ", Distributed reading is ENABLED"); - } - fprintf(stderr, ".\n"); - - Timer timer; - timer.Start(); - - // --- MAIN MINIBATCH LOOP - - for (;;) - { - bool wasDataRead = trainSetDataReader->GetMinibatch(*inputMatrices); - - if (useDistributedMBReading) - { - // In case of distributed reading, the current node needs to continue even with a minibatch size of 0 if any - // other node in the group has a non-zero size minibatch to process. This is needed to ensure that - // the gradient aggregation barriers do not get stuck and also to ensure that all nodes update their weights - // properly using the aggregate gradients from other nodes before moving on to the next epoch even though the current - // node itself may not have any gradient contribution. - std::array numNodesWithDataToProcess; - numNodesWithDataToProcess[0] = wasDataRead ? 
1 : 0; - g_mpi->AllReduce(numNodesWithDataToProcess); - - if (numNodesWithDataToProcess[0] == 0) - { - break; - } - } - else if (!wasDataRead) - { - break; - } - - size_t actualMBSize = 0; - if (wasDataRead) - { - size_t nSlices = trainSetDataReader->GetNumParallelSequences(); - MBLayoutPtr pMBLayout; - if (!useDistributedMBReading && useParallelTrain) - { - // TODO: refactor this as a function - if (trainSetDataReader->RequireSentenceSeg()) - { - pMBLayout = make_shared(); // items get filled in - DecimateMinibatchWithSentences(*inputMatrices, - g_mpi->NumNodesInUse(), g_mpi->CurrentNodeRank(), - nSlices, pMBLayout, - trainSetDataReader); - } - else - { - DecimateMinibatch(*inputMatrices, g_mpi->NumNodesInUse(), g_mpi->CurrentNodeRank()); - } - } - - actualMBSize = net.SetActualMiniBatchSizeFromFeatures(); - if (actualMBSize != 0) - { - if (!useDistributedMBReading && useParallelTrain && trainSetDataReader->RequireSentenceSeg()) - { - *net.GetMBLayoutPtr() = *pMBLayout; - // TODO: ^^ we should just pass pointers; this current code is semantically identical to before the change to MBLayout - net.VerifyActualNumParallelSequences(nSlices); - } - else - { - trainSetDataReader->CopyMBLayoutTo(net.GetMBLayoutPtr()); - net.VerifyActualNumParallelSequences(nSlices); - } - - nSamplesSinceLastModelSync += actualMBSize; - - ComputationNetwork::UpdateEvalTimeStamps(featureNodes); - ComputationNetwork::UpdateEvalTimeStamps(labelNodes); - -#ifndef EVALDLL - if (m_doGradientCheck && GradientCheck(net, criterionNodes, learnableNodes, 0) == false) - LogicError("cannot pass gradient checker"); -#endif - // TODO: currently only support one node regularization - if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) - { -#if 1 - size_t actualMBSize2 = refNet.SetActualMiniBatchSizeFromFeatures(); - if (actualMBSize2 != actualMBSize) - LogicError("TrainOneEpoch: refNet has different MB size than main net??"); -#else - refNet.SetActualMiniBatchSize(actualMBSize); // TODO: SetActualMiniBatchSizeFromFeatures() should have the same result, no? -#endif - *refNet.GetMBLayoutPtr() = *net.GetMBLayoutPtr(); // TODO: This is UNTESTED (before this was missing, seemingly inconsistently) - refNet.VerifyActualNumParallelSequences(trainSetDataReader->GetNumParallelSequences()); - - refNet.Evaluate(refNode); - Matrix::ScaleAndAdd((ElemType)m_adaptationRegWeight, - dynamic_pointer_cast>(refNode)->FunctionValues(), - (ElemType)(1.0 - m_adaptationRegWeight), - dynamic_pointer_cast>(labelNodes[0])->FunctionValues()); - } - - //compute eval node first since when gradient is computed the forward function values - //may be changed and need to be recomputed when gradient and function value share the same matrix - for (size_t i = 0; i < numEvalNodes; i++) - { - net.Evaluate(evaluationNodes[i]); - } - - // only compute gradient when learning rate is large enough - if (learnRatePerSample > m_minLearnRate * 0.01) - { - // use only the first criterion. Is there any possibility to use more? - net.ComputeGradient(criterionNodes[0]); - } - else - { - // use only the first criterion. Is there any possibility to use more? - net.Evaluate(criterionNodes[0]); - } - } - } - - //for now since we share the same label masking flag we call this on the network. 
- //Later, when we apply different labels on different nodes
- //we need to add code to call this function multiple times, one for each criterion node
- size_t numSamplesWithLabel = net.GetNumSamplesWithLabel(actualMBSize);
-
- // Sum of actualMBSize across all nodes when using parallel training
- size_t aggregateNumSamples = actualMBSize;
- size_t aggregateNumSamplesWithLabel = numSamplesWithLabel;
-
- //distributed gradient aggregation
- if (!useGradientAggregation)
- {
- if (actualMBSize != 0)
- {
- Matrix<ElemType>::AddElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(criterionNodes[0])->FunctionValues(), 0, 0, localEpochCriterion, 0, 0);
- for (size_t i = 0; i < numEvalNodes; i++)
- Matrix<ElemType>::AddElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(evaluationNodes[i])->FunctionValues(), 0, 0, localEpochEvalErrors, 0, i);
- }
- }
- else
- {
- LazyInitDistGradAgg(learnableNodes, numEvalNodes, m_traceLevel);
-
- //prepare the header
- m_gradHeader->numEvalNode = numEvalNodes;
- m_gradHeader->numSamples = actualMBSize;
- m_gradHeader->numSamplesWithLabel = numSamplesWithLabel;
- m_gradHeader->criterion = wasDataRead ? criterionNodes[0]->Get00Element() : 0.0;
- for (size_t i = 0; i < numEvalNodes; i++)
- m_gradHeader->evalErrors[i] = wasDataRead ? evaluationNodes[i]->Get00Element() : 0.0;
-
- m_distGradAgg->AggregateGradients(m_gradHeader, epochNumber);
-
- aggregateNumSamples = m_gradHeader->numSamples;
- aggregateNumSamplesWithLabel = m_gradHeader->numSamplesWithLabel;
- epochCriterion += m_gradHeader->criterion;
- for (size_t i = 0; i < numEvalNodes; i++)
- epochEvalErrors[i] += m_gradHeader->evalErrors[i];
- }
-
- //update model parameters
- if ((aggregateNumSamples > 0) && (learnRatePerSample > m_minLearnRate * 0.01))
- {
- auto smoothedGradientIter = smoothedGradients.begin();
- for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, smoothedGradientIter++)
- {
- ComputationNodeBasePtr node = *nodeIter;
- Matrix<ElemType>& smoothedGradient = *smoothedGradientIter;
-
- UpdateWeights(node, smoothedGradient, learnRatePerSample,
- m_momentumPerSample[epochNumber], aggregateNumSamples,
- m_L2RegWeight, m_L1RegWeight,
- m_needAveMultiplier);
- }
- }
-
- if (useModelAveraging && (g_mpi->NumNodesInUse() > 1))
- {
- size_t processedSamples = 0;
- float secondsSinceLastSyncFinished = 0;
- float secondsSpentOnSync = 0;
- if (ModelAveragingProcessing(nSamplesSinceLastModelSync, learnableNodes, processedSamples,
- secondsSinceLastSyncFinished, secondsSpentOnSync))
- {
- // if a sync happens, do some extra work
- nSamplesSinceLastModelSync = 0;
- nSynced++;
-
- nSecondsOnMASync += secondsSpentOnSync;
- nSecondsSinceLastMAPerfReport += secondsSinceLastSyncFinished;
-
- if (m_iMASyncStatsTrace > 0)
- {
- if (nSynced % m_iMASyncStatsTrace == 0)
- {
- fprintf(stderr, "\t\t-----(model averaging stats) %d-th sync, %8.2f seconds since last report, %5.2f seconds on communication\n",
- (int)nSynced, nSecondsSinceLastMAPerfReport, nSecondsOnMASync);
- nSecondsOnMASync = 0;
- nSecondsSinceLastMAPerfReport = 0;
- }
- }
- }
- aggregateNumSamplesWithLabel = processedSamples;
- }
-
- timer.Stop();
- numMBsRun++;
- if (m_traceLevel > 0)
- {
- totalTimeInMBs += timer.ElapsedSeconds();
- numSamplesLastMBs += useModelAveraging ?
int(actualMBSize) : int(aggregateNumSamplesWithLabel); - - if (numMBsRun % m_numMBsToShowResult == 0) - { - // get the epoch Values updated - if (!useGradientAggregation) - { - timer.Restart(); - epochCriterion = localEpochCriterion.Get00Element(); - for (size_t i = 0; i < numEvalNodes; i++) - epochEvalErrors[i] = localEpochEvalErrors(0, i); - timer.Stop(); - - // Add the last trailing compute - totalTimeInMBs += timer.ElapsedSeconds(); - } - - double trainLossPerSample = (epochCriterion - epochCriterionLastMBs) / numSamplesLastMBs; - string formatString = "%s Epoch[%2d of %d]-Minibatch[%4d-%4d of %d]: SamplesSeen = %d; TrainLossPerSample = " + - GeneratePaddedFloatOrExpFormat(11, 8, trainLossPerSample) + "; "; - fprintf(stderr, formatString.c_str(), - prefixMsg.c_str(), epochNumber + 1, m_maxEpochs, numMBsRun - m_numMBsToShowResult + 1, - numMBsRun, epochSize / tunedMBSize, numSamplesLastMBs, trainLossPerSample); - - for (size_t i = 0; i < numEvalNodes; i++) - { - double evalError = (epochEvalErrors[i] - epochEvalErrorsLastMBs[i]) / numSamplesLastMBs; - formatString = "EvalErr[%lu]PerSample = " + GeneratePaddedFloatOrExpFormat(0, 8, evalError) + "; "; - fprintf(stderr, formatString.c_str(), i, evalError); - } - - double totalTimePerSample = (1000.0 * totalTimeInMBs) / numSamplesLastMBs; - formatString = "TotalTime = " + GeneratePaddedFloatOrExpFormat(0, 5, totalTimeInMBs) + "s; TotalTimePerSample = " + - GeneratePaddedFloatOrExpFormat(0, 5, totalTimePerSample) + "ms; SamplesPerSecond = %d\n"; - fprintf(stderr, formatString.c_str(), - totalTimeInMBs, totalTimePerSample, - static_cast(numSamplesLastMBs / totalTimeInMBs)); - - fflush(stderr); - - // reset statistics - totalTimeInMBs = 0; - numSamplesLastMBs = 0; - - epochCriterionLastMBs = epochCriterion; - for (size_t i = 0; i < numEvalNodes; i++) - epochEvalErrorsLastMBs[i] = epochEvalErrors[i]; - - if (std::isnan(epochCriterion)) - RuntimeError("The training criterion is not a number (NAN). Stop\n"); - } - } - - timer.Restart(); - totalEpochSamples += aggregateNumSamplesWithLabel; - totalSamplesSeen += aggregateNumSamplesWithLabel; - - if (totalEpochSamples >= epochSize) - break; - - // call DataEnd function - // DataEnd does reader specific process if sentence ending is reached - trainSetDataReader->DataEnd(endDataSentence); - - // Tries to set up derivative features for the next utterance. 
- AttemptUtteranceDerivativeFeatures(net, trainSetDataReader, featureNodes, inputMatrices);
-
- profiler.NextSample();
- }
-
- // --- END MAIN MINIBATCH LOOP
-
- if (useModelAveraging && (g_mpi->NumNodesInUse() > 1))
- {
- // the model may not be synced after the epoch finished, so do the sync here
- int residualSamples = (int)nSamplesSinceLastModelSync;
- g_mpi->AllReduce(&residualSamples, 1);
- totalSamplesSeen += residualSamples;
- totalEpochSamples += residualSamples;
- ModelAveragingSync(nSamplesSinceLastModelSync, learnableNodes);
- nSynced++;
- nSamplesSinceLastModelSync = 0;
- }
-
- if (useGradientAggregation)
- {
- epochCriterion /= float(totalEpochSamples);
- for (size_t i = 0; i < numEvalNodes; i++)
- epochEvalErrors[i] /= totalEpochSamples;
- }
- else
- {
- localEpochCriterion /= float(totalEpochSamples);
- localEpochEvalErrors /= float(totalEpochSamples);
-
- epochCriterion = localEpochCriterion.Get00Element();
- for (size_t i = 0; i < numEvalNodes; i++)
- epochEvalErrors[i] = localEpochEvalErrors(0, i);
- }
-
-
- if (useModelAveraging && (g_mpi->NumNodesInUse() > 1))
- {
- // merge epochCriterion and epochEvalErrors over nodes
- g_mpi->AllReduce(&epochCriterion, 1);
- g_mpi->AllReduce(epochEvalErrors);
- }
- return totalEpochSamples;
- }
-
- template <class ElemType>
- void SGD<ElemType>::LazyInitDistGradAgg(const std::list<ComputationNodeBasePtr>& learnableNodes, int numEvalNodes, int traceLevel)
- {
- if (m_parallelizationMethod == ParallelizationMethod::DataParallelSGD)
- {
- if (m_distGradAgg == nullptr)
- {
- std::vector<Matrix<ElemType>*> learnParamsGradients;
- learnParamsGradients.reserve(learnableNodes.size());
- for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
- {
- ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
- learnParamsGradients.push_back(&(node->GradientValues()));
- }
-
- m_distGradAgg = new AllReduceDistGradAggregator<ElemType>(learnParamsGradients, numEvalNodes, m_numGradientBits, g_mpi, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, traceLevel);
- }
-
- if (m_gradHeader == nullptr)
- {
- m_gradHeader = DistGradHeader::Create(numEvalNodes);
- }
- }
- }
-
- template <class ElemType>
- bool SGD<ElemType>::ModelAveragingProcessing(size_t nSamplesSinceLastSync, const std::list<ComputationNodeBasePtr>& learnableNodes, size_t& nProcessedFrames,
- float& SecondsSinceLastSyncFinished, float& SecondsSpentOnSync)
- {
- //////////////////////////////////////////////////////////////////////////
- // the current strategy is that after each minibatch, we will sync between processors
- // to decide whether a sync needs to be performed. This is definitely not optimal,
- // which we will fix later.
-
- // TODO: the way we handle the timer is not very good
- //////////////////////////////////////////////////////////////////////////
- static bool first = true;
- static Timer MAtimer;
- if (first)
- {
- MAtimer.Start();
- first = false;
- }
-
- char bNeedToSync = (char)0; // use char for bool
- if (g_mpi->IsMainNode() && nSamplesSinceLastSync >= m_nFramesBetweenMASync)
- {
- // only the main node can decide whether a sync needs to be performed
- bNeedToSync = (char)1;
- }
- g_mpi->Bcast(&bNeedToSync, 1, g_mpi->MainNodeRank());
- if (bNeedToSync)
- {
- MAtimer.Stop();
- double elapsedsec = MAtimer.ElapsedSeconds();
- SecondsSinceLastSyncFinished = first ? 0 : (float) elapsedsec;
- MAtimer.Start();
- nProcessedFrames = ModelAveragingSync((int)nSamplesSinceLastSync, learnableNodes);
- MAtimer.Stop();
- SecondsSpentOnSync = (float)MAtimer.ElapsedSeconds();
-
- MAtimer.Start();
- }
- else
- {
- nProcessedFrames = 0;
- return false;
- }
- return true;
- }
-
- template <class ElemType>
- size_t SGD<ElemType>::ModelAveragingSync(int nSamplesSinceLastSync, const std::list<ComputationNodeBasePtr>& learnableNodes)
- {
- if (g_mpi->NumNodesInUse() <= 1)
- {
- return nSamplesSinceLastSync;
- }
-
- //========================================
- // Sec. 1 calculate factor
- //========================================
- float factor = 0;
- int nTotalSamples = nSamplesSinceLastSync;
- g_mpi->AllReduce(&nTotalSamples, 1);
- if (nTotalSamples <= 0)
- {
- // prepare for overflow
- factor = 1.0f / g_mpi->NumNodesInUse();
- }
- else
- {
- factor = (nSamplesSinceLastSync + 0.0f) / nTotalSamples;
- }
-
- //========================================
- // Sec. 2 sync models based on factor
- // Note: this is suboptimal at the moment:
- // we do the averaging for each node in a sequential manner, i.e.,
- // (node1) GPU->CPU->MPI_AllReduce -> (node2)GPU->CPU->MPI_AllReduce
- // we can improve it by using a pipeline
- // (node1) GPU -> CPU -> MPI_AllReduce
- // (node2) GPU -> CPU -> MPI_AllReduce
- // (node3) GPU -> CPU -> MPI_AllReduce
- //========================================
- for (auto iter = learnableNodes.begin(); iter != learnableNodes.end(); iter++)
- {
- ComputationNodeBasePtr pNode = *iter;
- if (!pNode->NeedGradient())
- continue;
-
- Matrix<ElemType>& mat = dynamic_pointer_cast<ComputationNode<ElemType>>(pNode)->FunctionValues();
- // 1. normalize the weight matrix
- Matrix<ElemType>::Scale(factor, mat);
- // 2. send weight matrix over MPI nodes;
- ElemType* px = mat.CopyToArray();
- size_t nx = mat.GetNumElements();
-
- // 3. inplace sum
- g_mpi->AllReduce(px, nx);
- mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), px);
- // 4.
clean up - delete []px; - } - - return nTotalSamples; - } - -// public: - // UpdateWeightsS - static version of UpdateWeights() - // not static since it wants to access protected methods on the SGD object - template - /*static*/ void SGD::UpdateWeightsS(const SGD* sgd, Matrix& functionValues, - Matrix& gradientValues, - Matrix& smoothedGradient, - const double learnRatePerSample, - const double momentumPerSample, - size_t actualMBSize, - const double L2RegWeight, - const double L1RegWeight, - const bool needAveMultiplier) - { - // we use simple linear (instead of log linear) scaling here - const double momentum = MomentumPerMB(momentumPerSample, actualMBSize); -#if DUMPOUTPUT - fprintf(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n", - learnRatePerSample, momentum, actualMBSize); - fprintf(stderr, "sgd->GradUpdateType()=%d, sgd->GradientUpdateNoiseStd()=%0.8f\n", - sgd->GradUpdateType(), sgd->GradientUpdateNoiseStd()); - gradientValues.Print("Gradient Input"); - smoothedGradient.Print("Smoothed Gradient Input"); -#endif - - // make actualMBSize is a valid value - assert(actualMBSize > 0); - - //clipping gradients to prevent outliers - sgd->ClipGradient(gradientValues, actualMBSize); - - GradientsUpdateType adpType = sgd->GradUpdateType(); - double noiseStd = sgd->GradientUpdateNoiseStd(); - Matrix sgdUpdateNoise((DEVICEID_TYPE)functionValues.GetDeviceId()); - if (noiseStd > 0) - { - // get the gradient structure since gradient is sparse - sgdUpdateNoise.SetValue(gradientValues); - - // reset its value to random - sgdUpdateNoise.SetGaussianRandomValue(0, (ElemType)noiseStd); - } - - // L2 regularizer - if (L2RegWeight > 0) - { - // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample - Matrix::ScaleAndAdd((ElemType)(L2RegWeight * actualMBSize), functionValues, gradientValues); - } - - if (adpType == GradientsUpdateType::None) - { - smoothedGradient.NormalGrad(gradientValues, functionValues, - (ElemType)learnRatePerSample, (ElemType)momentum); - } - else if (adpType == GradientsUpdateType::AdaGrad || - (adpType == GradientsUpdateType::RmsProp && gradientValues.GetMatrixType() == MatrixType::SPARSE)) - { - //rmsprop for sparse is not implemented yet, delegate it with adagrad - - double aveMultiplier = smoothedGradient.Adagrad(gradientValues, needAveMultiplier); - Matrix::ScaleAndAdd((ElemType)(-learnRatePerSample / aveMultiplier), gradientValues, functionValues); - } - else if (adpType == GradientsUpdateType::RmsProp) - { - double aveMultiplier = smoothedGradient.RmsProp(gradientValues, (ElemType)sgd->m_rpi.gamma, - (ElemType)sgd->m_rpi.inc, (ElemType)sgd->m_rpi.max, - (ElemType)sgd->m_rpi.dec, (ElemType)sgd->m_rpi.min, needAveMultiplier); - Matrix::ScaleAndAdd((ElemType)(-learnRatePerSample / aveMultiplier), gradientValues, functionValues); - } - - if (noiseStd > 0) - { - Matrix::ScaleAndAdd(1.0, sgdUpdateNoise, functionValues); - } - - // L1 regularizer with proximal gradient descent method - if (L1RegWeight > 0) - { - // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample - functionValues.InplaceSoftThreshold((ElemType)(learnRatePerSample * L1RegWeight * actualMBSize)); - } - -#if DUMPOUTPUT - functionValues.Print("Parameter Update"); -#endif - } - -// protected: - - // UpdateWeights - update the weights in - template - void SGD::UpdateWeights(const ComputationNodeBasePtr node, - Matrix& smoothedGradient, - const double learnRatePerSample, - const double momentumPerSample, 
- const size_t actualMBSize,
- const double L2RegWeight, const double L1RegWeight,
- const bool needAveMultiplier) const
- {
-#if DUMPOUTPUT
- fprintf(stderr, "Update_%ls\n", node->NodeName().c_str());
-#endif
- UpdateWeightsS(this, dynamic_pointer_cast<ComputationNode<ElemType>>(node)->FunctionValues(), dynamic_pointer_cast<ComputationNode<ElemType>>(node)->GradientValues(),
- smoothedGradient, learnRatePerSample, momentumPerSample,
- actualMBSize, L2RegWeight, L1RegWeight,
- needAveMultiplier);
- node->UpdateEvalTimeStamp();
- }
-
- template <class ElemType>
- void SGD<ElemType>::ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const
- {
- if (m_clippingThresholdPerSample != std::numeric_limits<double>::infinity())
- {
- double maxGradientPerMB = m_clippingThresholdPerSample * actualMBSize;
- if (m_gradientClippingWithTruncation)
- gradient.InplaceTruncate((ElemType)(maxGradientPerMB));
- else
- {
- // norm2 normalized
- double gradientNorm = gradient.FrobeniusNorm();
- if (gradientNorm > maxGradientPerMB)
- {
- double normFactor = maxGradientPerMB / gradientNorm;
- gradient *= (ElemType)normFactor;
- }
- }
- }
- }
-
- template <class ElemType>
- void SGD<ElemType>::SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen,
- const double learnRatePerSample,
- const std::list<Matrix<ElemType>>& smoothedGradients,
- const double prevCriterion,
- const size_t minibatchSize)
- {
- // In case of parallel training only the main node should be saving the checkpoint to prevent
- // the parallel training nodes from colliding while writing the same file
- if ((g_mpi == nullptr) || g_mpi->IsMainNode())
- {
- wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epoch));
- // Saving into a temporary file and then renaming it to the checkPointFileName
- // This is a standard trick to avoid having corrupted checkpoint files if the process dies during writing
- wstring tempFileName = checkPointFileName + L".tmp";
-
- {
- File fstream(tempFileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsWrite);
- fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BCKP");
-
- fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate");
- fstream << totalSamplesSeen << learnRatePerSample << prevCriterion;
- fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ELearnRate");
-
- fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize");
- fstream << minibatchSize;
- fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EMinibatchSize");
-
- fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BGradient");
-
- for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
- {
- const Matrix<ElemType>& smoothedGradient = *smoothedGradientIter;
- fstream << smoothedGradient;
- }
-
- fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EGradient");
-
- fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ECKP");
-
- // Ensuring that data is written
- fstream.Flush();
- }
-
- renameOrDie(tempFileName, checkPointFileName);
- }
- }
-
- template <class ElemType>
- bool SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,
- /*out*/ size_t& totalSamplesSeen,
- /*out*/ double& learnRatePerSample,
- std::list<Matrix<ElemType>>& smoothedGradients,
- /*out*/ double& prevCriterion,
- /*out*/ size_t& minibatchSize)
- {
- wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epochNumber));
- if (!fexists(checkPointFileName.c_str()))
- {
- fprintf(stderr, "Warning: checkpoint file is missing.
learning parameters will be initialized from 0\n"); - return false; - } - - File fstream(checkPointFileName, - FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead); - fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCKP"); - - fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate"); - fstream >> totalSamplesSeen >> learnRatePerSample >> prevCriterion; - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ELearnRate"); - - if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize")) - { - fstream >> minibatchSize; - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EMinibatchSize"); - } - else - { - minibatchSize = m_mbSize[epochNumber]; - } - - fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BGradient"); - - for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++) - { - Matrix& smoothedGradient = *smoothedGradientIter; - fstream >> smoothedGradient; - } - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EGradient"); - - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ECKP"); - - return true; - } - - template - wstring SGD::GetCheckPointFileNameForEpoch(const int epoch) - { - return GetModelNameForEpoch(epoch) + L".ckp"; - } - - template - wstring SGD::GetModelNameForEpoch(const int epoch, bool bLastModel) - { - int epoch1Base = epoch + 1; - if (epoch1Base == m_maxEpochs || bLastModel) - { - return m_modelPath; - } - else - { - wstring w = msra::strfun::wstrprintf(L"%ls.%d", m_modelPath.c_str(), (int)epoch1Base); - return w; - } - - } - - // return -1 if nothing exists - template // TODO: needed? - int SGD::DetermineStartEpoch(const bool makeMode) - { - if (!makeMode) - { - // always start from scratch - return -1; - } - - int firstEpoch = -1; - - wstring curEpochFile = GetModelNameForEpoch(int(m_maxEpochs) - 1); - for (int e = int(m_maxEpochs) - 1; e >= -1; e--) - { - const wstring prevEpochFile = GetModelNameForEpoch(e - 1); - - if (msra::files::fuptodate(curEpochFile, prevEpochFile, false)) - { - firstEpoch = size_t(e) + 1; - break; - } - else - { - curEpochFile = prevEpochFile; - } - } - - return firstEpoch; - } - -#define EPSILON 1e-5 - - template - bool SGD::GradientCheck(ComputationNetwork& net, - const std::vector & criterionNodes, - const std::list & learnableNodes, - int npos) - { - vector errMsgs; - - // gradient checking - for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++) - { - ComputationNodePtr node = dynamic_pointer_cast>(*nodeIter); - char wstrtmp[2048]; - - for (size_t itry = 0; itry < min((size_t)50, node->FunctionValues().GetNumElements()); itry++) - { - /// no support to sparse matrix yet - int irow = (int) fmod(rand(), node->FunctionValues().GetNumRows() - 1); - int icol = (int) fmod(rand(), node->FunctionValues().GetNumCols() - 1); - irow = max(0, irow); - icol = max(0, icol); - - fprintf(stderr, "\n###### d%ls######\n", node->NodeName().c_str()); - - double eOrg = node->FunctionValues()(irow, icol); - node->FunctionValues().TransferToDeviceIfNotThere(net.GetDeviceId(), true); - - node->UpdateEvalTimeStamp(); - - // use only the first criterion. Is - net.ComputeGradient(criterionNodes[npos]); - - if (node->GradientValues().GetMatrixType() == MatrixType::SPARSE) - { - break; - } - - //double mbEvalCri = - //criterionNode should be a scalar - // TODO: why is this value not used? 
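
The check that follows perturbs one randomly chosen weight by ±EPSILON and compares the backpropagated gradient against the central difference (f(w+eps) - f(w-eps)) / (2*eps), accepting the pair if they agree to m_gradientCheckSigDigit significant digits. A self-contained sketch of the same test on a toy criterion:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // Toy criterion f(w) = w^3 with analytic gradient 3*w^2 standing in for the
    // backprop gradient; the threshold matches the formula used below.
    int main()
    {
        const double EPSILON = 1e-5, w = 0.7;
        double eGradErr = 3 * w * w;
        double eGradNum = (pow(w + EPSILON, 3) - pow(w - EPSILON, 3)) / (2 * EPSILON);
        int sigDigits = 6; // plays the role of m_gradientCheckSigDigit
        double threshold = pow(10.0,
            std::max(0.0, ceil(log10(std::min(fabs(eGradErr), fabs(eGradNum))))) - sigDigits);
        printf("BP = %.10g, numeric = %.10g -> %s\n", eGradErr, eGradNum,
               fabs(eGradErr - eGradNum) > threshold ? "WRONG" : "ok");
        return 0;
    }
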
- criterionNodes[npos]->Get00Element();
- double eGradErr = node->GradientValues()(irow, icol);
- node->GradientValues().TransferToDeviceIfNotThere(net.GetDeviceId(), true);
-
- double ePos = eOrg + EPSILON;
- double eNeg = eOrg - EPSILON;
-
- node->FunctionValues()(irow, icol) = (ElemType)ePos;
- node->FunctionValues().TransferToDeviceIfNotThere(net.GetDeviceId(), true);
-
- node->UpdateEvalTimeStamp();
- net.Evaluate(criterionNodes[npos]);
- //criterionNode should be a scalar
-
- double mbEvalCriPos = criterionNodes[npos]->Get00Element(); // TODO: make Get00Element() a function of ComputationNodeBase
-
- node->FunctionValues()(irow, icol) = (ElemType)eNeg;
- node->FunctionValues().TransferToDeviceIfNotThere(net.GetDeviceId(), true);
-
- node->UpdateEvalTimeStamp();
- net.Evaluate(criterionNodes[npos]);
-
- // criterionNode should be a scalar
- double mbEvalCriNeg = criterionNodes[npos]->Get00Element();
-
- // back to its original parameter value
- node->FunctionValues()(irow, icol) = (ElemType)eOrg;
- node->FunctionValues().TransferToDeviceIfNotThere(net.GetDeviceId(), true);
-
- // check if they are consistent
- double eGradNum = ((mbEvalCriPos - mbEvalCriNeg) / (ePos - eNeg));
- double threshold = pow(10.0,
- max(0.0,
- ceil(log10(min(fabs(eGradErr),
- fabs(eGradNum))))) - (int)m_gradientCheckSigDigit);
- double diff = fabs(eGradErr - eGradNum);
- bool wrong = (std::isnan(diff) || diff > threshold);
- if (wrong)
- {
- fprintf(stderr, "\nd%ls Numeric gradient = %e, Error BP gradient = %e\n",
- node->NodeName().c_str(), eGradNum, eGradErr);
- sprintf(wstrtmp, "\nd%ls Numeric gradient = %e, Error BP gradient = %e\n",
- node->NodeName().c_str(), eGradNum, eGradErr);
- errMsgs.push_back(wstrtmp);
- }
- }
- }
-
- return errMsgs.size() == 0;
- }
-
-template class SGD<float>;
-template class SGD<double>;
-
-// TODO: does not build--but part is used directly from CNTK.cpp
-//template class MultiNetworksSGD<float>;
-//template class MultiNetworksSGD<double>;
-
-}}}
+// SGD.cpp -- implements SGD with all bells and whistles, parallelization, randomization, etc.
+
+#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
+
+#include "Basics.h"
+#include "SGD.h"
+#include "AllReduceDistGradAggregator.h"
+
+#include
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+ using namespace std;
+
+ template <class ElemType>
+ void DecimateMinibatch(std::map<std::wstring, MSR::CNTK::Matrix<ElemType>*>& mb, int numProcessor, int myID)
+ {
+ int rank = myID;
+ int procs = numProcessor;
+
+ size_t rv = 0;
+ if (procs > 1)
+ {
+ for (auto it = mb.begin(); it != mb.end(); ++it)
+ {
+ MSR::CNTK::Matrix<ElemType>& mat = *(it->second);
+ size_t nCols = mat.GetNumCols();
+ size_t col_start = (nCols * rank) / procs;
+ size_t col_end = (nCols * (rank + 1)) / procs;
+ if (col_end > nCols)
+ {
+ // this shouldn't happen
+ col_end = nCols;
+ }
+
+ if (col_end == col_start)
+ {
+ MSR::CNTK::Matrix<ElemType> tmp(mat.GetNumRows(), 0, AUTOPLACEMATRIX, DENSE);
+ mat.SetValue(tmp);
+ }
+ else
+ {
+ MSR::CNTK::Matrix<ElemType> tmp = mat.ColumnSlice(col_start, col_end - col_start);
+ mat.SetValue(tmp);
+ }
+
+ if (rv == 0)
+ {
+ rv = mat.GetNumCols();
+ }
+ else
+ {
+ if (rv != mat.GetNumCols())
+ {
+ throw std::logic_error("Uneven number of columns among inputs.");
+ }
+ }
+ }
+ }
+ }
+
+ template <class ElemType>
+ size_t DecimateMinibatchWithSentences(std::map<std::wstring, MSR::CNTK::Matrix<ElemType>*>& mb, /* (input) matrix to be decimated */
+ int rank, int numprocs, /* (input) rank info */
+ size_t& nSlices, /* (input/output): on input, # of parallel sentences in total; on output, # of parallel sentences in this node */
+ MBLayoutPtr pMBLayout, // gets filled in
+ IDataReader<ElemType>* trainDataReader) /* (input) to have access to reader */
+ {
+ // For RNN, an input Matrix is organized in the following way:
+ // | x_t^1 x_t^2 ... x_t^N | .... | x_{t+T-1}^1 ... x_{t+T-1}^N |
+ // |<---- block 1 ---->| .... |<------ block T ----->|
+ // N is the nSlice (input)
+ // The decimation here is to split each block across individual GPUs
+ // So after decimation
+ // | x_t^{st} ... x_t^{en-1}| .... | x_{t+T-1}^{st} ...
x_{t+T-1}^{en-1} | + // Each block now has nSlice/nProcs + // + // Correspondingly, the SentenceBoundary and PackingFlags will be revised + trainDataReader->CopyMBLayoutTo(pMBLayout); // fill this + + size_t rv = 0; + size_t nOrigParallelUtts = nSlices; + static bool warned = false; + if (numprocs > 1) + { + // decide new parallel utterances + size_t sent_start = 0; + size_t sent_end = 0; + if (nOrigParallelUtts % numprocs != 0) + { + if (!warned) + { + /* give a warning of potential bandwidth wasting */ + fprintf(stderr, "WARNING: %d GPUs are used in model averaging, but the number of parallel utterances are %d, a potential training speed degradation.\n", + (int)g_mpi->NumNodesInUse(), (int)nOrigParallelUtts); + warned = true; + } + if (rank == numprocs - 1) + { + nSlices = nOrigParallelUtts - (nOrigParallelUtts / numprocs + 1) * (numprocs - 1); + sent_start = (nOrigParallelUtts / numprocs + 1) * (numprocs - 1); + sent_end = nOrigParallelUtts; + } + else + { + nSlices = nOrigParallelUtts / numprocs + 1; + sent_start = nSlices * rank; + sent_end = nSlices * (rank + 1); + if (sent_end > nOrigParallelUtts) sent_end = nOrigParallelUtts; + } + } + else + { + nSlices = nOrigParallelUtts / numprocs; + sent_start = rank*nSlices; + sent_end = (rank + 1)*nSlices; + if (sent_end > nOrigParallelUtts) sent_end = nOrigParallelUtts; + } + // decimate data + for (auto it = mb.begin(); it != mb.end(); ++it) + { + MSR::CNTK::Matrix &mat = *(it->second); + size_t nCols = mat.GetNumCols(); + + if (nCols % nOrigParallelUtts != 0) + { + // this should not happen for DNN, RNN with truncated BPTT, not sure about other special stuff ... + RuntimeError("ERROR: minibatch size %d, but with %d parallel utterances\n", nCols, nOrigParallelUtts); + } + size_t nBlocks = nCols / nOrigParallelUtts; + // for RNN, nBlocks is the size of truncated BPTT + if (sent_end == sent_start) + { + // should never happen, print debug info + RuntimeError("ERROR: in DecimateMinibatch, col_st=col_en=%d, nCol=%d, nBlock=%d, nParaUtts=%d, nGPU=%d\n", + (int)sent_start, (int)nCols, (int)nBlocks, (int)nOrigParallelUtts, (int)numprocs); + } + + MSR::CNTK::Matrix tmp(mat.GetNumRows(), nSlices*nBlocks, mat.GetPreferredDeviceId(), mat.GetMatrixType()); + + // do the column slice for each block + for (size_t iblock = 0; iblock < nBlocks; iblock++) + { + tmp.SetColumnSlice(mat.ColumnSlice(nOrigParallelUtts*iblock + sent_start, nSlices), + iblock*nSlices, nSlices); + } + mat.SetValue(tmp); + + // assert the cols are even among nodes + if (0 == rv) + { + rv = mat.GetNumCols(); + } + else + { + if (rv != mat.GetNumCols()) + throw std::logic_error("Uneven number of columns among inputs."); + } + } + // revise sentence boundary and packing flags + // TODO: get rid of this explicit matrix, this can be done directly with MBLayout types. 
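
The split above hands each MPI rank a contiguous range of the parallel sentences: when the count divides evenly every rank gets N/procs, otherwise ranks 0..procs-2 take ceil(N/procs) each and the last rank takes the remainder. A standalone sketch of that range computation (illustrative only):

    #include <cstdio>

    static void SentenceRange(size_t nUtts, int rank, int procs, size_t& start, size_t& end)
    {
        if (nUtts % procs == 0)
        {
            size_t per = nUtts / procs;
            start = rank * per; end = start + per;
        }
        else if (rank == procs - 1) // last rank takes what is left
        {
            start = (nUtts / procs + 1) * (procs - 1); end = nUtts;
        }
        else // every other rank takes ceil(nUtts / procs)
        {
            size_t per = nUtts / procs + 1;
            start = rank * per; end = start + per;
        }
    }

    int main()
    {
        // e.g. 10 parallel utterances over 4 ranks -> [0,3) [3,6) [6,9) [9,10)
        for (int r = 0; r < 4; r++)
        {
            size_t s, e;
            SentenceRange(10, r, 4, s, e);
            printf("rank %d: sentences [%zu, %zu)\n", r, s, e);
        }
        return 0;
    }
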
+            size_t nMBSize = pMBLayout->GetSize();
+            Matrix<float> newBoundary(CPUDEVICE);
+            newBoundary.Resize(nSlices, nMBSize);
+            newBoundary.AssignRowSliceValuesOf(pMBLayout->GetM(), sent_start, nSlices);
+            fill(pMBLayout->GetV().begin(), pMBLayout->GetV().end(), MinibatchPackingFlags::None);
+            for (size_t nt = 0; nt < nMBSize; nt++)
+            {
+                for (size_t ns = 0; ns < nSlices; ns++)
+                {
+                    if (newBoundary(ns, nt) == ((int) MinibatchPackingFlags::SequenceStart))
+                        pMBLayout->GetV()[nt] |= MinibatchPackingFlags::SequenceStart;
+                    if (newBoundary(ns, nt) == ((int) MinibatchPackingFlags::SequenceEnd))
+                        pMBLayout->GetV()[nt] |= MinibatchPackingFlags::SequenceEnd;
+                }
+            }
+        }
+
+        return rv;
+    }
+
+    static AdaptationRegType ParseAdaptationRegType(wstring s)
+    {
+        msra::strfun::tolower_ascii(s);
+        if (s == L"" || s == L"none")
+            return AdaptationRegType::None;
+        else if (s == L"kl" || s == L"klreg")
+            return AdaptationRegType::KL;
+        else
+            throw std::invalid_argument("ParseAdaptationRegType: Invalid Adaptation Regularization Type. Valid values are (None | KL)");
+    }
+
+    static GradientsUpdateType ParseGradUpdateType(wstring s)
+    {
+        msra::strfun::tolower_ascii(s);
+        if (s == L"" || s == L"none" || s == L"normal" || s == L"simple")
+            return GradientsUpdateType::None;
+        else if (s == L"adagrad")
+            return GradientsUpdateType::AdaGrad;
+        else if (s == L"rmsprop")
+            return GradientsUpdateType::RmsProp;
+        else if (s == L"fsadagrad")
+            return GradientsUpdateType::FSAdaGrad;
+        else
+            throw std::invalid_argument("ParseGradUpdateType: Invalid Gradient Updating Type. Valid values are (None | AdaGrad | RmsProp | FSAdaGrad)");
+    }
+
+    static ParallelizationMethod ParseParallelizationMethod(wstring s)
+    {
+        msra::strfun::tolower_ascii(s);
+        if ((s == L"") || (s == L"none"))
+            return ParallelizationMethod::None;
+        else if (s == L"dataparallelsgd")
+            return ParallelizationMethod::DataParallelSGD;
+        else if (s == L"modelaveragingsgd")
+            return ParallelizationMethod::ModelAveragingSGD;
+        else
+            throw std::invalid_argument("ParseParallelizationMethod: Invalid Parallelization Method. Valid values are (None | DataParallelSGD | ModelAveragingSGD)");
+    }
+
+    static LearningRateSearchAlgorithm ParseLearningRateSearchType(wstring s)
+    {
+        // TODO: why allow so many variants?
+        msra::strfun::tolower_ascii(s);
+        if (s == L"false" || s == L"none")
+            return LearningRateSearchAlgorithm::None;
+        else if (s == L"searchbeforeepoch" || s == L"beforeepoch" || s == L"before")
+            return LearningRateSearchAlgorithm::SearchBeforeEpoch;
+        else if (s == L"adjustafterepoch" || s == L"afterepoch" || s == L"after")
+            return LearningRateSearchAlgorithm::AdjustAfterEpoch;
+        else
+            throw std::invalid_argument("autoAdjustLR: Invalid learning rate search type. Valid values are (None | SearchBeforeEpoch | AdjustAfterEpoch)");
+    }
+
+    template<class ElemType>
+    SGD<ElemType>::SGD(const ConfigParameters& configSGD)
+    {
+        ConfigArray learningRatesPerMBStr = configSGD("learningRatesPerMB", "");
+        m_needToNormalizeLRByParallUtterance = false;
+        m_needToNormalizeMomentumByParallUtterance = false;
+        floatargvector learningRatesPerMB = learningRatesPerMBStr;
+
+        ConfigArray learningRatesPerSampleStr = configSGD("learningRatesPerSample", "");
+        floatargvector learningRatesPerSample = learningRatesPerSampleStr;
+
+        std::string executionEngineValue = configSGD("executionEngine", "synchronous");
+
+        // AutoAdjust Parameters
+        ConfigParameters configAALR(configSGD("AutoAdjust", ""));
+        LearningRateSearchAlgorithm autoAdjustLRType = ParseLearningRateSearchType(configAALR("autoAdjustLR", "None"));
+        double reduceLearnRateIfImproveLessThan = configAALR("reduceLearnRateIfImproveLessThan", "0");
+        bool continueReduce = (bool) configAALR("continueReduce", "false");
+        size_t learnRateAdjustInterval = (size_t) configAALR("learnRateAdjustInterval", "1");
+        double learnRateDecreaseFactor = configAALR("learnRateDecreaseFactor", "0.618");
+        double increaseLearnRateIfImproveMoreThan = configAALR("increaseLearnRateIfImproveMoreThan", "1#INF");
+        double learnRateIncreaseFactor = configAALR("learnRateIncreaseFactor", "1.382");
+
+        // AutoAdjust: automatic minibatch-size adjustment parameters
+        bool autoAdjustMinibatch = (bool) configAALR("autoAdjustMinibatch", "false");
+        size_t minibatchSizeTuningFrequency = configAALR("minibatchSizeTuningFrequency", "1");
+        size_t minibatchSizeTuningMax = configAALR("minibatchSizeTuningMax", "1048576");
+        size_t minibatchSearchCriterionErrorMargin = configAALR("minibatchSearchCriterionErrorMargin", "1");
+
+        // the number of minibatches used to search the learning rate.
+        // It is typically set to 10-20% of the total number of minibatches in an epoch.
+        ConfigArray minibatch4LRSearch = configAALR("numMiniBatch4LRSearch", "500");
+        intargvector numMiniBatch4LRSearch = minibatch4LRSearch;
+
+        size_t numPrevLearnRates = configAALR("numPrevLearnRates", "5");
+        size_t numBestSearchEpoch = configAALR("numBestSearchEpoch", "1");
+        bool loadBestModel = configAALR("loadBestModel", "true");
+        bool useCVSetControlLRIfCVExists = configAALR("UseCVSetControlLRIfCVExists", "true");
+        bool useEvalCriterionControlLR = configAALR("UseEvalCriterionControlLR", "false");
+
+        ConfigArray minibatchSize = configSGD("minibatchSize", "256");
+        intargvector mbSize = minibatchSize;
+
+        // the number of samples in each epoch (0 means, use all the samples in each epoch).
+        size_t epochSize = configSGD("epochSize", "0");
+
+        // the total number of epochs to run.
+ size_t maxEpochs = configSGD("maxEpochs"); + + ConfigArray momentumPerMBStr = configSGD("momentumPerMB", ""); + floatargvector momentumPerMB = momentumPerMBStr; + + ConfigArray momentumPerSampleStr = configSGD("momentumPerSample", ""); + floatargvector momentumPerSample = momentumPerSampleStr; + + wstring modelPath = configSGD("modelPath"); + wstring trainCriterionNodeName = configSGD("trainCriterionNodeName", ""); + wstring evalCriterionNodeName = configSGD("evalCriterionNodeName", ""); + + size_t maxTempMemSizeInSamplesForCNN = configSGD("maxTempMemSizeInSamplesForCNN", "0"); + + int traceLevel = configSGD("traceLevel", "0"); + size_t numMBsToShowResult = configSGD("numMBsToShowResult", "10"); + size_t numMBsToCUDAProfile = configSGD("numMBsToCUDAProfile", "0"); + + bool keepCheckPointFiles = configSGD("keepCheckPointFiles", "false"); + + bool gradientClippingWithTruncation = configSGD("gradientClippingWithTruncation", "true"); + double clippingThresholdPerSample = configSGD("clippingThresholdPerSample", "1#INF"); + + ConfigArray dropoutRatesStr = configSGD("dropoutRate", "0.0"); + floatargvector dropoutRates = dropoutRatesStr; + + GradientUpdateInfo gUpdateInfo; + GradientsUpdateType gradUpdateType = ParseGradUpdateType(configSGD("gradUpdateType", "None")); + double gaussianNoiseInjecStd = configSGD("gaussianNoiseInjectStd", "0"); + gUpdateInfo.mType = gradUpdateType; + gUpdateInfo.mGaussianNoiseInjectStd = (float) gaussianNoiseInjecStd; + + // extract RMSProp parameters from config, if they exist. Default to reasonable values. + RMSPropInfo rpi; + rpi.dec = (double) configSGD("rms_wgt_dec", "0.75"); + rpi.inc = (double) configSGD("rms_wgt_inc", "1.2"); + rpi.min = (double) configSGD("rms_wgt_min", "0.1"); + rpi.max = (double) configSGD("rms_wgt_max", "10.0"); + rpi.gamma = (double) configSGD("rms_gamma", "0.99"); + + bool needAveMultiplier = (bool) configSGD("normWithAveMultiplier", "true"); + double L2RegWeight = (double) configSGD("L2RegWeight", "0"); + double L1RegWeight = (double) configSGD("L1RegWeight", "0"); + + /// for backward support. 
Future setup should use gradUpdateType=AdaGrad instead of
+        /// useAdagrad=true
+        bool useAdagrad = configSGD("useAdagrad", "false");
+        if (useAdagrad)
+        {
+            gradUpdateType = GradientsUpdateType::AdaGrad;
+            gUpdateInfo.mType = gradUpdateType;
+        }
+
+        AdaptationRegType adaptationRegType = ParseAdaptationRegType(configSGD("adaptationRegType", "None"));
+        double adaptationRegWeight = configSGD("adaptationRegWeight", "0");
+
+        /// gradient check setup
+        bool doGradientCheck = configSGD("gradientcheck", "false");
+        double gradientCheckSigDigit = configSGD("sigFigs", "6");
+
+        if (doGradientCheck && sizeof(ElemType) != sizeof(double))
+            LogicError("Gradient check needs to use precision = double");
+        m_doUnitTest = configSGD("unittest", "false");
+
+        bool validateAfterModelReloading = configSGD("validateAfterModelReloading", "true");
+
+        bool UsingAllDataForPreComputedNode = configSGD("UseAllDataForPreComputedNode", "true");
+
+        // Parallel training
+        m_parallelizationMethod = ParallelizationMethod::None;
+        m_distGradAgg = nullptr;
+        m_gradHeader = nullptr;
+        m_numGradientBits = 32;
+        m_zeroThresholdFor1Bit = true;
+        m_enableDistributedMBReading = false;
+        m_parallelizationStartEpochNum = 0;
+        m_nFramesBetweenMASync = 40000; // default 40k frames
+
+        if ((g_mpi != nullptr) && configSGD.ExistsCurrent("ParallelTrain"))
+        {
+            ConfigParameters configParallelTrain(configSGD("ParallelTrain", ""));
+            m_parallelizationMethod = ParseParallelizationMethod(configParallelTrain("parallelizationMethod", "None"));
+            m_parallelizationStartEpochNum = configParallelTrain("parallelizationStartEpoch", "1");
+            m_parallelizationStartEpochNum -= 1; // epoch numbers internally are 0-based
+            m_enableDistributedMBReading = configParallelTrain("distributedMBReading", "false");
+
+            if (configParallelTrain.ExistsCurrent("DataParallelSGD"))
+            {
+                ConfigParameters configDataParallelSGD(configParallelTrain("DataParallelSGD", ""));
+                const char* defaultGradientBitsStr = (sizeof(ElemType) == sizeof(float)) ? "32" : "64";
+                m_numGradientBits = configDataParallelSGD("gradientBits", defaultGradientBitsStr);
+                m_zeroThresholdFor1Bit = configDataParallelSGD("useZeroThresholdFor1BitQuantization", "true");
+                if ((m_numGradientBits < 1) || (m_numGradientBits > (8 * sizeof(ElemType))))
+                {
+                    throw std::invalid_argument("gradientBits must be in the range [1, 32] when using precision=float and in range [1, 64] when using precision=double!");
+                }
+            }
+
+            if (configParallelTrain.ExistsCurrent("ModelAveragingSGD"))
+            {
+                ConfigParameters configMASGD(configParallelTrain("ModelAveragingSGD", ""));
+                m_nFramesBetweenMASync = configMASGD("SyncFrequencyInFrames", "40000");
+                m_iMASyncStatsTrace = configMASGD("MAPerfStats", "0");
+            }
+        }
+
+        // TODO: the number of parameters of this function is waaay too many!
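+        // For reference, a config fragment with hypothetical values that the ParallelTrain block above would parse:
+        //     ParallelTrain = [
+        //         parallelizationMethod = DataParallelSGD
+        //         parallelizationStartEpoch = 2
+        //         distributedMBReading = true
+        //         DataParallelSGD = [
+        //             gradientBits = 1
+        //             useZeroThresholdFor1BitQuantization = true
+        //         ]
+        //     ]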
+        Init(learningRatesPerMB,
+             learningRatesPerSample,
+             mbSize,
+             epochSize,
+             maxEpochs,
+             modelPath,
+             momentumPerMB,
+             momentumPerSample,
+             gradientClippingWithTruncation,
+             clippingThresholdPerSample,
+             autoAdjustLRType,
+             increaseLearnRateIfImproveMoreThan,
+             learnRateIncreaseFactor,
+             reduceLearnRateIfImproveLessThan,
+             continueReduce,
+             learnRateDecreaseFactor,
+             dropoutRates,
+             loadBestModel,
+             numMiniBatch4LRSearch,
+             numPrevLearnRates,
+             numBestSearchEpoch,
+             traceLevel,
+             numMBsToShowResult,
+             numMBsToCUDAProfile,
+             maxTempMemSizeInSamplesForCNN,
+             gUpdateInfo,
+             keepCheckPointFiles,
+             adaptationRegType,
+             adaptationRegWeight,
+             trainCriterionNodeName,
+             evalCriterionNodeName,
+             doGradientCheck,
+             gradientCheckSigDigit,
+             validateAfterModelReloading,
+             rpi,
+             learnRateAdjustInterval,
+             UsingAllDataForPreComputedNode,
+             needAveMultiplier,
+             L2RegWeight,
+             L1RegWeight,
+             autoAdjustMinibatch,
+             minibatchSizeTuningFrequency,
+             minibatchSizeTuningMax,
+             useCVSetControlLRIfCVExists,
+             useEvalCriterionControlLR,
+             minibatchSearchCriterionErrorMargin);
+    }
+
+    // autoLearnRateSearchType is applied only if the learning rate for the epoch is not specified in learningRatesPerMB and learningRatesPerSample
+    template<class ElemType>
+    void SGD<ElemType>::Init(const floatargvector& learningRatesPerMB,
+                             const floatargvector& learningRatesPerSample,
+                             const intargvector& mbSize,
+                             const size_t epochSize,
+                             const size_t maxEpochs,
+                             const wstring& modelPath,
+                             const floatargvector& momentumPerMB,
+                             const floatargvector& momentumPerSample,
+                             const bool gradientClippingWithTruncation,
+                             const double clippingThresholdPerSample,
+                             const LearningRateSearchAlgorithm autoLearnRateSearchType,
+                             const double increaseLearnRateIfImproveMoreThan,
+                             const double learnRateIncreaseFactor,
+                             const double reduceLearnRateIfImproveLessThan,
+                             const bool continueReduce,
+                             const double learnRateDecreaseFactor,
+                             floatargvector dropoutRates,
+                             const bool loadBestModel,
+                             const intargvector& numMiniBatch4LRSearch,
+                             const size_t numPrevLearnRates,
+                             const size_t numBestSearchEpoch,
+                             const int traceLevel,
+                             const size_t numMBsToShowResult,
+                             const size_t numMBsToCUDAProfile,
+                             const size_t maxTempMemSizeInSamplesForCNN,
+                             const GradientUpdateInfo gradUpdateType,
+                             const bool keepCheckPointFiles,
+                             const AdaptationRegType adaptationRegType,
+                             const double adaptationRegWeight,
+                             const wstring trainCriterionNodeName,
+                             const wstring evalCriterionNodeName,
+                             const bool doGradientCheck,
+                             const double gradientCheckSigDigit,
+                             const bool validateAfterModelReloading,
+                             RMSPropInfo rpi,
+                             size_t learnRateAdjustInterval,
+                             const bool UsingAllDataForPreComputed,
+                             const bool needAveMultiplier,
+                             const double L2RegWeight,
+                             const double L1RegWeight,
+                             const bool autoAdjustMinibatch,
+                             const size_t minibatchSizeTuningFrequency,
+                             const size_t minibatchSizeTuningMax,
+                             const bool useCVSetControlLRIfCVExists,
+                             const bool useEvalCriterionControlLR,
+                             const size_t minibatchSearchCriterionErrorMargin)
+    {
+        m_numPrevLearnRates = numPrevLearnRates;
+        m_prevChosenMinibatchSize = 0;
+        m_autoAdjustMinibatch = autoAdjustMinibatch;
+        m_minibatchSizeTuningMax = minibatchSizeTuningMax;
+        m_minibatchSizeTuningFrequency = minibatchSizeTuningFrequency;
+        m_minibatchSearchCriterionErrorMargin = minibatchSearchCriterionErrorMargin;
+
+        m_mbSize = mbSize;
+
+        // the number of samples in each epoch (0 means, use all the samples in each epoch).
+        m_epochSize = epochSize;
+        if (m_epochSize == 0)
+        {
+            m_epochSize = requestDataSize;
+        }
+
+        // the total number of epochs to run.
+ m_maxEpochs = maxEpochs; + + m_gradientClippingWithTruncation = gradientClippingWithTruncation; + m_modelPath = modelPath; + m_autoLearnRateSearchType = autoLearnRateSearchType; + m_traceLevel = traceLevel; + m_loadBestModel = loadBestModel; + m_increaseLearnRateIfImproveMoreThan = increaseLearnRateIfImproveMoreThan; + m_learnRateIncreaseFactor = learnRateIncreaseFactor; + m_reduceLearnRateIfImproveLessThan = reduceLearnRateIfImproveLessThan; + m_continueReduce = continueReduce; + + //minimum interval is 1 epoch + m_learnRateAdjustInterval = max((size_t) 1, learnRateAdjustInterval); + + m_learnRateDecreaseFactor = learnRateDecreaseFactor; + m_clippingThresholdPerSample = abs(clippingThresholdPerSample); + m_numMiniBatch4LRSearch = numMiniBatch4LRSearch; + m_dropoutRates = dropoutRates; + m_numMBsToShowResult = int(numMBsToShowResult); + m_numMBsToCUDAProfile = int(numMBsToCUDAProfile); + m_numBestSearchEpoch = numBestSearchEpoch; + m_maxTempMemSizeInSamplesForCNN = maxTempMemSizeInSamplesForCNN; + m_gradType = gradUpdateType; + m_rpi = rpi; + m_keepCheckPointFiles = keepCheckPointFiles; + + m_adaptationRegType = adaptationRegType; + m_adaptationRegWeight = adaptationRegWeight; + + m_trainCriterionNodeName = trainCriterionNodeName; + m_evalCriterionNodeName = evalCriterionNodeName; + m_useAllDataForPreComputedNode = UsingAllDataForPreComputed; + + m_needAveMultiplier = needAveMultiplier; + m_L2RegWeight = L2RegWeight; + m_L1RegWeight = L1RegWeight; + + for (size_t i = 0; i < m_mbSize.size(); i++) + { + if (m_epochSize != requestDataSize && m_epochSize < m_mbSize[i]) + { + throw std::invalid_argument("epoch size must be larger than mbsize."); + } + } + + if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None && + (learningRatesPerSample.size() == 0 && learningRatesPerMB.size() == 0)) + { + throw std::invalid_argument("If autoLearnRateSearchType is false " + "you must specify the learningRatesPerSample " + "or learningRatesPerMB parameter."); + } + + if (learningRatesPerSample.size() > 0 && learningRatesPerMB.size() > 0) + { + throw std::invalid_argument("You specified both learningRatesPerSample " + "and learningRatesPerMB. Please comment " + "out one of them."); + } + else if (learningRatesPerSample.size() > 0) + { + m_learningRatesPerSample = learningRatesPerSample; + } + else if (learningRatesPerMB.size() > 0) + { + int LRSize = (int) max(learningRatesPerMB.size(), m_mbSize.size()); + m_learningRatesPerSample.resize(LRSize); + for (int i = 0; i < LRSize; i++) + { + m_learningRatesPerSample[i] = learningRatesPerMB[i] / m_mbSize[i]; + } + m_needToNormalizeLRByParallUtterance = true; + } + + if (momentumPerSample.size() > 0 && momentumPerMB.size() > 0) + { + throw std::invalid_argument("You specified both momentumPerSample " + "and momentumPerMB. 
Please comment "
+                                        "out one of them.");
+        }
+        else if (momentumPerSample.size() > 0)
+        {
+            m_momentumPerSample = momentumPerSample;
+            int momentumVectorSize = m_momentumPerSample.size();
+            for (int i = 0; i < momentumVectorSize; i++)
+            {
+                if ((m_momentumPerSample[i] >= 1) || (m_momentumPerSample[i] < 0))
+                {
+                    throw std::invalid_argument("momentumPerSample must be in [0, 1).");
+                }
+            }
+        }
+        else if (momentumPerMB.size() > 0)
+        {
+            int momentumVectorSize = (int) max(momentumPerMB.size(), m_mbSize.size());
+            m_momentumPerSample.resize(momentumVectorSize);
+            for (int i = 0; i < momentumVectorSize; i++)
+            {
+                if ((momentumPerMB[i] >= 1) || (momentumPerMB[i] < 0))
+                    InvalidArgument("momentumPerMB must be in [0, 1).");
+                m_momentumPerSample[i] = (float) pow(momentumPerMB[i], 1.0 / m_mbSize[i]);
+            }
+
+            m_needToNormalizeMomentumByParallUtterance = true;
+        }
+        else
+        {
+            int momentumVectorSize = m_mbSize.size();
+            m_momentumPerSample.resize(momentumVectorSize);
+            for (int i = 0; i < momentumVectorSize; i++)
+                m_momentumPerSample[i] = (float) pow(0.9f, 1.0 / m_mbSize[i]);
+        }
+
+        if (m_learnRateDecreaseFactor > 1 || m_learnRateIncreaseFactor < 1)
+            InvalidArgument("learnRateIncreaseFactor must be >= 1 and learnRateDecreaseFactor must be <= 1.");
+
+        for (size_t i = 0; i < m_dropoutRates.size(); i++)
+            if (m_dropoutRates[i] >= 1 || m_dropoutRates[i] < 0)
+                InvalidArgument("dropoutRate must be >= 0 and < 1.");
+
+        if (m_adaptationRegWeight > 1 || m_adaptationRegWeight < 0)
+            InvalidArgument("adaptationRegWeight must be in [0 1]");
+
+        m_minLearnRate = 1e-9f;
+
+        m_needAdaptRegularization = false;
+
+        m_doGradientCheck = doGradientCheck;
+        m_gradientCheckSigDigit = gradientCheckSigDigit;
+        m_validateAfterModelReloading = validateAfterModelReloading;
+
+        m_useCVSetControlLRIfCVExists = useCVSetControlLRIfCVExists;
+        m_useEvalCriterionControlLR = useEvalCriterionControlLR;
+
+        msra::files::make_intermediate_dirs(m_modelPath);
+    }
+
+    template<class ElemType>
+    void SGD<ElemType>::Adapt(wstring origModelFileName, wstring refNodeName,
+                              IDataReader<ElemType>* trainSetDataReader,
+                              IDataReader<ElemType>* validationSetDataReader,
+                              const DEVICEID_TYPE deviceID, const bool makeMode)
+    {
+        if (origModelFileName == L"" || trainSetDataReader == nullptr)
+            InvalidArgument("origModel and trainSetDataReader should not be null.");
+
+        int startEpoch = DetermineStartEpoch(makeMode);
+        if (startEpoch == m_maxEpochs)
+        {
+            fprintf(stderr, "Final model exists. No further training is necessary.\n");
+            return;
+        }
+
+        ComputationNetwork net(deviceID);
+        if (startEpoch >= 0)
+        {
+            wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1);
+            fprintf(stderr, "Starting from checkpoint. Load Network From File %ls.\n", modelFileName.c_str());
+            net.LoadFromFile(modelFileName);
+        }
+        else
+        {
+            fprintf(stderr, "Load Network From the original model file %ls.\n", origModelFileName.c_str());
+            net.LoadFromFile(origModelFileName);
+        }
+
+        startEpoch = max(startEpoch, 0);
+
+        ComputationNetwork refNet(deviceID);
+        m_needAdaptRegularization = m_adaptationRegType != AdaptationRegType::None && m_adaptationRegWeight > 0;
+        if (m_needAdaptRegularization)
+        {
+            fprintf(stderr, "Load reference Network From the original model file %ls.\n", origModelFileName.c_str());
+            refNet.LoadFromFile(origModelFileName);
+        }
+
+        ComputationNodeBasePtr refNode;
+        if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL)
+        {
+            fprintf(stderr, "Checking refNodeName %ls.\n", refNodeName.c_str());
+            if (refNodeName == L"")
+                InvalidArgument("refNodeName does not exist and is needed when adaptationRegType is KL.");
+            refNode = refNet.GetNodeFromName(refNodeName);
+        }
+
+        TrainOrAdaptModel(startEpoch, net, refNet, refNode, trainSetDataReader, validationSetDataReader);
+    }
+
+    template<class ElemType>
+    void SGD<ElemType>::SequenceTrain(IComputationNetBuilder<ElemType>* netBuilder, wstring origModelFileName,
+                                      IDataReader<ElemType>* trainSetDataReader, IDataReader<ElemType>* validationSetDataReader,
+                                      const DEVICEID_TYPE deviceID, const bool makeMode)
+    {
+        if (netBuilder == nullptr || origModelFileName == L"" || trainSetDataReader == nullptr)
+            InvalidArgument("netBuilder, origModel and trainSetDataReader should not be null.");
+
+        int startEpoch = DetermineStartEpoch(makeMode);
+        if (startEpoch == m_maxEpochs)
+        {
+            fprintf(stderr, "Final model exists. No further training is necessary.\n");
+            return;
+        }
+
+        // Initializes the model from the original model.
+        ComputationNetwork origNet(deviceID);
+        ComputationNetwork* sequenceNet =
+            (startEpoch < 0) ? netBuilder->BuildNetworkFromDescription() : &origNet;
+        std::vector<ComputationNodeBasePtr> addedFeatureNodes;
+        std::vector<ComputationNodeBasePtr> replacedCriterionNodes;
+        if (startEpoch < 0)
+        {
+            // Loads models.
+            origNet.LoadFromFile(origModelFileName);
+
+            // Processes feature nodes.
+            std::vector<ComputationNodeBasePtr>& sequenceFeatureNodes = sequenceNet->FeatureNodes();
+            for (size_t i = 0; i < sequenceFeatureNodes.size(); ++i)
+            {
+                if (!origNet.NodeNameExist(sequenceFeatureNodes[i]->NodeName()))
+                {
+                    addedFeatureNodes.push_back(sequenceFeatureNodes[i]);
+                    origNet.AddFeatureNode(sequenceFeatureNodes[i]);
+                }
+            }
+
+            // Processes criterion nodes.
+            auto & origCriterionNodes = GetTrainCriterionNodes(origNet);
+            auto & sequenceCriterionNodes = GetTrainCriterionNodes(*sequenceNet);
+            if (origCriterionNodes.size() == 0 || sequenceCriterionNodes.size() == 0)
+            {
+                throw std::runtime_error("Training criterion node does not exist.");
+            }
+            replacedCriterionNodes.push_back(origCriterionNodes[0]);
+            origNet.ReplaceFinalCriterionNode(origCriterionNodes[0]->NodeName(), sequenceCriterionNodes[0]);
+            origNet.ResetEvalTimeStamp();
+        }
+
+        wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1);
+        if (startEpoch >= 0)
+            fprintf(stderr, "Starting from checkpoint. Load Network From File %ls.\n", modelFileName.c_str());
+        else
+            fprintf(stderr, "Load Network From the original model file %ls.\n", origModelFileName.c_str());
+        ComputationNetwork* net = (startEpoch < 0) ? &origNet : netBuilder->LoadNetworkFromFile(modelFileName);
+
+        startEpoch = max(startEpoch, 0);
+
+        TrainOrAdaptModel(startEpoch, *net, *net, nullptr, trainSetDataReader, validationSetDataReader);
+
+        // Handles deletions carefully here.
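+        // (The sequence network borrowed feature nodes from origNet and swapped in its own criterion node;
+        // the block below removes the borrowed nodes again and restores the original criterion node, so
+        // that origNet ends up back in its pre-training shape.)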
+        if (startEpoch < 0)
+        {
+            for (size_t i = 0; i < addedFeatureNodes.size(); ++i)
+                origNet.RemoveFeatureNode(addedFeatureNodes[i]);
+            auto & origCriterionNodes = GetTrainCriterionNodes(origNet);
+            origNet.ReplaceFinalCriterionNode(origCriterionNodes[0]->NodeName(), replacedCriterionNodes[0]);
+        }
+    }
+
+    static double MomentumPerMB(double momentumPerSample, size_t minibatchSize)
+    {
+        return pow(momentumPerSample, minibatchSize);
+    }
+
+    template<class ElemType>
+    void SGD<ElemType>::Train(IComputationNetBuilder<ElemType>* netBuilder,
+                              IDataReader<ElemType>* trainSetDataReader,
+                              IDataReader<ElemType>* validationSetDataReader,
+                              const bool makeMode)
+    {
+        if (netBuilder == nullptr || trainSetDataReader == nullptr)
+            InvalidArgument("netBuilder and trainSetDataReader should not be null.\n");
+        int startEpoch = DetermineStartEpoch(makeMode);
+        if (startEpoch == m_maxEpochs)
+        {
+            fprintf(stderr, "Final model exists. No further training is necessary.\n");
+            return;
+        }
+
+        wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1);
+        if (startEpoch >= 0)
+            fprintf(stderr, "Starting from checkpoint. Load Network From File %ls.\n", modelFileName.c_str());
+
+        ComputationNetwork* net = startEpoch < 0 ? netBuilder->BuildNetworkFromDescription() :
+                                                   netBuilder->LoadNetworkFromFile(modelFileName);
+        // TODO: BUGBUG: if not starting from checkpoint, need to synchronize initial model
+        // strategy should be to run the initializer above on mpiRank==0, and then broadcast parameters.
+
+        /* if (m_doUnitTest)
+        {
+            if (net.UnitTest() == false)
+                LogicError("unit test on decoder network not passed");
+
+            return;
+        }*/
+
+        startEpoch = max(startEpoch, 0);
+        m_needAdaptRegularization = false;
+
+        TrainOrAdaptModel(startEpoch, *net, *net, nullptr, trainSetDataReader, validationSetDataReader);
+    }
+
+// protected:
+
+    // Get{Train,Eval}CriterionNodes() return a reference that is, unfortunately, dependent on the network.
+    // So we hold those inside here. Not very nice. Also not thread-safe. This may go away once we fix sequence-to-sequence models properly.
+    static map<ComputationNetwork*, vector<ComputationNodeBasePtr>> tmpCriterionNodeSets;
+    // TODO: test this, then remove this comment
+
+    template<class ElemType>
+    std::vector<ComputationNodeBasePtr>& SGD<ElemType>::GetTrainCriterionNodes(ComputationNetwork& net)
+    {
+        fprintf(stderr, "GetTrainCriterionNodes %ls ...\n", m_trainCriterionNodeName.c_str());
+        if (!m_trainCriterionNodeName.empty())
+        {
+            tmpCriterionNodeSets[&net] = net.CriterionNodesFrom(m_trainCriterionNodeName);
+            return tmpCriterionNodeSets[&net];
+        }
+        else
+            return net.FinalCriterionNodes();
+    }
+
+    template<class ElemType>
+    std::vector<ComputationNodeBasePtr>& SGD<ElemType>::GetEvalCriterionNodes(ComputationNetwork& net)
+    {
+        fprintf(stderr, "GetEvalCriterionNodes %ls ...\n", m_evalCriterionNodeName.c_str());
+        if (!m_evalCriterionNodeName.empty())
+        {
+            tmpCriterionNodeSets[&net] = net.CriterionNodesFrom(m_evalCriterionNodeName);
+            return tmpCriterionNodeSets[&net];
+        }
+        else
+            return net.EvaluationNodes();
+    }
+
+    template<class ElemType>
+    void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetwork& net,
+                                          ComputationNetwork& refNet,
+                                          ComputationNodeBasePtr refNode,
+                                          IDataReader<ElemType>* trainSetDataReader,
+                                          IDataReader<ElemType>* validationSetDataReader)
+    {
+        auto & featureNodes = net.FeatureNodes();
+        auto & labelNodes = net.LabelNodes();
+        auto & criterionNodes = GetTrainCriterionNodes(net);
+        auto & evaluationNodes = GetEvalCriterionNodes(net);
+
+        std::map<std::wstring, Matrix<ElemType>*>* inputMatrices = new std::map<std::wstring, Matrix<ElemType>*>();
+        for (size_t i = 0; i < featureNodes.size(); i++)
+        {
+            // TODO: instead, remember the nodes directly, to be able to handle both float and double nodes; current version will crash for mixed networks
+            (*inputMatrices)[featureNodes[i]->NodeName()] = &dynamic_pointer_cast<ComputationNode<ElemType>>(featureNodes[i])->FunctionValues();
+        }
+
+        for (size_t i = 0; i < labelNodes.size(); i++)
+        {
+            (*inputMatrices)[labelNodes[i]->NodeName()] = &dynamic_pointer_cast<ComputationNode<ElemType>>(labelNodes[i])->FunctionValues();
+        }
+
+        // used for KLD regularized adaptation. For all other adaptation techniques,
+        // use MEL to edit the model and use the normal training algorithm
+        std::vector<ComputationNodeBasePtr> refFeatureNodes;
+        if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr)
+        {
+            refFeatureNodes.resize(featureNodes.size());
+            for (size_t i = 0; i < featureNodes.size(); i++)
+            {
+                // we need to keep this info to handle deletion
+                refFeatureNodes[i] = refNet.GetNodeFromName(featureNodes[i]->NodeName());
+                refNet.ChangeNode(featureNodes[i]->NodeName(), featureNodes[i]);
+            }
+
+            refNet.RebuildNetwork(refNode);
+        }
+
+        // initializing weights and gradient holder
+        // only one criterion so far TODO: support multiple ones?
+        auto & learnableNodes = net.LearnableNodes(criterionNodes[0]);
+        std::list<Matrix<ElemType>> smoothedGradients;
+
+        for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
+        {
+            ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
+            smoothedGradients.push_back(Matrix<ElemType>(node->FunctionValues().GetNumRows(),
+                                                         node->FunctionValues().GetNumCols(),
+                                                         net.GetDeviceId()));
+        }
+
+        double epochCriterion, avgCriterion, prevCriterion, lrControlCriterion;
+        lrControlCriterion = epochCriterion = avgCriterion = prevCriterion = std::numeric_limits<double>::infinity();
+        size_t epochsNotCountedInAvgCriterion = startEpoch % m_learnRateAdjustInterval;
+
+        std::vector<double> epochEvalErrors(evaluationNodes.size(), std::numeric_limits<double>::infinity());
+
+        std::vector<wstring> evalNodeNames;
+        for (size_t i = 0; i < evaluationNodes.size(); i++)
+            evalNodeNames.push_back(evaluationNodes[i]->NodeName());
+
+        size_t totalSamplesSeen = 0;
+        double learnRatePerSample = 0.5f / m_mbSize[startEpoch];
+
+        double learningRateAdjustmentFactor = 1.0f;
+        vector<double> prevLearnRates;
+        prevLearnRates.resize(m_numPrevLearnRates);
+        for (int i = 0; i < m_numPrevLearnRates; i++)
+            prevLearnRates[i] = -1.0;
+
+        // precompute mean and invStdDev nodes and save initial model
+        if (PreCompute(net, trainSetDataReader, featureNodes, labelNodes, inputMatrices) || startEpoch == 0)
+        {
+            // Synchronize all ranks before writing the model to ensure that
+            // everyone is done loading the model
+            if (g_mpi != nullptr)
+                g_mpi->WaitAll();
+
+            net.SaveToFile(GetModelNameForEpoch(int(startEpoch) - 1));
+        }
+
+        // first, we need to normalize the effect of nbruttsineachrecurrentiter
+        if (trainSetDataReader->GetNumParallelSequences() > 1 && m_needToNormalizeLRByParallUtterance)
+        {
+            for (auto& x : m_learningRatesPerSample)
+                x /= (float) trainSetDataReader->GetNumParallelSequences();
+        }
+
+        // first, we need to normalize the effect of nbruttsineachrecurrentiter for momentum
+        if (trainSetDataReader->GetNumParallelSequences() > 1 && m_needToNormalizeMomentumByParallUtterance)
+        {
+            for (auto& x : m_momentumPerSample)
+                x = (float) pow(x, 1.0 / trainSetDataReader->GetNumParallelSequences());
+        }
+
+        bool learnRateInitialized = false;
+        if (startEpoch > 0)
+        {
+            learnRateInitialized = LoadCheckPointInfo(startEpoch - 1,
+                                                      /*out*/ totalSamplesSeen,
+                                                      /*out*/ learnRatePerSample,
+                                                      smoothedGradients,
+                                                      /*out*/ prevCriterion,
+                                                      /*out*/ m_prevChosenMinibatchSize);
+            if (learnRateInitialized)
+                prevLearnRates[startEpoch % m_numPrevLearnRates] = learnRatePerSample;
+        }
+
+        if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch &&
+            !learnRateInitialized && m_learningRatesPerSample.size() <= startEpoch)
+        {
+            InvalidArgument(
+                "When using \"AdjustAfterEpoch\", there must either exist a checkpoint file, "
+                "or an explicit learning rate must be specified in config for the starting epoch.");
+        }
+
+        unsigned long dropOutSeed = 1;
+        double prevDropoutRate = 0;
+
+        bool learnRateReduced = false;
+
+        ComputationNetwork::SetMaxTempMemSizeForCNN(net, criterionNodes[0], m_maxTempMemSizeInSamplesForCNN);
+        if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr)
+            ComputationNetwork::SetMaxTempMemSizeForCNN(refNet, refNode, m_maxTempMemSizeInSamplesForCNN);
+
+        // --- MAIN EPOCH LOOP
+
+        for (int i = startEpoch; i < (int) m_maxEpochs; i++)
+        {
+            // Synchronize all ranks before proceeding to ensure that
+            // rank 0 has finished writing the previous model file
+            if (g_mpi != nullptr)
+                g_mpi->WaitAll();
+
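+            // Per-epoch flow, as implemented below: set the dropout rate, determine the learning rate
+            // (from the schedule or by search), determine the minibatch size (fixed or adaptive),
+            // run TrainOneEpoch(), log the criteria, optionally evaluate on the CV set, adjust the
+            // learning rate, and write model and checkpoint.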
+            Timer timer;
+            timer.Start();
+
+            // set dropout rate
+            ComputationNetwork::SetDropoutRate(net, criterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
+
+            // learning rate adjustment
+            if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None ||
+                (m_learningRatesPerSample.size() > 0 && m_learningRatesPerSample.size() > i))
+            {
+                learnRatePerSample = m_learningRatesPerSample[i];
+            }
+            else if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
+            {
+                double largestPrevLearnRatePerSample = prevLearnRates[0];
+                for (int j = 1; j < m_numPrevLearnRates; j++)
+                    largestPrevLearnRatePerSample = max(largestPrevLearnRatePerSample, prevLearnRates[j]);
+
+                // return a reasonable learning rate based on the initial minibatchSize
+                double newLearningRatePerSample = SearchForBestLearnRate(net, refNet, refNode, i, learnRatePerSample,
+                                                                         trainSetDataReader, featureNodes, labelNodes,
+                                                                         criterionNodes, evaluationNodes, inputMatrices,
+                                                                         learnableNodes, smoothedGradients,
+                                                                         learnRateInitialized, largestPrevLearnRatePerSample);
+                learningRateAdjustmentFactor = newLearningRatePerSample / learnRatePerSample;
+                learnRatePerSample = newLearningRatePerSample;
+
+                // save per-sample learn rate to support changeable minibatchSize
+                prevLearnRates[i % m_numPrevLearnRates] = learnRatePerSample;
+            }
+
+            learnRateInitialized = true;
+
+            if (learnRatePerSample < m_minLearnRate)
+            {
+                fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training stops.\n",
+                        i + 1, learnRatePerSample, m_minLearnRate);
+                if (m_autoLearnRateSearchType != LearningRateSearchAlgorithm::None)
+                    net.SaveToFile(m_modelPath);
+                break;
+            }
+
+            size_t chosenMinibatchSize;
+            size_t actualMinibatchSize;
+
+            // Through the command line or config file the user can set minibatch sizes on a per-epoch
+            // basis for a set number of epochs. For epochs beyond that point (i.e., from index m_mbSize.size() on),
+            // we either just keep using the last minibatch size, or we use tuning to try to find a better one.
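+            // Example with hypothetical values: minibatchSize = 256:512:1024 in the config gives m_mbSize
+            // three per-epoch entries, so with autoAdjustMinibatch = true the adaptive search below takes
+            // over from the fourth epoch on (i >= m_mbSize.size()).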
+            if (m_autoAdjustMinibatch && i >= m_mbSize.size())
+            {
+                size_t numFramesToUseInSearch = m_numMiniBatch4LRSearch[i] * m_mbSize[i];
+                if (m_epochSize != requestDataSize)
+                {
+                    // ensure that numFramesToUseInSearch does not exceed the total number of frames in the epoch
+                    numFramesToUseInSearch = min(numFramesToUseInSearch, m_epochSize);
+                }
+
+                // use tuning to try and find a better minibatch size
+                chosenMinibatchSize = AdaptiveMinibatchSizing(net, refNet, refNode, i,
+                                                              numFramesToUseInSearch,
+                                                              trainSetDataReader, learnRatePerSample,
+                                                              m_mbSize[i], featureNodes, labelNodes,
+                                                              criterionNodes, evaluationNodes,
+                                                              inputMatrices, learnableNodes,
+                                                              smoothedGradients, learningRateAdjustmentFactor);
+                m_prevChosenMinibatchSize = chosenMinibatchSize;
+            }
+            else
+            {
+                // use the explicitly set minibatch size
+                chosenMinibatchSize = m_mbSize[i];
+            }
+
+            actualMinibatchSize = chosenMinibatchSize;
+            if (trainSetDataReader->GetNumParallelSequences() > 1 && m_needToNormalizeMomentumByParallUtterance)
+                actualMinibatchSize = chosenMinibatchSize * trainSetDataReader->GetNumParallelSequences();
+
+            fprintf(stderr, "Starting Epoch %d: learning rate per sample = %f  momentum = %f\n",
+                    i + 1, learnRatePerSample, MomentumPerMB(m_momentumPerSample[i], actualMinibatchSize));
+
+            TrainOneEpoch(net,
+                          refNet,
+                          refNode,
+                          i,
+                          m_epochSize,
+                          trainSetDataReader,
+                          learnRatePerSample,
+                          chosenMinibatchSize,
+                          featureNodes,
+                          labelNodes,
+                          criterionNodes,
+                          evaluationNodes,
+                          inputMatrices,
+                          learnableNodes, smoothedGradients,
+                          epochCriterion, epochEvalErrors, totalSamplesSeen);
+
+            timer.Stop();
+            double epochTime = timer.ElapsedSeconds();
+
+            if (m_useEvalCriterionControlLR)
+                lrControlCriterion = epochEvalErrors[0];
+            else
+                lrControlCriterion = epochCriterion;
+
+            fprintf(stderr,
+                    "Finished Epoch[%d]: [Training Set] TrainLossPerSample = %.8g; ",
+                    i + 1, epochCriterion);
+            if (epochEvalErrors.size() == 1)
+            {
+                fprintf(stderr,
+                        "EvalErrPerSample = %.8g; Ave LearnRatePerSample = %.10g; EpochTime=%.8g\n",
+                        epochEvalErrors[0], learnRatePerSample, epochTime);
+            }
+            else
+            {
+                fprintf(stderr, "EvalErrPerSample ");
+                for (size_t j = 0; j < epochEvalErrors.size(); j++)
+                    fprintf(stderr, "[%lu]=%.8g; ", j, epochEvalErrors[j]);
+
+                fprintf(stderr, "Ave LearnRatePerSample = %.10g; Epoch Time=%.8g\n",
+                        learnRatePerSample, epochTime);
+
+                fprintf(stderr, "Finished Epoch[%d]: Criterion Node [%ls] Per Sample = %.8g\n",
+                        i + 1, criterionNodes[0]->NodeName().c_str(), epochCriterion);
+
+                for (size_t j = 0; j < epochEvalErrors.size(); j++)
+                {
+                    fprintf(stderr, "Finished Epoch[%d]: Evaluation Node [%ls] Per Sample = %.8g\n",
+                            i + 1, evalNodeNames[j].c_str(), epochEvalErrors[j]);
+                }
+            }
+
+            if ((g_mpi == nullptr) || g_mpi->IsMainNode())
+            {
+                if (validationSetDataReader != trainSetDataReader && validationSetDataReader != nullptr)
+                {
+                    SimpleEvaluator<ElemType> evalforvalidation(net);
+                    vector<wstring> cvSetTrainAndEvalNodes;
+                    cvSetTrainAndEvalNodes.push_back(criterionNodes[0]->NodeName());
+                    cvSetTrainAndEvalNodes.push_back(evaluationNodes[0]->NodeName());
+
+                    vector<double> vScore = evalforvalidation.Evaluate(validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i]);
+                    fprintf(stderr, "Finished Epoch[%d]: [Validation Set] TrainLossPerSample = %.8g; EvalErrPerSample = %.8g\n",
+                            i + 1, vScore[0], vScore[1]);
+
+                    if (m_useCVSetControlLRIfCVExists)
+                    {
+                        if (m_useEvalCriterionControlLR)
+                            lrControlCriterion = vScore[1];
+                        else
+                            lrControlCriterion = vScore[0]; // the first one is the training criterion
+                    }
+                }
+            }
+
+            // broadcast epochCriterion to make sure each processor will have the same learning rate schedule
+            if ((m_parallelizationMethod == ParallelizationMethod::ModelAveragingSGD) && (g_mpi->NumNodesInUse() > 1))
+                g_mpi->Bcast(&epochCriterion, 1, g_mpi->MainNodeRank());
+
+            bool loadedPrevModel = false;
+            size_t epochsSinceLastLearnRateAdjust = i % m_learnRateAdjustInterval + 1;
+            if (avgCriterion == std::numeric_limits<double>::infinity())
+            {
+                avgCriterion = lrControlCriterion;
+            }
+            else
+            {
+                avgCriterion = ((epochsSinceLastLearnRateAdjust - 1 - epochsNotCountedInAvgCriterion) *
+                                avgCriterion + lrControlCriterion) /
+                               (epochsSinceLastLearnRateAdjust - epochsNotCountedInAvgCriterion);
+            }
+
+            if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch &&
+                m_learningRatesPerSample.size() <= i && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval)
+            {
+                if (std::isnan(avgCriterion) || (prevCriterion - avgCriterion < 0 && prevCriterion != std::numeric_limits<double>::infinity()))
+                {
+                    if (m_loadBestModel)
+                    {
+                        net.LoadPersistableParametersFromFile(GetModelNameForEpoch(i - 1),
+                                                              m_validateAfterModelReloading);
+                        net.ResetEvalTimeStamp();
+                        LoadCheckPointInfo(i - 1,
+                                           /*out*/ totalSamplesSeen,
+                                           /*out*/ learnRatePerSample,
+                                           smoothedGradients,
+                                           /*out*/ prevCriterion,
+                                           /*out*/ m_prevChosenMinibatchSize);
+                        fprintf(stderr, "Loaded the previous model which has better training criterion.\n");
+                        loadedPrevModel = true;
+                    }
+                }
+
+                if (m_continueReduce)
+                {
+                    if (std::isnan(avgCriterion) ||
+                        (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion &&
+                         prevCriterion != std::numeric_limits<double>::infinity()))
+                    {
+                        if (learnRateReduced == false)
+                            learnRateReduced = true;
+                        else
+                        {
+                            net.SaveToFile(GetModelNameForEpoch(i, true));
+
+                            fprintf(stderr, "Finished training and saved final model\n\n");
+                            break;
+                        }
+                    }
+
+                    if (learnRateReduced)
+                    {
+                        learnRatePerSample *= m_learnRateDecreaseFactor;
+                        fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
+                    }
+                }
+                else
+                {
+                    if (std::isnan(avgCriterion) ||
+                        (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion &&
+                         prevCriterion != std::numeric_limits<double>::infinity()))
+                    {
+                        learnRatePerSample *= m_learnRateDecreaseFactor;
+                        fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
+                    }
+                    else if (prevCriterion - avgCriterion > m_increaseLearnRateIfImproveMoreThan * prevCriterion &&
+                             prevCriterion != std::numeric_limits<double>::infinity())
+                    {
+                        learnRatePerSample *= m_learnRateIncreaseFactor;
+                        fprintf(stderr, "learnRatePerSample increased to %.8g\n", learnRatePerSample);
+                    }
+                }
+            }
+            else
+            {
+                if (std::isnan(avgCriterion))
+                    RuntimeError("The training criterion is not a number (NAN). Stop\n");
+            }
+
+            // if we did not load the previous model, then set prevCriterion here
+            if (!loadedPrevModel && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval)
+            {
+                prevCriterion = avgCriterion;
+                epochsNotCountedInAvgCriterion = 0;
+            }
+
+            // Synchronize all ranks before proceeding to ensure that
+            // nobody tries reading the checkpoint file at the same time
+            // as rank 0 deleting it below
+            if (g_mpi != nullptr)
+                g_mpi->WaitAll();
+
+            // persist model and check-point info
+            if ((g_mpi == nullptr) || g_mpi->IsMainNode())
+            {
+                net.SaveToFile(GetModelNameForEpoch(i));
+                SaveCheckPointInfo(i, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion, chosenMinibatchSize);
+                if (!m_keepCheckPointFiles)
+                {
+                    // delete previous checkpoint file to save space
+                    _wunlink(GetCheckPointFileNameForEpoch(i - 1).c_str());
+                }
+            }
+
+            if (learnRatePerSample < 1e-12)
+            {
+                fprintf(stderr, "learnRate per sample is reduced to %.8g which is below 1e-12. stop training.\n",
+                        learnRatePerSample);
+            }
+        }
+
+        // --- END OF MAIN EPOCH LOOP
+
+        // since we linked the feature nodes, we need to remove them from the deletion
+        if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr)
+        {
+            for (size_t i = 0; i < refFeatureNodes.size(); i++)
+            {
+                // note we need to handle deletion carefully
+                refNet.ChangeNode(refFeatureNodes[i]->NodeName(), refFeatureNodes[i]);
+            }
+        }
+
+        delete inputMatrices;
+    }
+
+// protected:
+
+    // return true if precomputation is executed.
+    template<class ElemType>
+    bool SGD<ElemType>::PreCompute(ComputationNetwork& net,
+                                   IDataReader<ElemType>* trainSetDataReader,
+                                   std::vector<ComputationNodeBasePtr>& featureNodes,
+                                   std::vector<ComputationNodeBasePtr>& labelNodes,
+                                   std::map<std::wstring, Matrix<ElemType>*>* inputMatrices)
+    {
+        std::list<ComputationNodeBasePtr> nodes = net.GetNodesRequiringPreComputation();
+
+        if (nodes.size() == 0)
+        {
+            fprintf(stderr, "No PreCompute nodes found, skipping PreCompute step\n");
+            return false;
+        }
+
+        fprintf(stderr, "Found %lu PreCompute nodes\n", nodes.size());
+        for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
+        {
+            auto node = static_pointer_cast<PreComputedNode<ElemType>>(*nodeIter);
+            fprintf(stderr, "\tNodeName: %ls\n", (node->NodeName()).c_str());
+        }
+
+        // compute
+        //trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, requestDataSize);
+        //trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, m_epochSize); // only based on one epoch
+        // [1/12/2015 erw] to support large datasets, we usually partition the whole dataset into several epochs,
+        // so we need to use all the data to do precomputing
+        if (m_useAllDataForPreComputedNode)
+        {
+            // using all the data
+            trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0);
+        }
+        else
+        {
+            // using only one epoch's worth of data
+            trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, m_epochSize);
+        }
+
+        while (trainSetDataReader->GetMinibatch(*inputMatrices))
+        {
+            ComputationNetwork::UpdateEvalTimeStamps(featureNodes);
+            ComputationNetwork::UpdateEvalTimeStamps(labelNodes);
+
+            net.SetActualMiniBatchSizeFromFeatures();
+            trainSetDataReader->CopyMBLayoutTo(net.GetMBLayoutPtr());
+            net.VerifyActualNumParallelSequences(trainSetDataReader->GetNumParallelSequences());
+
+            // TODO: Exactly this loop should be INSIDE ComputationNetwork--pass the nodes array instead!
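+            // (Typical PreCompute nodes are the Mean and InvStdDev nodes used for input normalization;
+            // each pass through the loop below lets them accumulate their statistics over one more minibatch.)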
+            for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
+                net.Evaluate(*nodeIter);
+        }
+
+        // mark done
+        for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
+        {
+            auto node = static_pointer_cast<PreComputedNode<ElemType>>(*nodeIter);
+            node->MarkComputed(true);
+        }
+
+        return true;
+    }
+
+    // return a reasonable initial learning rate based on the initial mbsize
+    template<class ElemType>
+    double SGD<ElemType>::SearchForBestLearnRate(ComputationNetwork& net,
+                                                 ComputationNetwork& refNet,
+                                                 const ComputationNodeBasePtr refNode, const int epochNumber,
+                                                 const double curLearnRate,
+                                                 IDataReader<ElemType>* trainSetDataReader,
+                                                 const std::vector<ComputationNodeBasePtr>& featureNodes,
+                                                 const std::vector<ComputationNodeBasePtr>& labelNodes,
+                                                 const std::vector<ComputationNodeBasePtr>& criterionNodes,
+                                                 const std::vector<ComputationNodeBasePtr>& evaluationNodes,
+                                                 std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
+                                                 const std::list<ComputationNodeBasePtr>& learnableNodes,
+                                                 std::list<Matrix<ElemType>>& smoothedGradients,
+                                                 const bool learnRateInitialized,
+                                                 const double largestPrevLearnRatePerSample)
+    {
+        double epochCriterion = std::numeric_limits<double>::infinity();
+        double prevCriterion = std::numeric_limits<double>::infinity();
+        vector<double> epochEvalErrors(evaluationNodes.size(), std::numeric_limits<double>::infinity());
+
+        size_t totalSamplesSeen = 0;
+        double bestLearnRatePerSample = curLearnRate;
+
+        size_t numFramesToUseInSearch = m_numMiniBatch4LRSearch[epochNumber] * m_mbSize[epochNumber];
+        if (m_epochSize != requestDataSize)
+        {
+            // ensure that numFramesToUseInSearch does not exceed the total number of frames in the epoch
+            numFramesToUseInSearch = min(numFramesToUseInSearch, m_epochSize);
+        }
+
+        double baseCriterion;
+
+        double minLearnRate = m_minLearnRate * 0.3f;
+        double learnRatePerSample = 1.0f / 8.0f / 0.618f / sqrt((double) m_mbSize[epochNumber]);
+
+        if (learnRateInitialized && largestPrevLearnRatePerSample > 0)
+        {
+            // largestPrevLearnRatePerSample is per sample; the first 0.618f is for compensation, the second one is for safety
+            learnRatePerSample = largestPrevLearnRatePerSample / 0.618f / 0.618f;
+        }
+
+        int baseModelEpoch = epochNumber - 1;
+        net.LoadPersistableParametersFromFile(GetModelNameForEpoch(baseModelEpoch), m_validateAfterModelReloading);
+        net.ResetEvalTimeStamp();
+
+        double learnRate = learnRatePerSample;
+        size_t dummyMinibatchSize = 0;
+        LoadCheckPointInfo(baseModelEpoch,
+                           /*out*/ totalSamplesSeen,
+                           /*out*/ learnRate,
+                           smoothedGradients,
+                           /*out*/ prevCriterion,
+                           /*out*/ dummyMinibatchSize);
+
+        // if the model is not changed, this is what we will get
+        TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
+                                        numFramesToUseInSearch, trainSetDataReader, 0, m_mbSize[epochNumber],
+                                        featureNodes, labelNodes,
+                                        criterionNodes, evaluationNodes,
+                                        inputMatrices, learnableNodes,
+                                        smoothedGradients, /*out*/ baseCriterion,
+                                        /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen,
+                                        "BaseAdaptiveLearnRateSearch:");
+
+        if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
+        {
+            if (prevCriterion == std::numeric_limits<double>::infinity())
+                prevCriterion = baseCriterion;
+
+            double ratio = 0.3;
+
+            if (m_epochSize != requestDataSize)
+                ratio = pow(((double) numFramesToUseInSearch) / m_epochSize, 1.0f / 2);
+
+            baseCriterion = max(ratio * prevCriterion + (1 - ratio) * baseCriterion, baseCriterion);
+        }
+
+        do
+        {
+            learnRatePerSample *= 0.618;
+            TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
+                                            numFramesToUseInSearch, trainSetDataReader,
+                                            learnRatePerSample, m_mbSize[epochNumber], featureNodes,
+                                            labelNodes, criterionNodes,
+                                            evaluationNodes, inputMatrices,
+                                            learnableNodes, smoothedGradients,
+                                            /*out*/ epochCriterion, /*out*/ epochEvalErrors,
+                                            /*out*/ totalSamplesSeen, "AdaptiveLearnRateSearch:");
+
+        } while (std::isnan(epochCriterion) || (epochCriterion > baseCriterion && learnRatePerSample > minLearnRate));
+
+        bestLearnRatePerSample = learnRatePerSample;
+
+        // grid search for the first m_numBestSearchEpoch epochs
+        if (epochNumber < m_numBestSearchEpoch)
+        {
+            double leftLearnRatePerSample = 0.01 / m_mbSize[epochNumber];
+            double rightLearnRatePerSample = learnRatePerSample;
+            double leftCriterion, rightCriterion = epochCriterion;
+
+            TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
+                                            numFramesToUseInSearch, trainSetDataReader,
+                                            leftLearnRatePerSample, m_mbSize[epochNumber],
+                                            featureNodes, labelNodes,
+                                            criterionNodes, evaluationNodes,
+                                            inputMatrices, learnableNodes,
+                                            smoothedGradients, /*out*/ leftCriterion,
+                                            /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen,
+                                            "DetailBaseAdaptiveLearnRateSearch:");
+
+            while (rightLearnRatePerSample > leftLearnRatePerSample * 1.2)
+            {
+                if (rightCriterion > leftCriterion)
+                {
+                    rightLearnRatePerSample *= 0.618;
+
+                    TrainOneMiniEpochAndReloadModel(net, refNet, refNode,
+                                                    epochNumber, numFramesToUseInSearch,
+                                                    trainSetDataReader,
+                                                    rightLearnRatePerSample, m_mbSize[epochNumber],
+                                                    featureNodes, labelNodes,
+                                                    criterionNodes,
+                                                    evaluationNodes,
+                                                    inputMatrices,
+                                                    learnableNodes,
+                                                    smoothedGradients,
+                                                    /*out*/ rightCriterion,
+                                                    /*out*/ epochEvalErrors,
+                                                    /*out*/ totalSamplesSeen,
+                                                    "DetailRightAdaptiveLearnRateSearch:");
+                }
+                else
+                {
+                    leftLearnRatePerSample /= 0.618;
+
+                    TrainOneMiniEpochAndReloadModel(net, refNet, refNode,
+                                                    epochNumber, numFramesToUseInSearch,
+                                                    trainSetDataReader,
+                                                    leftLearnRatePerSample, m_mbSize[epochNumber],
+                                                    featureNodes, labelNodes,
+                                                    criterionNodes,
+                                                    evaluationNodes,
+                                                    inputMatrices,
+                                                    learnableNodes,
+                                                    smoothedGradients,
+                                                    /*out*/ leftCriterion,
+                                                    /*out*/ epochEvalErrors,
+                                                    /*out*/ totalSamplesSeen,
+                                                    "DetailLeftAdaptiveLearnRateSearch:");
+                }
+            }
+
+            bestLearnRatePerSample = (leftCriterion < rightCriterion) ? leftLearnRatePerSample :
+                                                                        rightLearnRatePerSample;
+        }
+
+        fprintf(stderr, "Best Learn Rate Per Sample for Epoch[%d] = %.10g  baseCriterion=%.10g\n",
+                epochNumber + 1, bestLearnRatePerSample, baseCriterion);
+
+        return bestLearnRatePerSample;
+    }
+
+    template<class ElemType>
+    void SGD<ElemType>::TrainOneMiniEpochAndReloadModel(ComputationNetwork& net,
+                                                        ComputationNetwork& refNet,
+                                                        const ComputationNodeBasePtr refNode, const int epochNumber,
+                                                        const size_t epochSize, IDataReader<ElemType>* trainSetDataReader,
+                                                        const double learnRatePerSample,
+                                                        const size_t minibatchSize,
+                                                        const std::vector<ComputationNodeBasePtr>& featureNodes,
+                                                        const std::vector<ComputationNodeBasePtr>& labelNodes,
+                                                        const std::vector<ComputationNodeBasePtr>& criterionNodes,
+                                                        const std::vector<ComputationNodeBasePtr>& evaluationNodes,
+                                                        std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
+                                                        const std::list<ComputationNodeBasePtr>& learnableNodes,
+                                                        std::list<Matrix<ElemType>>& smoothedGradients,
+                                                        /*out*/ double& epochCriterion,
+                                                        /*out*/ std::vector<double>& epochEvalErrors,
+                                                        /*out*/ size_t& totalSamplesSeen,
+                                                        std::string prefixMsg)
+    {
+        TrainOneEpoch(net, refNet, refNode, epochNumber, epochSize,
+                      trainSetDataReader, learnRatePerSample, minibatchSize, featureNodes,
+                      labelNodes, criterionNodes, evaluationNodes,
+                      inputMatrices, learnableNodes, smoothedGradients,
+                      /*out*/ epochCriterion, /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen,
+                      prefixMsg);
+
+        fprintf(stderr, "Finished Mini-Epoch For LearnRate Selection: TrainLossPerSample = %.8g;", epochCriterion);
+
+        if (epochEvalErrors.size() == 1)
+            fprintf(stderr, "EvalErrPerSample = %.8g; Ave LearnRatePerSample = %.10g\n", epochEvalErrors[0], learnRatePerSample);
+        else
+        {
+            fprintf(stderr, "EvalErrPerSample ");
+            for (size_t i = 0; i < epochEvalErrors.size(); i++)
+                fprintf(stderr, "[%lu] = %.8g; ", i, epochEvalErrors[i]);
+            fprintf(stderr, "Ave LearnRatePerSample = %.10g\n", learnRatePerSample);
+        }
+
+        int baseModelEpoch = epochNumber - 1;
+        net.LoadPersistableParametersFromFile(GetModelNameForEpoch(baseModelEpoch), m_validateAfterModelReloading);
+        net.ResetEvalTimeStamp();
+
+        double dummyLearnRate;
+        double dummyPrevCriterion;
+        size_t dummyMinibatchSize = 0;
+        LoadCheckPointInfo(baseModelEpoch,
+                           /*out*/ totalSamplesSeen,
+                           /*out*/ dummyLearnRate,
+                           smoothedGradients,
+                           /*out*/ dummyPrevCriterion,
+                           /*out*/ dummyMinibatchSize);
+    }
+
+    template<class ElemType>
+    size_t SGD<ElemType>::AdaptiveMinibatchSizing(ComputationNetwork& net,
+                                                  ComputationNetwork& refNet,
+                                                  const ComputationNodeBasePtr refNode,
+                                                  const int epochNumber,
+                                                  const size_t numFramesToUseInSearch,
+                                                  IDataReader<ElemType>* trainSetDataReader,
+                                                  const double learnRatePerSample,
+                                                  const size_t initialMinibatchSize,
+                                                  const std::vector<ComputationNodeBasePtr>& featureNodes,
+                                                  const std::vector<ComputationNodeBasePtr>& labelNodes,
+                                                  const std::vector<ComputationNodeBasePtr>& criterionNodes,
+                                                  const std::vector<ComputationNodeBasePtr>& evaluationNodes,
+                                                  std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
+                                                  const std::list<ComputationNodeBasePtr>& learnableNodes,
+                                                  std::list<Matrix<ElemType>>& smoothedGradients,
+                                                  const double learningRateAdjustmentFactor)
+    {
+        size_t minMinibatchSize = initialMinibatchSize;
+        size_t chosenMinibatchSize = initialMinibatchSize;
+
+        // do some pre-adjustment based on the LR
+        // Basically we assume that the LR for epoch 1 is safe for mbsize.
+        // If LR control led to a smaller LR, then we can safely increase the lower bound of the MB size.
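+        // Example with hypothetical values: if the learning rate has dropped to 1/4 of its epoch-0 value
+        // (with learningRateAdjustmentFactor = 1), then learningRateChangeSoFar = 0.25, the sqrt() below
+        // turns that into 0.5, and minMinibatchSize is doubled.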
+        double learningRateChangeSoFar = m_learningRatesPerSample[epochNumber] / m_learningRatesPerSample[0];
+        learningRateChangeSoFar *= learningRateAdjustmentFactor;
+
+        // increasing by the full factor is found to be too aggressive; sqrt() seems more robust
+        learningRateChangeSoFar = sqrt(learningRateChangeSoFar);
+
+        // LR was indeed reduced
+        if (learningRateChangeSoFar < 1.0f)
+        {
+            // we can safely increase MB size (note: this may be bigger than our max)
+            minMinibatchSize = (size_t) (minMinibatchSize / learningRateChangeSoFar);
+        }
+
+        if (epochNumber < 2 && m_prevChosenMinibatchSize != 0)
+        {
+            // newly started training: any previous MB size stored in the model is to be ignored
+            fprintf(stderr, "before epoch 2, previous minibatchSize %zd is "
+                    "considered invalid -> resetting\n", m_prevChosenMinibatchSize);
+            m_prevChosenMinibatchSize = 0;
+        }
+
+        // check if we need to skip
+        if (m_prevChosenMinibatchSize != 0 &&
+            (epochNumber + 1) > m_minibatchSizeTuningFrequency &&
+            (epochNumber + 1) % m_minibatchSizeTuningFrequency != 0)
+        {
+            fprintf(stderr, "AdaptiveMinibatchSearch: Search for a better minibatchSize "
+                    "in epoch %d skipped, keeping minibatchSize of %zd\n",
+                    epochNumber + 1, m_prevChosenMinibatchSize);
+            chosenMinibatchSize = m_prevChosenMinibatchSize;
+        }
+        else
+        {
+            if (m_prevChosenMinibatchSize != 0)
+            {
+                // if m_prevChosenMinibatchSize (the chosen minibatch size for the previous epoch) div 2
+                // is higher than initialMinibatchSize (the minibatch size we start with for this epoch),
+                // then start the search with m_prevChosenMinibatchSize/2 instead of initialMinibatchSize.
+                fprintf(stderr, "AdaptiveMinibatchSearch: Limiting minMinibatchSize to "
+                        "largest of previous minibatchSize = (%d / 2) or %d\n",
+                        (int) m_prevChosenMinibatchSize, (int) minMinibatchSize);
+                minMinibatchSize = max(minMinibatchSize, m_prevChosenMinibatchSize / 2);
+            }
+
+            size_t maxMinibatchSize = m_minibatchSizeTuningMax;
+
+            // only grow at most 2 x compared to the previous step
+            if (m_prevChosenMinibatchSize != 0)
+            {
+                assert(m_prevChosenMinibatchSize >= chosenMinibatchSize);
+
+                fprintf(stderr, "AdaptiveMinibatchSearch: Limiting maxMinibatchSize to "
+                        "previous minibatchSize %zd*2\n", m_prevChosenMinibatchSize);
+                maxMinibatchSize = min(maxMinibatchSize, m_prevChosenMinibatchSize * 2);
+            }
+
+            chosenMinibatchSize = SearchForBestMinibatchSize(net, refNet, refNode, epochNumber,
+                                                             numFramesToUseInSearch, trainSetDataReader,
+                                                             learnRatePerSample, featureNodes,
+                                                             labelNodes, criterionNodes,
+                                                             evaluationNodes, inputMatrices,
+                                                             learnableNodes, smoothedGradients,
+                                                             minMinibatchSize, maxMinibatchSize);
+        }
+
+        return chosenMinibatchSize;
+    }
+
+    static size_t RoundToMultipleOf64(float val)
+    {
+        return 64 * (size_t) ((val + 32) / 64);
+    }
+
+    static size_t RoundToMultipleOf64(size_t val)
+    {
+        return 64 * ((val + 32) / 64);
+    }
+
+    // uses a small percentage of training data of a minibatch to
+    // speculatively train with various MB sizes; then picks the best
+    template<class ElemType>
+    size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetwork& net,
+                                                     ComputationNetwork& refNet,
+                                                     const ComputationNodeBasePtr refNode,
+                                                     const int epochNumber,
+                                                     const size_t numFramesToUseInSearch,
+                                                     IDataReader<ElemType>* trainSetDataReader,
+                                                     const double learnRatePerSample,
+                                                     const std::vector<ComputationNodeBasePtr>& featureNodes,
+                                                     const std::vector<ComputationNodeBasePtr>& labelNodes,
+                                                     const std::vector<ComputationNodeBasePtr>& criterionNodes,
+                                                     const std::vector<ComputationNodeBasePtr>& evaluationNodes,
+                                                     std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
+                                                     const std::list<ComputationNodeBasePtr>& learnableNodes,
+                                                     std::list<Matrix<ElemType>>& smoothedGradients,
+                                                     const size_t minMinibatchSize, const size_t maxMinibatchSize)
+    {
+        // may happen for automatically reduced learning rates
+        if (minMinibatchSize > maxMinibatchSize)
+        {
+            return maxMinibatchSize;
+        }
+
+        size_t trialMinibatchSize = 0;
+        bool isFirstIteration = true;
+        double baseCriterion = 0;
+
+        // increase the minibatch size by a factor of sqrt(2) in each step.
+        const float minibatchSizeTuningFactor = sqrtf(2.0f);
+
+        size_t lastTriedTrialMinibatchSize = 0;
+        double lastTriedTrialEpochCriterion = 0;
+        for (float trialMinibatchSizeFloat = (float) minMinibatchSize;
+             trialMinibatchSizeFloat <= maxMinibatchSize;
+             trialMinibatchSizeFloat *= minibatchSizeTuningFactor)
+        {
+            // round mbsize to something meaningful
+            trialMinibatchSize = RoundToMultipleOf64(trialMinibatchSizeFloat);
+
+            fprintf(stderr, "\nAdaptiveMinibatchSearch: Evaluating trial minibatchSize=%zd out of range %zd..%zd ...\n\n",
+                    trialMinibatchSize, RoundToMultipleOf64(minMinibatchSize), RoundToMultipleOf64(maxMinibatchSize));
+
+            size_t totalSamplesSeen;
+            std::vector<double> epochEvalErrors(evaluationNodes.size(), std::numeric_limits<double>::infinity());
+            double epochCriterion = std::numeric_limits<double>::infinity();
+
+            // Train on a few minibatches so we can observe the epochCriterion as we try increasing
+            // minibatches with each iteration of this loop.
+            TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
+                                            numFramesToUseInSearch, trainSetDataReader,
+                                            learnRatePerSample, trialMinibatchSize, featureNodes,
+                                            labelNodes, criterionNodes,
+                                            evaluationNodes, inputMatrices,
+                                            learnableNodes, smoothedGradients,
+                                            /*out*/ epochCriterion, /*out*/ epochEvalErrors,
+                                            /*out*/ totalSamplesSeen,
+                                            isFirstIteration ? "BaseAdaptiveMinibatchSearch:" :
+                                                               "AdaptiveMinibatchSearch:");
+
+            if (isFirstIteration)
+            {
+                // for the first iteration of the loop only, set baseCriterion
+                // to the result we got from TrainOneMiniEpochAndReloadModel().
+                baseCriterion = epochCriterion;
+                lastTriedTrialMinibatchSize = trialMinibatchSize;
+                lastTriedTrialEpochCriterion = baseCriterion;
+                isFirstIteration = false;
+
+                fprintf(stderr, "AdaptiveMinibatchSearch: Computed BaseCriterion %.10g\n", baseCriterion);
+            }
+            else if (!std::isnan(epochCriterion) &&
+                     (epochCriterion > (baseCriterion * (1.0 + (m_minibatchSearchCriterionErrorMargin / 100.0)))))
+            {
+                // As soon as we see the Criterion (a measure of error) start to get larger than the
+                // Criterion we started with, we stop.
+                // TODO: if this is too sensitive, we can add a margin on the basis of a percentage of
+                // baseCriterion.
+                break;
+            }
+            else
+            {
+                lastTriedTrialMinibatchSize = trialMinibatchSize;
+                lastTriedTrialEpochCriterion = epochCriterion;
+                if (trialMinibatchSizeFloat * minibatchSizeTuningFactor <= maxMinibatchSize)
+                {
+                    fprintf(stderr, "AdaptiveMinibatchSearch: Keep searching... "
+                            "EpochCriterion = %.10g vs BaseCriterion = %.10g\n",
+                            epochCriterion, baseCriterion);
+                }
+            }
+        }
+        fprintf(stderr, "AdaptiveMinibatchSearch: Search successful!!! Chose new minibatchSize of %d. "
+                "EpochCriterion = %.10g vs BaseCriterion = %.10g\n\n",
+                (int) lastTriedTrialMinibatchSize, lastTriedTrialEpochCriterion, baseCriterion);
+
+        return lastTriedTrialMinibatchSize;
+    }
+
+    // Tries to compute derivatives for the whole utterances, which will be
+    // fed to the neural network as features.
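+    // (Presumably only readers that implement GetMinibatchCopy()/SetNetOutput() take part in this;
+    // for all other readers, GetMinibatchCopy() should return false and the loop below is a no-op.)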
+ template + void SGD::AttemptUtteranceDerivativeFeatures(ComputationNetwork& net, + IDataReader* trainSetDataReader, + const std::vector & featureNodes, + std::map*>* inputMatrices) + { + // Tries to read an utterance and run forward computation on the + // whole utterance. + assert(trainSetDataReader != NULL); + std::vector>> uttInfo; + auto pMBLayout = make_shared(); + while (trainSetDataReader->GetMinibatchCopy(uttInfo, *inputMatrices, pMBLayout)) + { + ComputationNetwork::UpdateEvalTimeStamps(featureNodes); + + auto & outputNodes = net.OutputNodes(); + if (outputNodes.empty()) + LogicError("no output node was found."); + + net.SetActualMiniBatchSizeFromFeatures(); + trainSetDataReader->CopyMBLayoutTo(net.GetMBLayoutPtr()); + net.VerifyActualNumParallelSequences(trainSetDataReader->GetNumParallelSequences()); + net.Evaluate(outputNodes[0]); // Only evaluate the first output + trainSetDataReader->SetNetOutput(uttInfo, + dynamic_pointer_cast>(outputNodes[0])->FunctionValues(), + pMBLayout); + } + } + + static string GeneratePaddedFloatOrExpFormat(int padSize, int precision, double value) + { + char format[16]; + char buffer[512]; + + sprintf(format, "%%.%dg", precision); + sprintf(buffer, format, value); + + for (int i = 0; i < strlen(buffer); i++) + { + if (buffer[i] == 'e' || buffer[i] == 'E') + { + sprintf(format, "%%%d.%de", padSize, precision); + return format; + } + } + sprintf(format, "%%%d.%df", padSize, precision); + return format; + } + + template + size_t SGD::TrainOneEpoch(ComputationNetwork& net, + ComputationNetwork& refNet, + const ComputationNodeBasePtr refNode, + const int epochNumber, + const size_t epochSize, + IDataReader* trainSetDataReader, + const double learnRatePerSample, + size_t tunedMBSize, + const std::vector & featureNodes, + const std::vector & labelNodes, + const std::vector & criterionNodes, + const std::vector & evaluationNodes, + std::map*>* inputMatrices, + const std::list & learnableNodes, + std::list>& smoothedGradients, + /*out*/ double& epochCriterion, + /*out*/ std::vector& epochEvalErrors, + /*out*/ size_t& totalSamplesSeen, + std::string prefixMsg) + { + // Since we are getting timing resolution of under microsecond we use double precision + // to ensure that we have enough digits to represent small time measurements. 
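
To see why single precision would not do for the timing accumulators introduced below: a float carries only about seven significant decimal digits, so adding a microsecond-scale measurement to an hour-scale running total is a no-op. A minimal illustration (not part of the patch):

    #include <cstdio>

    int main()
    {
        float  f = 3600.0f;    // one hour of accumulated time, in seconds
        double d = 3600.0;
        f += 5e-6f;            // add a 5-microsecond measurement
        d += 5e-6;
        printf("float:  %.9f\n", f);  // still 3600.000000000 -- the sample was lost
        printf("double: %.9f\n", d);  // 3600.000005000 -- double keeps it
        return 0;
    }
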
+ double totalTimeInMBs = 0; + double epochCriterionLastMBs = 0; + + int numSamplesLastMBs = 0; + std::vector epochEvalErrorsLastMBs(epochEvalErrors.size(), 0); + + // initialize statistics + size_t totalEpochSamples = 0; + + int numMBsRun = 0; + + size_t numEvalNodes = epochEvalErrors.size(); + + // NOTE: the following two local matrices are not used in distGradAgg path + // assume only one training criterion node for each epoch + + Matrix localEpochCriterion(1, 1, net.GetDeviceId()); + Matrix localEpochEvalErrors(1, numEvalNodes, net.GetDeviceId()); + + localEpochCriterion.SetValue(0); + localEpochEvalErrors.SetValue(0); + + bool useGradientAggregation = ((m_parallelizationMethod == ParallelizationMethod::DataParallelSGD) && + (epochNumber >= m_parallelizationStartEpochNum)); + bool useModelAveraging = ((m_parallelizationMethod == ParallelizationMethod::ModelAveragingSGD) && + (epochNumber >= m_parallelizationStartEpochNum)); + bool useParallelTrain = useGradientAggregation || useModelAveraging; + + // MA-related variables + size_t nSamplesSinceLastModelSync = 0; + size_t nSynced = 0; + float nSecondsOnMASync = 0; + float nSecondsSinceLastMAPerfReport = 0; + + if (useGradientAggregation) + { + epochCriterion = double(0.0); + epochEvalErrors.assign(numEvalNodes, double(0.0)); + } + + Profiler profiler(m_numMBsToCUDAProfile); + + // resetting this, so profiling is performed for one epoch only + m_numMBsToCUDAProfile = 0; + + bool useDistributedMBReading = useParallelTrain && + m_enableDistributedMBReading && + trainSetDataReader->SupportsDistributedMBRead(); + if (useDistributedMBReading) + { + trainSetDataReader->StartDistributedMinibatchLoop(tunedMBSize, epochNumber, g_mpi->CurrentNodeRank(), g_mpi->NumNodesInUse(), m_epochSize); + } + else + { + trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, m_epochSize); + } + + AttemptUtteranceDerivativeFeatures(net, trainSetDataReader, featureNodes, inputMatrices); + + fprintf(stderr, "\nStarting minibatch loop"); + if (useGradientAggregation) + { + fprintf(stderr, ", DataParallelSGD training (MyRank = %d, NumNodes = %d, NumGradientBits = %d)", (int)g_mpi->CurrentNodeRank(), (int)g_mpi->NumNodesInUse(), (int)m_numGradientBits); + } + + if (useDistributedMBReading) + { + fprintf(stderr, ", Distributed reading is ENABLED"); + } + fprintf(stderr, ".\n"); + + Timer timer; + timer.Start(); + + // --- MAIN MINIBATCH LOOP + + for (;;) + { + bool wasDataRead = trainSetDataReader->GetMinibatch(*inputMatrices); + + if (useDistributedMBReading) + { + // In case of distributed reading, the current node needs to continue even with a minibatch size of 0 if any + // other node in the group has a non-zero size minibatch to process. This is needed to ensure that + // the gradient aggregation barriers do not get stuck and also to ensure that all nodes update their weights + // properly using the aggregate gradients from other nodes before moving on to the next epoch even though the current + // node itself may not have any gradient contribution. + std::array numNodesWithDataToProcess; + numNodesWithDataToProcess[0] = wasDataRead ? 
1 : 0; + g_mpi->AllReduce(numNodesWithDataToProcess); + + if (numNodesWithDataToProcess[0] == 0) + { + break; + } + } + else if (!wasDataRead) + { + break; + } + + size_t actualMBSize = 0; + if (wasDataRead) + { + size_t nSlices = trainSetDataReader->GetNumParallelSequences(); + MBLayoutPtr pMBLayout; + if (!useDistributedMBReading && useParallelTrain) + { + // TODO: refactor this as a function + if (trainSetDataReader->RequireSentenceSeg()) + { + pMBLayout = make_shared(); // items get filled in + DecimateMinibatchWithSentences(*inputMatrices, + g_mpi->NumNodesInUse(), g_mpi->CurrentNodeRank(), + nSlices, pMBLayout, + trainSetDataReader); + } + else + { + DecimateMinibatch(*inputMatrices, g_mpi->NumNodesInUse(), g_mpi->CurrentNodeRank()); + } + } + + actualMBSize = net.SetActualMiniBatchSizeFromFeatures(); + if (actualMBSize != 0) + { + if (!useDistributedMBReading && useParallelTrain && trainSetDataReader->RequireSentenceSeg()) + { + *net.GetMBLayoutPtr() = *pMBLayout; + // TODO: ^^ we should just pass pointers; this current code is semantically identical to before the change to MBLayout + net.VerifyActualNumParallelSequences(nSlices); + } + else + { + trainSetDataReader->CopyMBLayoutTo(net.GetMBLayoutPtr()); + net.VerifyActualNumParallelSequences(nSlices); + } + + nSamplesSinceLastModelSync += actualMBSize; + + ComputationNetwork::UpdateEvalTimeStamps(featureNodes); + ComputationNetwork::UpdateEvalTimeStamps(labelNodes); + +#ifndef EVALDLL + if (m_doGradientCheck && GradientCheck(net, criterionNodes, learnableNodes, 0) == false) + LogicError("cannot pass gradient checker"); +#endif + // TODO: currently only support one node regularization + if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) + { +#if 1 + size_t actualMBSize2 = refNet.SetActualMiniBatchSizeFromFeatures(); + if (actualMBSize2 != actualMBSize) + LogicError("TrainOneEpoch: refNet has different MB size than main net??"); +#else + refNet.SetActualMiniBatchSize(actualMBSize); // TODO: SetActualMiniBatchSizeFromFeatures() should have the same result, no? +#endif + *refNet.GetMBLayoutPtr() = *net.GetMBLayoutPtr(); // TODO: This is UNTESTED (before this was missing, seemingly inconsistently) + refNet.VerifyActualNumParallelSequences(trainSetDataReader->GetNumParallelSequences()); + + refNet.Evaluate(refNode); + Matrix::ScaleAndAdd((ElemType)m_adaptationRegWeight, + dynamic_pointer_cast>(refNode)->FunctionValues(), + (ElemType)(1.0 - m_adaptationRegWeight), + dynamic_pointer_cast>(labelNodes[0])->FunctionValues()); + } + + //compute eval node first since when gradient is computed the forward function values + //may be changed and need to be recomputed when gradient and function value share the same matrix + for (size_t i = 0; i < numEvalNodes; i++) + { + net.Evaluate(evaluationNodes[i]); + } + + // only compute gradient when learning rate is large enough + if (learnRatePerSample > m_minLearnRate * 0.01) + { + // use only the first criterion. Is there any possibility to use more? + net.ComputeGradient(criterionNodes[0]); + } + else + { + // use only the first criterion. Is there any possibility to use more? + net.Evaluate(criterionNodes[0]); + } + } + } + + //for now since we share the same label masking flag we call this on the network. 
+ //Later, when we apply different labels on different nodes + //we need to add code to call this function multiple times, one for each criteria node + size_t numSamplesWithLabel = net.GetNumSamplesWithLabel(actualMBSize); + + // Sum of actualMBSize across all nodes when using parallel training + size_t aggregateNumSamples = actualMBSize; + size_t aggregateNumSamplesWithLabel = numSamplesWithLabel; + + //distributed gradient aggregation + if (!useGradientAggregation) + { + if (actualMBSize != 0) + { + Matrix::AddElementToElement(dynamic_pointer_cast>(criterionNodes[0])->FunctionValues(), 0, 0, localEpochCriterion, 0, 0); + for (size_t i = 0; i < numEvalNodes; i++) + Matrix::AddElementToElement(dynamic_pointer_cast>(evaluationNodes[i])->FunctionValues(), 0, 0, localEpochEvalErrors, 0, i); + } + } + else + { + LazyInitDistGradAgg(learnableNodes, numEvalNodes, m_traceLevel); + + //prepare the header + m_gradHeader->numEvalNode = numEvalNodes; + m_gradHeader->numSamples = actualMBSize; + m_gradHeader->numSamplesWithLabel = numSamplesWithLabel; + m_gradHeader->criterion = wasDataRead ? criterionNodes[0]->Get00Element() : 0.0; + for (size_t i = 0; i < numEvalNodes; i++) + m_gradHeader->evalErrors[i] = wasDataRead ? evaluationNodes[i]->Get00Element() : 0.0; + + m_distGradAgg->AggregateGradients(m_gradHeader, epochNumber); + + aggregateNumSamples = m_gradHeader->numSamples; + aggregateNumSamplesWithLabel = m_gradHeader->numSamplesWithLabel; + epochCriterion += m_gradHeader->criterion; + for (size_t i = 0; ievalErrors[i]; + } + + //update model parameters + if ((aggregateNumSamples > 0) && (learnRatePerSample > m_minLearnRate * 0.01)) + { + auto smoothedGradientIter = smoothedGradients.begin(); + for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, smoothedGradientIter++) + { + ComputationNodeBasePtr node = *nodeIter; + Matrix& smoothedGradient = *smoothedGradientIter; + + UpdateWeights(node, smoothedGradient, learnRatePerSample, + m_momentumPerSample[epochNumber], aggregateNumSamples, + m_L2RegWeight, m_L1RegWeight, + m_needAveMultiplier); + } + } + + if (useModelAveraging && (g_mpi->NumNodesInUse() > 1)) + { + size_t processedSamples = 0; + float secondsSinceLastSyncFinished = 0; + float secondsSpentOnSync = 0; + if (ModelAveragingProcessing(nSamplesSinceLastModelSync, learnableNodes, processedSamples, + secondsSinceLastSyncFinished, secondsSpentOnSync)) + { + // if a sync happens, do some extra work + nSamplesSinceLastModelSync = 0; + nSynced++; + + nSecondsOnMASync += secondsSpentOnSync; + nSecondsSinceLastMAPerfReport += secondsSinceLastSyncFinished; + + if (m_iMASyncStatsTrace > 0) + { + if (nSynced % m_iMASyncStatsTrace == 0) + { + fprintf(stderr, "\t\t-----(model averaging stats) %d-th sync, %8.2f seconds since last report, %5.2f seconds on communication\n", + (int)nSynced, nSecondsSinceLastMAPerfReport, nSecondsOnMASync); + nSecondsOnMASync = 0; + nSecondsSinceLastMAPerfReport = 0; + } + } + } + aggregateNumSamplesWithLabel = processedSamples; + } + + timer.Stop(); + numMBsRun++; + if (m_traceLevel > 0) + { + totalTimeInMBs += timer.ElapsedSeconds(); + numSamplesLastMBs += useModelAveraging ? 
int(actualMBSize) : int(aggregateNumSamplesWithLabel); + + if (numMBsRun % m_numMBsToShowResult == 0) + { + // get the epoch Values updated + if (!useGradientAggregation) + { + timer.Restart(); + epochCriterion = localEpochCriterion.Get00Element(); + for (size_t i = 0; i < numEvalNodes; i++) + epochEvalErrors[i] = localEpochEvalErrors(0, i); + timer.Stop(); + + // Add the last trailing compute + totalTimeInMBs += timer.ElapsedSeconds(); + } + + double trainLossPerSample = (epochCriterion - epochCriterionLastMBs) / numSamplesLastMBs; + string formatString = "%s Epoch[%2d of %d]-Minibatch[%4d-%4d of %d]: SamplesSeen = %d; TrainLossPerSample = " + + GeneratePaddedFloatOrExpFormat(11, 8, trainLossPerSample) + "; "; + fprintf(stderr, formatString.c_str(), + prefixMsg.c_str(), epochNumber + 1, m_maxEpochs, numMBsRun - m_numMBsToShowResult + 1, + numMBsRun, epochSize / tunedMBSize, numSamplesLastMBs, trainLossPerSample); + + for (size_t i = 0; i < numEvalNodes; i++) + { + double evalError = (epochEvalErrors[i] - epochEvalErrorsLastMBs[i]) / numSamplesLastMBs; + formatString = "EvalErr[%lu]PerSample = " + GeneratePaddedFloatOrExpFormat(0, 8, evalError) + "; "; + fprintf(stderr, formatString.c_str(), i, evalError); + } + + double totalTimePerSample = (1000.0 * totalTimeInMBs) / numSamplesLastMBs; + formatString = "TotalTime = " + GeneratePaddedFloatOrExpFormat(0, 5, totalTimeInMBs) + "s; TotalTimePerSample = " + + GeneratePaddedFloatOrExpFormat(0, 5, totalTimePerSample) + "ms; SamplesPerSecond = %d\n"; + fprintf(stderr, formatString.c_str(), + totalTimeInMBs, totalTimePerSample, + static_cast(numSamplesLastMBs / totalTimeInMBs)); + + fflush(stderr); + + // reset statistics + totalTimeInMBs = 0; + numSamplesLastMBs = 0; + + epochCriterionLastMBs = epochCriterion; + for (size_t i = 0; i < numEvalNodes; i++) + epochEvalErrorsLastMBs[i] = epochEvalErrors[i]; + + if (std::isnan(epochCriterion)) + RuntimeError("The training criterion is not a number (NAN). Stop\n"); + } + } + + timer.Restart(); + totalEpochSamples += aggregateNumSamplesWithLabel; + totalSamplesSeen += aggregateNumSamplesWithLabel; + + if (totalEpochSamples >= epochSize) + break; + + // call DataEnd function + // DataEnd does reader specific process if sentence ending is reached + trainSetDataReader->DataEnd(endDataSentence); + + // Tries to set up derivative features for the next utterance. 
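
A note on the distributed-reading logic earlier in this loop: a rank that runs out of data must keep participating in the collectives until every rank is done, otherwise the AllReduce inside gradient aggregation deadlocks. The patch encodes this by all-reducing a has-data count on every iteration. Below is a hedged sketch of that contract written against plain MPI (the project's g_mpi wrapper is not shown here; AnyRankStillHasData is a hypothetical name):

    #include <mpi.h>

    // Every rank must call this on every iteration so the collective stays
    // matched; all ranks leave the loop together when the sum reaches zero.
    static bool AnyRankStillHasData(bool hasLocalData)
    {
        int mine = hasLocalData ? 1 : 0;
        int total = 0;
        MPI_Allreduce(&mine, &total, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
        return total != 0;
    }

Ranks whose own minibatch is empty still join the gradient aggregation with a zero contribution, which is what the numNodesWithDataToProcess logic above implements.
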
+            AttemptUtteranceDerivativeFeatures(net, trainSetDataReader, featureNodes, inputMatrices);
+
+            profiler.NextSample();
+        }
+
+        // --- END MAIN MINIBATCH LOOP
+
+        if (useModelAveraging && (g_mpi->NumNodesInUse() > 1))
+        {
+            // may not be synced after epoch finished, so do the sync here
+            int residualSamples = (int)nSamplesSinceLastModelSync;
+            g_mpi->AllReduce(&residualSamples, 1);
+            totalSamplesSeen += residualSamples;
+            totalEpochSamples += residualSamples;
+            ModelAveragingSync(nSamplesSinceLastModelSync, learnableNodes);
+            nSynced++;
+            nSamplesSinceLastModelSync = 0;
+        }
+
+        if (useGradientAggregation)
+        {
+            epochCriterion /= float(totalEpochSamples);
+            for (size_t i = 0; i < numEvalNodes; i++)
+                epochEvalErrors[i] /= totalEpochSamples;
+        }
+        else
+        {
+            localEpochCriterion /= float(totalEpochSamples);
+            localEpochEvalErrors /= float(totalEpochSamples);
+
+            epochCriterion = localEpochCriterion.Get00Element();
+            for (size_t i = 0; i < numEvalNodes; i++)
+                epochEvalErrors[i] = localEpochEvalErrors(0, i);
+        }
+
+        if (useModelAveraging && (g_mpi->NumNodesInUse() > 1))
+        {
+            // merge epochCriterion and epochEvalErrors over nodes
+            g_mpi->AllReduce(&epochCriterion, 1);
+            g_mpi->AllReduce(epochEvalErrors);
+        }
+        return totalEpochSamples;
+    }
+
+    template<class ElemType>
+    void SGD<ElemType>::LazyInitDistGradAgg(const std::list<ComputationNodeBasePtr>& learnableNodes, int numEvalNodes, int traceLevel)
+    {
+        if (m_parallelizationMethod == ParallelizationMethod::DataParallelSGD)
+        {
+            if (m_distGradAgg == nullptr)
+            {
+                std::vector<Matrix<ElemType>*> learnParamsGradients;
+                learnParamsGradients.reserve(learnableNodes.size());
+                for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
+                {
+                    ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
+                    learnParamsGradients.push_back(&(node->GradientValues()));
+                }
+
+                m_distGradAgg = new AllReduceDistGradAggregator<ElemType>(learnParamsGradients, numEvalNodes, m_numGradientBits, g_mpi, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, traceLevel);
+            }
+
+            if (m_gradHeader == nullptr)
+            {
+                m_gradHeader = DistGradHeader::Create(numEvalNodes);
+            }
+        }
+    }
+
+    template<class ElemType>
+    bool SGD<ElemType>::ModelAveragingProcessing(size_t nSamplesSinceLastSync, const std::list<ComputationNodeBasePtr>& learnableNodes, size_t& nProcessedFrames,
+                                                 float& SecondsSinceLastSyncFinished, float& SecondsSpentOnSync)
+    {
+        //////////////////////////////////////////////////////////////////////////
+        // the current strategy is that after each minibatch, we will sync between processors
+        // to decide whether a sync needs to be performed. This is definitely not optimal,
+        // which we will fix later.
+        // TODO: the way we handle the timer is not very good
+        //////////////////////////////////////////////////////////////////////////
+        static bool first = true;
+        static Timer MAtimer;
+        if (first)
+        {
+            MAtimer.Start();
+            first = false;
+        }
+
+        char bNeedToSync = (char)0; // use char for bool
+        if (g_mpi->IsMainNode() && nSamplesSinceLastSync >= m_nFramesBetweenMASync)
+        {
+            // only the main node can decide whether a sync needs to be performed
+            bNeedToSync = (char)1;
+        }
+        g_mpi->Bcast(&bNeedToSync, 1, g_mpi->MainNodeRank());
+        if (bNeedToSync)
+        {
+            MAtimer.Stop();
+            double elapsedsec = MAtimer.ElapsedSeconds();
+            SecondsSinceLastSyncFinished = first ?
0 : (float) elapsedsec ; + MAtimer.Start(); + nProcessedFrames = ModelAveragingSync((int)nSamplesSinceLastSync, learnableNodes); + MAtimer.Stop(); + SecondsSpentOnSync = (float)MAtimer.ElapsedSeconds(); + + MAtimer.Start(); + } + else + { + nProcessedFrames = 0; + return false; + } + return true; + } + + template + size_t SGD::ModelAveragingSync(int nSamplesSinceLastSync, const std::list& learnableNodes) + { + if (g_mpi->NumNodesInUse() <= 1) + { + return nSamplesSinceLastSync; + } + + //======================================== + // Sec. 1 calculate factor + //======================================== + float factor = 0; + int nTotalSamples = nSamplesSinceLastSync; + g_mpi->AllReduce(&nTotalSamples, 1); + if (nTotalSamples <= 0) + { + // prepare for overflow + factor = 1.0f / g_mpi->NumNodesInUse(); + } + else + { + factor = (nSamplesSinceLastSync + 0.0f) / nTotalSamples; + } + + //======================================== + // Sec. 2 sync models based on factor + // Note: this is suboptimal at the moment: + // we do the averaging for each node in a sequence manner, i.e., + // (node1) GPU->CPU->MPI_AllReduce -> (node2)GPU->CPU->MPI_AllReduce + // we can improve it by using a pipeline + // (node1) GPU -> CPU -> MPI_AllReduce + // (node2) GPU -> CPU -> MPI_AllReduce + // (node3) GPU -> CPU -> MPI_AllReduce + //======================================== + for (auto iter = learnableNodes.begin(); iter != learnableNodes.end(); iter++) + { + ComputationNodeBasePtr pNode = *iter; + if (!pNode->NeedGradient()) + continue; + + Matrix& mat = dynamic_pointer_cast>(pNode)->FunctionValues(); + // 1. normalize the weight matrix + Matrix::Scale(factor, mat); + // 2. send weight matrix over MPI nodes; + ElemType* px = mat.CopyToArray(); + size_t nx = mat.GetNumElements(); + + // 3. inplace sum + g_mpi->AllReduce(px, nx); + mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), px); + // 4. 
clean up + delete []px; + } + + return nTotalSamples; + } + +// public: + // UpdateWeightsS - static version of UpdateWeights() + // not static since it wants to access protected methods on the SGD object + template + /*static*/ void SGD::UpdateWeightsS(const SGD* sgd, Matrix& functionValues, + Matrix& gradientValues, + Matrix& smoothedGradient, + const double learnRatePerSample, + const double momentumPerSample, + size_t actualMBSize, + const double L2RegWeight, + const double L1RegWeight, + const bool needAveMultiplier) + { + // we use simple linear (instead of log linear) scaling here + const double momentum = MomentumPerMB(momentumPerSample, actualMBSize); +#if DUMPOUTPUT + fprintf(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n", + learnRatePerSample, momentum, actualMBSize); + fprintf(stderr, "sgd->GradUpdateType()=%d, sgd->GradientUpdateNoiseStd()=%0.8f\n", + sgd->GradUpdateType(), sgd->GradientUpdateNoiseStd()); + gradientValues.Print("Gradient Input"); + smoothedGradient.Print("Smoothed Gradient Input"); +#endif + + // make actualMBSize is a valid value + assert(actualMBSize > 0); + + //clipping gradients to prevent outliers + sgd->ClipGradient(gradientValues, actualMBSize); + + GradientsUpdateType adpType = sgd->GradUpdateType(); + double noiseStd = sgd->GradientUpdateNoiseStd(); + Matrix sgdUpdateNoise((DEVICEID_TYPE)functionValues.GetDeviceId()); + if (noiseStd > 0) + { + // get the gradient structure since gradient is sparse + sgdUpdateNoise.SetValue(gradientValues); + + // reset its value to random + sgdUpdateNoise.SetGaussianRandomValue(0, (ElemType)noiseStd); + } + + // L2 regularizer + if (L2RegWeight > 0) + { + // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample + Matrix::ScaleAndAdd((ElemType)(L2RegWeight * actualMBSize), functionValues, gradientValues); + } + + if (adpType == GradientsUpdateType::None) + { + smoothedGradient.NormalGrad(gradientValues, functionValues, + (ElemType)learnRatePerSample, (ElemType)momentum); + } + else if (adpType == GradientsUpdateType::AdaGrad || + (adpType == GradientsUpdateType::RmsProp && gradientValues.GetMatrixType() == MatrixType::SPARSE) || + (adpType == GradientsUpdateType::FSAdaGrad && gradientValues.GetMatrixType() == MatrixType::SPARSE)) + { + //rmsprop for sparse is not implemented yet, delegate it with adagrad + + double aveMultiplier = smoothedGradient.Adagrad(gradientValues, needAveMultiplier); + Matrix::ScaleAndAdd((ElemType)(-learnRatePerSample / aveMultiplier), gradientValues, functionValues); + } + else if (adpType == GradientsUpdateType::FSAdaGrad) + { + smoothedGradient.FSAdagrad(actualMBSize, gradientValues, functionValues, learnRatePerSample, momentum); + } + else if (adpType == GradientsUpdateType::RmsProp) + { + double aveMultiplier = smoothedGradient.RmsProp(gradientValues, (ElemType)sgd->m_rpi.gamma, + (ElemType)sgd->m_rpi.inc, (ElemType)sgd->m_rpi.max, + (ElemType)sgd->m_rpi.dec, (ElemType)sgd->m_rpi.min, needAveMultiplier); + Matrix::ScaleAndAdd((ElemType)(-learnRatePerSample / aveMultiplier), gradientValues, functionValues); + } + + if (noiseStd > 0) + { + Matrix::ScaleAndAdd(1.0, sgdUpdateNoise, functionValues); + } + + // L1 regularizer with proximal gradient descent method + if (L1RegWeight > 0) + { + // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample + functionValues.InplaceSoftThreshold((ElemType)(learnRatePerSample * L1RegWeight * actualMBSize)); + } + +#if DUMPOUTPUT + 
functionValues.Print("Parameter Update"); +#endif + } + +// protected: + + // UpdateWeights - update the weights in + template + void SGD::UpdateWeights(const ComputationNodeBasePtr node, + Matrix& smoothedGradient, + const double learnRatePerSample, + const double momentumPerSample, + const size_t actualMBSize, + const double L2RegWeight, const double L1RegWeight, + const bool needAveMultiplier) const + { +#if DUMPOUTPUT + fprintf(stderr, "Update_%ls\n", node->NodeName().c_str()); +#endif + UpdateWeightsS(this, dynamic_pointer_cast>(node)->FunctionValues(), dynamic_pointer_cast>(node)->GradientValues(), + smoothedGradient, learnRatePerSample, momentumPerSample, + actualMBSize, L2RegWeight, L1RegWeight, + needAveMultiplier); + node->UpdateEvalTimeStamp(); + } + + template + void SGD::ClipGradient(Matrix& gradient, const size_t actualMBSize) const + { + if (m_clippingThresholdPerSample != std::numeric_limits::infinity()) + { + double maxGradientPerMB = m_clippingThresholdPerSample * actualMBSize; + if (m_gradientClippingWithTruncation) + gradient.InplaceTruncate((ElemType)(maxGradientPerMB)); + else + { + // norm2 normalized + double gradientNorm = gradient.FrobeniusNorm(); + if (gradientNorm > maxGradientPerMB) + { + double normFactor = maxGradientPerMB / gradientNorm; + gradient *= (ElemType)normFactor; + } + } + } + } + + template + void SGD::SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen, + const double learnRatePerSample, + const std::list>& smoothedGradients, + const double prevCriterion, + const size_t minibatchSize) + { + // In case of parallel training only the main node should we saving the checkpoint to prevent + // the parallel training nodes from colliding to write the same file + if ((g_mpi == nullptr) || g_mpi->IsMainNode()) + { + wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epoch)); + // Saving into temporary file and then renaming it to the checkPointFileName + // This is a standard trick to avoid havign corrupted checkpoints files if process dies during writing + wstring tempFileName = checkPointFileName + L".tmp"; + + { + File fstream(tempFileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsWrite); + fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BCKP"); + + fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate"); + fstream << totalSamplesSeen << learnRatePerSample << prevCriterion; + fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ELearnRate"); + + fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize"); + fstream << minibatchSize; + fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EMinibatchSize"); + + fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BGradient"); + + for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++) + { + const Matrix& smoothedGradient = *smoothedGradientIter; + fstream << smoothedGradient; + } + + fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EGradient"); + + fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ECKP"); + + // Ensuring that data is written + fstream.Flush(); + } + + renameOrDie(tempFileName, checkPointFileName); + } + } + + template + bool SGD::LoadCheckPointInfo(const size_t epochNumber, + /*out*/ size_t& totalSamplesSeen, + /*out*/ double& learnRatePerSample, + std::list>& smoothedGradients, + /*out*/ double& prevCriterion, + /*out*/ size_t& minibatchSize) + { + wstring checkPointFileName = 
GetCheckPointFileNameForEpoch(int(epochNumber)); + if (!fexists(checkPointFileName.c_str())) + { + fprintf(stderr, "Warning: checkpoint file is missing. learning parameters will be initialized from 0\n"); + return false; + } + + File fstream(checkPointFileName, + FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead); + fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCKP"); + + fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate"); + fstream >> totalSamplesSeen >> learnRatePerSample >> prevCriterion; + fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ELearnRate"); + + if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize")) + { + fstream >> minibatchSize; + fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EMinibatchSize"); + } + else + { + minibatchSize = m_mbSize[epochNumber]; + } + + fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BGradient"); + + for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++) + { + Matrix& smoothedGradient = *smoothedGradientIter; + fstream >> smoothedGradient; + } + fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EGradient"); + + fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ECKP"); + + return true; + } + + template + wstring SGD::GetCheckPointFileNameForEpoch(const int epoch) + { + return GetModelNameForEpoch(epoch) + L".ckp"; + } + + template + wstring SGD::GetModelNameForEpoch(const int epoch, bool bLastModel) + { + int epoch1Base = epoch + 1; + if (epoch1Base == m_maxEpochs || bLastModel) + { + return m_modelPath; + } + else + { + wstring w = msra::strfun::wstrprintf(L"%ls.%d", m_modelPath.c_str(), (int)epoch1Base); + return w; + } + + } + + // return -1 if nothing exists + template // TODO: needed? + int SGD::DetermineStartEpoch(const bool makeMode) + { + if (!makeMode) + { + // always start from scratch + return -1; + } + + int firstEpoch = -1; + + wstring curEpochFile = GetModelNameForEpoch(int(m_maxEpochs) - 1); + for (int e = int(m_maxEpochs) - 1; e >= -1; e--) + { + const wstring prevEpochFile = GetModelNameForEpoch(e - 1); + + if (msra::files::fuptodate(curEpochFile, prevEpochFile, false)) + { + firstEpoch = size_t(e) + 1; + break; + } + else + { + curEpochFile = prevEpochFile; + } + } + + return firstEpoch; + } + +#define EPSILON 1e-5 + + template + bool SGD::GradientCheck(ComputationNetwork& net, + const std::vector & criterionNodes, + const std::list & learnableNodes, + int npos) + { + vector errMsgs; + + // gradient checking + for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++) + { + ComputationNodePtr node = dynamic_pointer_cast>(*nodeIter); + char wstrtmp[2048]; + + for (size_t itry = 0; itry < min((size_t)50, node->FunctionValues().GetNumElements()); itry++) + { + /// no support to sparse matrix yet + int irow = (int) fmod(rand(), node->FunctionValues().GetNumRows() - 1); + int icol = (int) fmod(rand(), node->FunctionValues().GetNumCols() - 1); + irow = max(0, irow); + icol = max(0, icol); + + fprintf(stderr, "\n###### d%ls######\n", node->NodeName().c_str()); + + double eOrg = node->FunctionValues()(irow, icol); + node->FunctionValues().TransferToDeviceIfNotThere(net.GetDeviceId(), true); + + node->UpdateEvalTimeStamp(); + + // use only the first criterion. 
Is there any possibility to use more?
+                net.ComputeGradient<ElemType>(criterionNodes[npos]);
+
+                if (node->GradientValues().GetMatrixType() == MatrixType::SPARSE)
+                {
+                    break;
+                }
+
+                //double mbEvalCri =
+                //criterionNode should be a scalar
+                // TODO: why is this value not used?
+                criterionNodes[npos]->Get00Element();
+                double eGradErr = node->GradientValues()(irow, icol);
+                node->GradientValues().TransferToDeviceIfNotThere(net.GetDeviceId(), true);
+
+                double ePos = eOrg + EPSILON;
+                double eNeg = eOrg - EPSILON;
+
+                node->FunctionValues()(irow, icol) = (ElemType)ePos;
+                node->FunctionValues().TransferToDeviceIfNotThere(net.GetDeviceId(), true);
+
+                node->UpdateEvalTimeStamp();
+                net.Evaluate(criterionNodes[npos]);
+                //criterionNode should be a scalar
+
+                double mbEvalCriPos = criterionNodes[npos]->Get00Element(); // TODO: make Get00Element() a function of ComputationNodeBase
+
+                node->FunctionValues()(irow, icol) = (ElemType)eNeg;
+                node->FunctionValues().TransferToDeviceIfNotThere(net.GetDeviceId(), true);
+
+                node->UpdateEvalTimeStamp();
+                net.Evaluate(criterionNodes[npos]);
+
+                // criterionNode should be a scalar
+                double mbEvalCriNeg = criterionNodes[npos]->Get00Element();
+
+                // back to its original parameter value
+                node->FunctionValues()(irow, icol) = (ElemType)eOrg;
+                node->FunctionValues().TransferToDeviceIfNotThere(net.GetDeviceId(), true);
+
+                // check if they are consistent
+                double eGradNum = ((mbEvalCriPos - mbEvalCriNeg) / (ePos - eNeg));
+                double threshold = pow(10.0,
+                                       max(0.0,
+                                           ceil(log10(min(fabs(eGradErr),
+                                                          fabs(eGradNum))))) - (int)m_gradientCheckSigDigit);
+                double diff = fabs(eGradErr - eGradNum);
+                bool wrong = (std::isnan(diff) || diff > threshold);
+                if (wrong)
+                {
+                    fprintf(stderr, "\nd%ls Numeric gradient = %e, Error BP gradient = %e\n",
+                            node->NodeName().c_str(), eGradNum, eGradErr);
+                    sprintf(wstrtmp, "\nd%ls Numeric gradient = %e, Error BP gradient = %e\n",
+                            node->NodeName().c_str(), eGradNum, eGradErr);
+                    errMsgs.push_back(wstrtmp);
+                }
+            }
+        }
+
+        return errMsgs.size() == 0;
+    }
+
+template class SGD<float>;
+template class SGD<double>;
+
+// TODO: does not build--but part is used directly from CNTK.cpp
+//template class MultiNetworksSGD<float>;
+//template class MultiNetworksSGD<double>;
+
+}}}
diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu
index aaaeb36b2b85..cf3fbf4bdef6 100755
--- a/Math/Math/GPUMatrix.cu
+++ b/Math/Math/GPUMatrix.cu
@@ -1277,6 +1277,26 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }
     }
 
+    template<class ElemType>
+    void GPUMatrix<ElemType>::FSAdagrad(GPUMatrix<ElemType>& gradients, GPUMatrix<ElemType>& functionValues,
+                                        ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul)
+    {
+        size_t numColsNeeded = 2 * gradients.GetNumCols();
+
+        if (IsEmpty() || GetNumCols() < numColsNeeded)
+        {
+            Resize(gradients.GetNumRows(), numColsNeeded);
+            SetValue(0.0);
+        }
+
+        assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == numColsNeeded);
+
+        size_t n = gradients.GetNumElements();
+        int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
+        _fsadagrad<<<blocksPerGrid, threadsPerBlock>>>(n, gradients.m_pArray, m_pArray, m_pArray + n, functionValues.m_pArray,
+                                                       learnRatePerSample, momentum, adaWeight, adaMul);
+    }
+
 template<class ElemType>
 ElemType GPUMatrix<ElemType>::RmsProp(GPUMatrix<ElemType>& gradients,
     ElemType RMS_GAMMA,
diff --git a/Math/Math/GPUMatrix.h b/Math/Math/GPUMatrix.h
index ee7bc2139be9..b80900f95ad7 100755
--- a/Math/Math/GPUMatrix.h
+++ b/Math/Math/GPUMatrix.h
@@ -124,6 +124,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         ElemType* BufferPointer() const {return m_pArray;}
         ElemType Adagrad(GPUMatrix<ElemType>& gradients, const bool
needAveMultiplier); + void FSAdagrad(GPUMatrix& gradients, GPUMatrix& functionValues, + ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul); ElemType RmsProp(GPUMatrix& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier); void Reshape(const size_t numRows, const size_t numCols); diff --git a/Math/Math/GPUMatrixCUDAKernels.cu b/Math/Math/GPUMatrixCUDAKernels.cu index 18982cc13488..d45b9e006d87 100755 --- a/Math/Math/GPUMatrixCUDAKernels.cu +++ b/Math/Math/GPUMatrixCUDAKernels.cu @@ -1110,6 +1110,36 @@ __global__ void _adagrad4BlockSparse( multipliers[id] = 1 / temp; } +template +__global__ void _fsadagrad(CUDA_LONG size, ElemType* grad, ElemType* smoothAda, ElemType* smoothMom, ElemType* val, + ElemType lr, ElemType mom, ElemType adaWeight, ElemType adaMul) +{ + CUDA_LONG idx = blockIdx.x * blockDim.x + threadIdx.x; + CUDA_LONG stride = blockDim.x * gridDim.x; + for (; idx < size; idx += stride) + { + ElemType g = grad[idx]; + ElemType adaSqr = adaWeight * smoothAda[idx] + (1.0f - adaWeight) * g * g; + smoothAda[idx] = adaSqr; + if (adaSqr != 0.0f) + { + ElemType w = adaMul * rsqrtf(adaSqr); + if (w > 10.0f) + w = 10.0f; + g *= w; + } + + if (mom > 0.0f) + { + g = mom * smoothMom[idx] + (1.0f - mom) * g; + smoothMom[idx] = g; + } + + g *= lr; + val[idx] -= g; + } +} + template __global__ void _rmsprop_init( ElemType* avars, ElemType* signs, ElemType* steps, diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp index e122905be16f..1707c8222851 100644 --- a/Math/Math/Matrix.cpp +++ b/Math/Math/Matrix.cpp @@ -1296,6 +1296,27 @@ namespace Microsoft { namespace MSR { namespace CNTK { return aveMultiplier; } + template + void Matrix::FSAdagrad(size_t mbSize, Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum) + { + // REVEW alexeyk: hardcoded for now, taken from DBN. Naming is the same as in DBN. + const size_t adagradT = 2 * 3600 * 100; + const ElemType adagradkeepweight = static_cast(exp(-1.0 * mbSize / adagradT)); + + const ElemType targetadagradavdenom = 0.0025; // 1/400 magic constant + static ElemType aggadagradsqrframes = 0; + aggadagradsqrframes = adagradkeepweight * aggadagradsqrframes + (1.0f - adagradkeepweight) * mbSize; + const ElemType targetadagradavdenom_x_sqrtadagradsqrframes = static_cast(targetadagradavdenom * sqrt(aggadagradsqrframes)); + + DISPATCH_MATRIX_ON_FLAG(&gradients, + &gradients, + SetDataLocation(CPU), + m_GPUMatrix->FSAdagrad(*gradients.m_GPUMatrix, *functionValues.m_GPUMatrix, learnRatePerSample, momentum, adagradkeepweight, targetadagradavdenom_x_sqrtadagradsqrframes); SetDataLocation(GPU), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); + } + template ElemType Matrix::RmsProp(Matrix& gradients, ElemType RMS_GAMMA, diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h index 34eac851b7f8..d848d2e174f6 100644 --- a/Math/Math/Matrix.h +++ b/Math/Math/Matrix.h @@ -27,41 +27,41 @@ namespace Microsoft { namespace MSR { namespace CNTK { // there is a version down there of ColumnSlice() that abstracts the number of streams // TODO: This may not belong here, but having it in ComputeNode would require syntax changes, while having it as a member here only requires a local find-replace. Let's make it work first, then decide how to refactor. - // the looping versions of EvaluateThisNode() and ComputeInputPartial() take a frame range, through this structure - // It can cast from a size_t, i.e. 
those functions can be called passing a size_t in place of the FrameRange. - // TODO: GetNumParallelSequences() should be subsumed here & removed from nodes - // TODO: Where this design currently breaks: - // - BatchModeNodes must access GetNumParallelSequences(), yet operate on the whole sequence - // - likewise, LSTMNode does its own iteration, hence needs access to GetNumParallelSequences() or NumCols() in the whole-batch iterator - // - RecurrentNodes access frames with a time shift, where out-of-bounds ones access a different matrix' values - // - RecurrentNodes iterate over individual slices--need a sub-setting constructor from a FrameRange to another? - // - RecurrentNodes access boundary info with a similar pattern, but boundary info has a different #streams (namely, 1) - // TODO: Turns out, a FrameRange is either a whole batch or a single frame. - struct FrameRange - { - const size_t timeIdxInSeq; // start frame - const size_t samplesInRecurrentStep; // number of samples in this step --BUGBUG: this should be part of MBLayout, not FrameRange - // can construct from a single size_t -> a single-frame range - //FrameRange(size_t timeIdxInSeq) : timeIdxInSeq(timeIdxInSeq), samplesInRecurrentStep(0)/*FIX THIS*/{} - FrameRange(size_t timeIdxInSeq, size_t samplesInRecurrentStep) : timeIdxInSeq(timeIdxInSeq), samplesInRecurrentStep(samplesInRecurrentStep){} - // or without arguments -> entire minibatch / no frame-range - FrameRange() : timeIdxInSeq(0), samplesInRecurrentStep(SIZE_MAX) {} - // code that can only handle single-frame ranges will call t() to get the time index, which will throw if numFrames != 1 - // Some functions need just the time index, e.g. for looking up stuff in m_boundaryInfo. That's where an unscaled index is needed (as opposed to startColumn()). - size_t t() const { EnsureNotAllFrames(); return timeIdxInSeq; } - // multi-frame slice case: these two get startFrame and numFrames - size_t StartColumn() const { EnsureNotAllFrames(); return timeIdxInSeq * samplesInRecurrentStep; } - size_t NumCols() const { EnsureNotAllFrames(); return samplesInRecurrentStep; } - bool IsAllFrames() const { return samplesInRecurrentStep == SIZE_MAX; } // if true then above functions may not be called; caller must use entire batch instead - private: - FrameRange(const FrameRange & other);// : timeIdxInSeq(other.timeIdxInSeq), numFrames(other.numFrames) { } - void operator=(const FrameRange &); - void EnsureNotAllFrames() const - { - if (IsAllFrames()) - LogicError("FrameRange::t() called when frame range refers to whole minibatch"); - } - }; + // the looping versions of EvaluateThisNode() and ComputeInputPartial() take a frame range, through this structure + // It can cast from a size_t, i.e. those functions can be called passing a size_t in place of the FrameRange. + // TODO: GetNumParallelSequences() should be subsumed here & removed from nodes + // TODO: Where this design currently breaks: + // - BatchModeNodes must access GetNumParallelSequences(), yet operate on the whole sequence + // - likewise, LSTMNode does its own iteration, hence needs access to GetNumParallelSequences() or NumCols() in the whole-batch iterator + // - RecurrentNodes access frames with a time shift, where out-of-bounds ones access a different matrix' values + // - RecurrentNodes iterate over individual slices--need a sub-setting constructor from a FrameRange to another? 
+ // - RecurrentNodes access boundary info with a similar pattern, but boundary info has a different #streams (namely, 1) + // TODO: Turns out, a FrameRange is either a whole batch or a single frame. + struct FrameRange + { + const size_t timeIdxInSeq; // start frame + const size_t samplesInRecurrentStep; // number of samples in this step --BUGBUG: this should be part of MBLayout, not FrameRange + // can construct from a single size_t -> a single-frame range + //FrameRange(size_t timeIdxInSeq) : timeIdxInSeq(timeIdxInSeq), samplesInRecurrentStep(0)/*FIX THIS*/{} + FrameRange(size_t timeIdxInSeq, size_t samplesInRecurrentStep) : timeIdxInSeq(timeIdxInSeq), samplesInRecurrentStep(samplesInRecurrentStep){} + // or without arguments -> entire minibatch / no frame-range + FrameRange() : timeIdxInSeq(0), samplesInRecurrentStep(SIZE_MAX) {} + // code that can only handle single-frame ranges will call t() to get the time index, which will throw if numFrames != 1 + // Some functions need just the time index, e.g. for looking up stuff in m_boundaryInfo. That's where an unscaled index is needed (as opposed to startColumn()). + size_t t() const { EnsureNotAllFrames(); return timeIdxInSeq; } + // multi-frame slice case: these two get startFrame and numFrames + size_t StartColumn() const { EnsureNotAllFrames(); return timeIdxInSeq * samplesInRecurrentStep; } + size_t NumCols() const { EnsureNotAllFrames(); return samplesInRecurrentStep; } + bool IsAllFrames() const { return samplesInRecurrentStep == SIZE_MAX; } // if true then above functions may not be called; caller must use entire batch instead + private: + FrameRange(const FrameRange & other);// : timeIdxInSeq(other.timeIdxInSeq), numFrames(other.numFrames) { } + void operator=(const FrameRange &); + void EnsureNotAllFrames() const + { + if (IsAllFrames()) + LogicError("FrameRange::t() called when frame range refers to whole minibatch"); + } + }; enum CurrentDataLocation { @@ -212,6 +212,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // TODO: all these scalars should be passed as doubles and cast down inside void NormalGrad(Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum); ElemType Adagrad(Matrix& gradients, const bool needAveMultiplier); + void FSAdagrad(size_t mbSize, Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum); ElemType RmsProp(Matrix& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier); // TODO: should Reshape() return a new Matrix object that contains a reference to the original? @@ -514,20 +515,20 @@ namespace Microsoft { namespace MSR { namespace CNTK { typedef Matrix SingleMatrix; typedef Matrix DoubleMatrix; - - // MBLayout -- layout information of minibatch - // Currently this is to bind the two somewhat inconsistent boundary flags and packing flags. - // Once that is unified, we can clean it up further. For now, it's just moving the data members and encapsulating access to them where possible. - // This should probably also contain m_actualNumParallelSequencesInEachRecIter (which should be node-dependent). - // TODO: move this to an appropriate place and name it properly - // NOTE: This class represents an abstraction of an originally distributed/code-duped way of defining and accessing the MB layout. - // The code below represents the actual use cases I encountered. 
Not all are, I believe, needed to be as they are; this class could be simplified/streamlined much further. - // Some wackiness below is explained by this. - // TODO: frame-randoized MBs are now represented as one stream of many frames. This is wrong; they should be one-frame utterances with many streams. Once we fully abstract out Data access, this can be changed easily. - struct MBLayout - { - MBLayout() : m_sentenceBoundaryFlags(CPUDEVICE) { } - private: // one day... + + // MBLayout -- layout information of minibatch + // Currently this is to bind the two somewhat inconsistent boundary flags and packing flags. + // Once that is unified, we can clean it up further. For now, it's just moving the data members and encapsulating access to them where possible. + // This should probably also contain m_actualNumParallelSequencesInEachRecIter (which should be node-dependent). + // TODO: move this to an appropriate place and name it properly + // NOTE: This class represents an abstraction of an originally distributed/code-duped way of defining and accessing the MB layout. + // The code below represents the actual use cases I encountered. Not all are, I believe, needed to be as they are; this class could be simplified/streamlined much further. + // Some wackiness below is explained by this. + // TODO: frame-randoized MBs are now represented as one stream of many frames. This is wrong; they should be one-frame utterances with many streams. Once we fully abstract out Data access, this can be changed easily. + struct MBLayout + { + MBLayout() : m_sentenceBoundaryFlags(CPUDEVICE) { } + private: // one day... /// a matrix of n_stream x n_length /// n_stream is the number of streams /// n_length is the maximum lenght of each stream @@ -540,96 +541,96 @@ namespace Microsoft { namespace MSR { namespace CNTK { /// the second data stream has two sentences, with 0 indicating begining of sentences /// you may use 1 even if a sentence begins at that position, in this case, the trainer will carry over hidden states to the following /// frame. - Matrix m_sentenceBoundaryFlags; // (t,stream) - // ^^ float -> MinibatchPackingFlags, right? Or unsigned char; or change that to 'char' because Matrix already exists - // This matrix ^^ is always in CPU memory --TODO: should rather be a matrix of some int - /// conditionally point to either a pointer to that provided by network, or point to - /// an individual sentence boundary info, which happens if timeStep > 1 is required for PastValue node + Matrix m_sentenceBoundaryFlags; // (t,stream) + // ^^ float -> MinibatchPackingFlags, right? Or unsigned char; or change that to 'char' because Matrix already exists + // This matrix ^^ is always in CPU memory --TODO: should rather be a matrix of some int + /// conditionally point to either a pointer to that provided by network, or point to + /// an individual sentence boundary info, which happens if timeStep > 1 is required for PastValue node /// a matrix of 1 x n_length /// != 0 denotes the case that there exists sentence begin or no_labels case in this frame /// == 0 denotes such case is not in this frame - vector m_minibatchPackingFlags; - // ^^ This is some form of aggregate of m_sentenceBoundaryFlags taken over all streams. 
TODO: find out the exact condition - public: - - bool Is(size_t t, MinibatchPackingFlags f) const { return m_minibatchPackingFlags[t] & f; } - bool Is(size_t id, size_t t, MinibatchPackingFlags f) const { return ((MinibatchPackingFlags)(int)m_sentenceBoundaryFlags(id, t)) & f; } - - // get info for one frame; used in DelayedValueNode - // TODO: clean this up, we can do this more nicely - pair, MinibatchPackingFlags> GetFrame(size_t t) const - { - return make_pair(m_sentenceBoundaryFlags.ColumnSlice(t, 1), m_minibatchPackingFlags[t]); - } - - // set a boundary flag - // This ORs the flags, i.e. it assumes that the matrix has been cleared before. - // NOTE: original code that calls this did not OR the matrix, but did OR the vector value. I visually checked that it was cleared before, but might have gotten it wrong. - void Set(size_t id, size_t t, MinibatchPackingFlags f) - { - m_sentenceBoundaryFlags.SetValue(id, t, (float)(((MinibatchPackingFlags)(int)m_sentenceBoundaryFlags(id, t)) | f)); - m_minibatchPackingFlags[t] |= f; - } - // same but not ORing --TODO: is this distinction needed? - void Reset(size_t id, size_t t, MinibatchPackingFlags f) - { - m_sentenceBoundaryFlags.SetValue(id, t, (float)(int)f); - m_minibatchPackingFlags[t] |= f; - } - // needed in DelayedValueNodeBase - // TODO: this is wicked in that the matrix keeps only the NoLabel flag, while the vector keeps all (just gets ORed into) - void Mask(size_t id, size_t t, MinibatchPackingFlags f) - { - m_sentenceBoundaryFlags.SetValue(id, t, (float)(((MinibatchPackingFlags)(int)m_sentenceBoundaryFlags(id, t)) & f)); - //m_minibatchPackingFlags[t] &= f; - } - - // for LSTMNode ony, which is deprecated, only to make it compile easily: also used in FindBestPathWithVariableLength() and FindBestPath() in a strange way - Matrix & GetM() { return m_sentenceBoundaryFlags; } - // and for DecimateMinibatchWithSentences() which should be revised - vector & GetV() { return m_minibatchPackingFlags; } - - // resize and reset all frames to None (note: this is an invalid state and must be fixed by caller afterwards) - void Resize(size_t numStreams, size_t numFrames) - { - m_sentenceBoundaryFlags.Resize(numStreams, numFrames); - m_sentenceBoundaryFlags.SetValue((float)((int)MinibatchPackingFlags::None)); - m_minibatchPackingFlags.assign(m_sentenceBoundaryFlags.GetNumCols(), MinibatchPackingFlags::None); - } - - // test a pre-condition --TODO: we only resize this thing here, so this should not be necessary in the future - void validate() const { if (m_minibatchPackingFlags.size() != m_sentenceBoundaryFlags.GetNumCols()) LogicError("MBLayout: GetSize() != GetNumTimeSteps()"); } - - // these accessors were for now just collected from actual usage; need to be cleaned up once this compiles again - size_t GetNumTimeSteps() const { validate(); return m_sentenceBoundaryFlags.GetNumCols(); } - size_t GetNumParallelSequences() const { return (m_sentenceBoundaryFlags.GetNumRows() == 0) ? 1 : m_sentenceBoundaryFlags.GetNumRows(); } // 1 stream if no matrix - size_t GetSize() const { validate(); return m_minibatchPackingFlags.size(); } - - // if we have no matrix/vector, this means no frame has any flag set - // We still can have a number of rows in this case. 
- bool IsAllNone() const { validate(); return m_minibatchPackingFlags.empty(); } - void SetAllNone() { Resize(0, 0); } - -#if 0 // we have this pattern often: - // TODO: mbSize and #slices must also move into MBLayout - evalnet->SetActualMiniBatchSize(mbSize); - dataReader->CopyMBLayoutTo(evalnet->GetMBLayoutPtr()); - evalnet->VerifyActualNumParallelSequences(dataReader->GetNumParallelSequences()); -#endif -#if 0 // a VERY TELLING piece of code - // packing flags = frame-wise or over all streams of start and end - for (size_t nt = 0; nt < nMBSize; nt++) - { - for (size_t ns = 0; ns < nSlices; ns++) - { - if (newBoundary(ns, nt) == ((int) MinibatchPackingFlags::SequenceStart)) - pMBLayout->m_minibatchPackingFlags[nt] |= MinibatchPackingFlags::SequenceStart; - if (newBoundary(ns, nt) == ((int) MinibatchPackingFlags::SequenceEnd)) - pMBLayout->m_minibatchPackingFlags[nt] |= MinibatchPackingFlags::SequenceEnd; - } - } -#endif - }; - typedef std::shared_ptr MBLayoutPtr; - + vector m_minibatchPackingFlags; + // ^^ This is some form of aggregate of m_sentenceBoundaryFlags taken over all streams. TODO: find out the exact condition + public: + + bool Is(size_t t, MinibatchPackingFlags f) const { return m_minibatchPackingFlags[t] & f; } + bool Is(size_t id, size_t t, MinibatchPackingFlags f) const { return ((MinibatchPackingFlags)(int)m_sentenceBoundaryFlags(id, t)) & f; } + + // get info for one frame; used in DelayedValueNode + // TODO: clean this up, we can do this more nicely + pair, MinibatchPackingFlags> GetFrame(size_t t) const + { + return make_pair(m_sentenceBoundaryFlags.ColumnSlice(t, 1), m_minibatchPackingFlags[t]); + } + + // set a boundary flag + // This ORs the flags, i.e. it assumes that the matrix has been cleared before. + // NOTE: original code that calls this did not OR the matrix, but did OR the vector value. I visually checked that it was cleared before, but might have gotten it wrong. + void Set(size_t id, size_t t, MinibatchPackingFlags f) + { + m_sentenceBoundaryFlags.SetValue(id, t, (float)(((MinibatchPackingFlags)(int)m_sentenceBoundaryFlags(id, t)) | f)); + m_minibatchPackingFlags[t] |= f; + } + // same but not ORing --TODO: is this distinction needed? 
+ void Reset(size_t id, size_t t, MinibatchPackingFlags f) + { + m_sentenceBoundaryFlags.SetValue(id, t, (float)(int)f); + m_minibatchPackingFlags[t] |= f; + } + // needed in DelayedValueNodeBase + // TODO: this is wicked in that the matrix keeps only the NoLabel flag, while the vector keeps all (just gets ORed into) + void Mask(size_t id, size_t t, MinibatchPackingFlags f) + { + m_sentenceBoundaryFlags.SetValue(id, t, (float)(((MinibatchPackingFlags)(int)m_sentenceBoundaryFlags(id, t)) & f)); + //m_minibatchPackingFlags[t] &= f; + } + + // for LSTMNode ony, which is deprecated, only to make it compile easily: also used in FindBestPathWithVariableLength() and FindBestPath() in a strange way + Matrix & GetM() { return m_sentenceBoundaryFlags; } + // and for DecimateMinibatchWithSentences() which should be revised + vector & GetV() { return m_minibatchPackingFlags; } + + // resize and reset all frames to None (note: this is an invalid state and must be fixed by caller afterwards) + void Resize(size_t numStreams, size_t numFrames) + { + m_sentenceBoundaryFlags.Resize(numStreams, numFrames); + m_sentenceBoundaryFlags.SetValue((float)((int)MinibatchPackingFlags::None)); + m_minibatchPackingFlags.assign(m_sentenceBoundaryFlags.GetNumCols(), MinibatchPackingFlags::None); + } + + // test a pre-condition --TODO: we only resize this thing here, so this should not be necessary in the future + void validate() const { if (m_minibatchPackingFlags.size() != m_sentenceBoundaryFlags.GetNumCols()) LogicError("MBLayout: GetSize() != GetNumTimeSteps()"); } + + // these accessors were for now just collected from actual usage; need to be cleaned up once this compiles again + size_t GetNumTimeSteps() const { validate(); return m_sentenceBoundaryFlags.GetNumCols(); } + size_t GetNumParallelSequences() const { return (m_sentenceBoundaryFlags.GetNumRows() == 0) ? 1 : m_sentenceBoundaryFlags.GetNumRows(); } // 1 stream if no matrix + size_t GetSize() const { validate(); return m_minibatchPackingFlags.size(); } + + // if we have no matrix/vector, this means no frame has any flag set + // We still can have a number of rows in this case. 
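
To make the Set/Is protocol of this class concrete, here is a minimal usage sketch (illustrative only; it uses the MBLayout and MinibatchPackingFlags API introduced in this patch, and the stream/frame counts are made-up values):

    MBLayout layout;
    layout.Resize(/*numStreams=*/2, /*numFrames=*/10);  // all flags reset to None
    layout.Set(/*id=*/0, /*t=*/0, MinibatchPackingFlags::SequenceStart);

    // Per-frame query aggregates over all streams:
    bool anyStart = layout.Is(/*t=*/0, MinibatchPackingFlags::SequenceStart);       // true
    // Per-stream query looks at one stream only:
    bool s1Start  = layout.Is(/*id=*/1, /*t=*/0, MinibatchPackingFlags::SequenceStart); // false

This mirrors the two representations kept in sync above: the (stream, t) boundary matrix and the per-frame aggregate vector.
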
+ bool IsAllNone() const { validate(); return m_minibatchPackingFlags.empty(); } + void SetAllNone() { Resize(0, 0); } + +#if 0 // we have this pattern often: + // TODO: mbSize and #slices must also move into MBLayout + evalnet->SetActualMiniBatchSize(mbSize); + dataReader->CopyMBLayoutTo(evalnet->GetMBLayoutPtr()); + evalnet->VerifyActualNumParallelSequences(dataReader->GetNumParallelSequences()); +#endif +#if 0 // a VERY TELLING piece of code + // packing flags = frame-wise or over all streams of start and end + for (size_t nt = 0; nt < nMBSize; nt++) + { + for (size_t ns = 0; ns < nSlices; ns++) + { + if (newBoundary(ns, nt) == ((int) MinibatchPackingFlags::SequenceStart)) + pMBLayout->m_minibatchPackingFlags[nt] |= MinibatchPackingFlags::SequenceStart; + if (newBoundary(ns, nt) == ((int) MinibatchPackingFlags::SequenceEnd)) + pMBLayout->m_minibatchPackingFlags[nt] |= MinibatchPackingFlags::SequenceEnd; + } + } +#endif + }; + typedef std::shared_ptr MBLayoutPtr; + }}} From 5d1c0ce0416a892e08d3f62b28fba2e0e36cb569 Mon Sep 17 00:00:00 2001 From: Alexey Kamenev Date: Wed, 16 Sep 2015 15:46:09 -0700 Subject: [PATCH 31/44] Merged FSAdaGrad with lastest master changes --- MachineLearning/CNTKSGDLib/SGD.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MachineLearning/CNTKSGDLib/SGD.h b/MachineLearning/CNTKSGDLib/SGD.h index c27cd405c4c5..cbf3717cd891 100644 --- a/MachineLearning/CNTKSGDLib/SGD.h +++ b/MachineLearning/CNTKSGDLib/SGD.h @@ -43,7 +43,8 @@ enum class GradientsUpdateType : int { None, AdaGrad, - RmsProp + RmsProp, + FSAdaGrad }; // TODO: While currently combining these methods is not supported, From 87e6adaef19f96e84e305aefb830bf5f2923cba5 Mon Sep 17 00:00:00 2001 From: Alexey Kamenev Date: Mon, 21 Sep 2015 14:48:31 -0700 Subject: [PATCH 32/44] Fix merge issues. 
--- MachineLearning/CNTKSGDLib/SGD.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MachineLearning/CNTKSGDLib/SGD.cpp b/MachineLearning/CNTKSGDLib/SGD.cpp index 6bcaa2b9bd61..5133bca2d77b 100644 --- a/MachineLearning/CNTKSGDLib/SGD.cpp +++ b/MachineLearning/CNTKSGDLib/SGD.cpp @@ -2402,7 +2402,7 @@ template } else if (adpType == GradientsUpdateType::FSAdaGrad) { - smoothedGradient.FSAdagrad(actualMBSize, gradientValues, functionValues, learnRatePerSample, momentum); + smoothedGradient.FSAdagrad(actualMBSize, gradientValues, functionValues, (ElemType)learnRatePerSample, (ElemType)momentum); } else if (adpType == GradientsUpdateType::RmsProp) { From 188f392d9ae987b310d5605409956d1f9c439ee2 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Mon, 21 Sep 2015 16:08:47 -0700 Subject: [PATCH 33/44] renamed m_nodesReqMultiSeqHandling to m_requestNodesMultiSeqHandling, likewise SetRequestNodesMultiSeqHandling(), to indicate that this is a user request (as opposed to req-uired) --- MachineLearning/CNTK/ModelEditLanguage.cpp | 2 +- MachineLearning/CNTK/NDLUtil.h | 2 +- .../CNTK/SynchronousExecutionEngine.h | 2 +- .../ComputationNetwork.cpp | 18 ++++++++++-------- .../ComputationNetwork.h | 16 ++++++++-------- .../ComputationNode.h | 12 ++++++++---- .../InputAndParamNodes.h | 2 +- .../NetworkBuilderFromConfig.cpp | 2 +- .../CNTKComputationNetworkLib/RecurrentNodes.h | 5 ++--- .../TrainingCriterionNodes.h | 6 +++--- Math/Math/Matrix.h | 4 ++-- 11 files changed, 38 insertions(+), 33 deletions(-) diff --git a/MachineLearning/CNTK/ModelEditLanguage.cpp b/MachineLearning/CNTK/ModelEditLanguage.cpp index 6c700133ebf1..ed4b2ed234ad 100644 --- a/MachineLearning/CNTK/ModelEditLanguage.cpp +++ b/MachineLearning/CNTK/ModelEditLanguage.cpp @@ -472,7 +472,7 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa case melPropMultiSeqHandling: { bool set = params[2]; - SetProperty(node, cn->NodesReqMultiSeqHandling(), set); + SetProperty(node, cn->RequestNodesMultiSeqHandling(), set); break; } case melPropEvaluation: diff --git a/MachineLearning/CNTK/NDLUtil.h b/MachineLearning/CNTK/NDLUtil.h index e0a7dd1b47cb..b4435f2dd812 100644 --- a/MachineLearning/CNTK/NDLUtil.h +++ b/MachineLearning/CNTK/NDLUtil.h @@ -178,7 +178,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { CheckOutputNodes(script, "FeatureNodes", m_net->FeatureNodes()); CheckOutputNodes(script, "LabelNodes", m_net->LabelNodes()); CheckOutputNodes(script, "CriteriaNodes", m_net->FinalCriterionNodes()); - CheckOutputNodes(script, "NodesReqMultiSeqHandling", m_net->NodesReqMultiSeqHandling()); + CheckOutputNodes(script, "NodesReqMultiSeqHandling", m_net->RequestNodesMultiSeqHandling()); CheckOutputNodes(script, "EvalNodes", m_net->EvaluationNodes()); CheckOutputNodes(script, "OutputNodes", m_net->OutputNodes()); } diff --git a/MachineLearning/CNTK/SynchronousExecutionEngine.h b/MachineLearning/CNTK/SynchronousExecutionEngine.h index 195c57643d61..43f82c38c436 100644 --- a/MachineLearning/CNTK/SynchronousExecutionEngine.h +++ b/MachineLearning/CNTK/SynchronousExecutionEngine.h @@ -288,7 +288,7 @@ class SynchronousNodeEvaluator : public NDLNodeEvaluator } else if (!_stricmp(value.c_str(), "multiseq")) { - SetOutputNode(m_net.NodesReqMultiSeqHandling(), compNode); + SetOutputNode(m_net.RequestNodesMultiSeqHandling(), compNode); } else if (!_strnicmp(value.c_str(), "eval", 4)) // only compare the first 4 characters { diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp 
b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp
index cbcb2ed1ee95..eb81d56579b7 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp
@@ -126,9 +126,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ECriteriaNodes");

 fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BNodesReqMultiSeqHandling");
- fstream << m_nodesReqMultiSeqHandling.size();
- for (size_t i = 0; i < m_nodesReqMultiSeqHandling.size(); i++)
- fstream << m_nodesReqMultiSeqHandling[i]->NodeName();
+ fstream << m_requestNodesMultiSeqHandling.size();
+ for (size_t i = 0; i < m_requestNodesMultiSeqHandling.size(); i++)
+ fstream << m_requestNodesMultiSeqHandling[i]->NodeName();
 fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ENodesReqMultiSeqHandling");

 fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BEvalNodes");
@@ -326,11 +326,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 return false;
 }

+ // Some nodes always need m_reqMultiSeqHandling; those set it themselves. Basically RecurrentNode only currently (besides PairNode and LSTMNode).
+ // Some nodes need it to be set xxx.
 // TODO: comment on who owns this flag. Is it entirely owned by Network?
 // Or should the 4 node types below know?
- void ComputationNetwork::SetNodesReqMultiSeqHandling()
+ void ComputationNetwork::SetRequestNodesMultiSeqHandling()
 {
- for (auto & node : m_nodesReqMultiSeqHandling)
+ for (auto & node : m_requestNodesMultiSeqHandling) // this set is defined in NDL; here we propagate that into the actual nodes' flags, except for a few where it makes no sense (avoid user error)
 {
 //SumElements node will generate a scalar value and so it should never require special handling
 //TransposeNode will change the size of columns and so it should also not be included for special handling
@@ -1018,7 +1020,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 for (size_t i = 0; i < num; i++)
 {
 fstream >> nodeName;
- m_nodesReqMultiSeqHandling.push_back(GetNodeFromName(nodeName));
+ m_requestNodesMultiSeqHandling.push_back(GetNodeFromName(nodeName));
 }
 fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ENodesReqMultiSeqHandling");
 }
@@ -1160,7 +1162,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 // criteria
 fstream << FormSpecialNodes(dotcfg.m_CriteriaStyle, m_finalCriteria);
 // nodes that require multi sequence handling
- fstream << FormSpecialNodes(dotcfg.m_nodesReqMultiSeqHandlingStyle, m_nodesReqMultiSeqHandling);
+ fstream << FormSpecialNodes(dotcfg.m_nodesReqMultiSeqHandlingStyle, m_requestNodesMultiSeqHandling);
 // pre-compute nodes
 fstream << FormSpecialNodes(dotcfg.m_PrecomputingNodeStyle, PreComputedNodes);
 // PastValue nodes
@@ -1203,7 +1205,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 line.clear();
 for (const auto & x : m_finalCriteria)
 line = line + msra::strfun::wstrprintf(L"\"%ls\" ", x->GetName().c_str());
- for (const auto & x : m_nodesReqMultiSeqHandling)
+ for (const auto & x : m_requestNodesMultiSeqHandling)
 line = line + msra::strfun::wstrprintf(L"\"%ls\" ", x->GetName().c_str());
 for (const auto & x : m_outputNodes)
 line = line + msra::strfun::wstrprintf(L"\"%ls\" ", x->GetName().c_str());
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
index afe489328c04..b679dca2b110 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
@@ -529,7 +529,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
 bool IsTypicalCriterionNode(ComputationNodeBasePtr nodePtr);
- void SetNodesReqMultiSeqHandling();
+ void SetRequestNodesMultiSeqHandling();

 // MAIN ENTRY POINT for evaluation (forward prop)
 // TODO: pass a set of nodes instead of only one
@@ -876,10 +876,10 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
 return std::vector<ComputationNodeBasePtr> { node };
 }

- inline std::vector & NodesReqMultiSeqHandling() { return m_nodesReqMultiSeqHandling; }
- inline std::vector & EvaluationNodes() { return m_evalNodes; }
- inline std::vector & OutputNodes() { return m_outputNodes; }
- inline std::vector & PairNodes() { return m_pairNodes; }
+ inline std::vector & RequestNodesMultiSeqHandling() { return m_requestNodesMultiSeqHandling; } // user-specified list 'NodesReqMultiSeqHandling' (NDL and MEL create/modify this list)
+ inline std::vector & EvaluationNodes() { return m_evalNodes; }
+ inline std::vector & OutputNodes() { return m_outputNodes; }
+ inline std::vector & PairNodes() { return m_pairNodes; }

 inline std::vector & RecurrentNodes() { return m_recurrentInfo; }
@@ -1181,7 +1181,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
 CollectInputAndLearnableParameters(rootNode);

 //
- SetNodesReqMultiSeqHandling();
+ SetRequestNodesMultiSeqHandling();
 }

 //this function will need to be called before actual validation and execution to
@@ -1549,10 +1549,10 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
 std::vector m_evalNodes;
 std::vector m_outputNodes;
 std::vector m_pairNodes; /// nodes for the children network to pair
- std::vector m_nodesReqMultiSeqHandling;
+ std::vector m_requestNodesMultiSeqHandling;
 vector<std::vector<ComputationNodeBasePtr>*> GetAllNodeGroups() // get all groups to allow to iterate over all of them ...continue
 {
- return vector<std::vector<ComputationNodeBasePtr>*> { &m_features, &m_labels, &m_finalCriteria, &m_evalNodes, &m_outputNodes, &m_pairNodes, &m_nodesReqMultiSeqHandling };
+ return vector<std::vector<ComputationNodeBasePtr>*> { &m_features, &m_labels, &m_finalCriteria, &m_evalNodes, &m_outputNodes, &m_pairNodes, &m_requestNodesMultiSeqHandling };
 }

 std::vector m_recurrentInfo; // [index--TODO: comment what this is indexed with]
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
index 71e06618c46e..47e798ef783c 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
@@ -625,6 +625,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 DEVICEID_TYPE m_deviceId; //CPU=-1, >=0 GPU
 bool m_needGradient; //only used for leaf, i.e., learnable parameters, etc.
 bool m_reqMultiSeqHandling; // indicates whether the results of operation should be masked to handle the cases that the utterances have different lengths when grouped together as a minibatch.
+ // ^^ This decides whether the node gets passed the full layout with flags or only the one without flags
+ // and this is only ever tested in MaskToZeroWhenLabelAndFeatureMissing(), of which two versions exist, one in ComputationNode and one in ClassBasedCrossEntropyWithSoftmaxNode
+ // TODO: rename this to reflect that it affects only masking
 size_t m_inputWidth, m_inputHeight, m_inputChannels; //how to interpret each column in the input as an image
 size_t m_outputWidth, m_outputHeight, m_outputChannels; //how to interpret each column in the output as an image
@@ -806,7 +809,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
 EvaluateThisNode(); // this is a call to the virtual function that implements the actual operation
- if (!UseCustomizedMultiSeqHandling())
+ if (!UseCustomizedMultiSeqHandling()) // this means the node does it by itself; if not, we do it for the node
 MaskToZeroWhenLabelAndFeatureMissing(m_functionValues);
 }
@@ -857,7 +860,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
 bool processedExistsNoLabelorFeatureMissing = false; /// set to true if either nolabel or feature missing is processed
- if (m_pMBLayout && !m_pMBLayout->IsAllNone())
+ if (!m_pMBLayout->IsAllNone())
 {
 size_t nT = matrixToBeMasked.GetNumCols();
 size_t nS = m_pMBLayout->GetNumParallelSequences();
@@ -871,10 +874,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
 size_t t = utt_t / nS;
- if (m_pMBLayout->Is(t, MinibatchPackingFlags::NoLabel))
+ if (m_pMBLayout->Is(t, MinibatchPackingFlags::NoLabel | MinibatchPackingFlags::NoFeatures))
 {
 for (size_t id = 0; id < nS; id++)
- if (m_pMBLayout->Is(id, t, MinibatchPackingFlags::NoLabel))
+ if (m_pMBLayout->Is(id, t, MinibatchPackingFlags::NoLabel | MinibatchPackingFlags::NoFeatures))
 matrixToBeMasked.ColumnSlice(utt_t+id, 1).SetValue(0);
 processedExistsNoLabelorFeatureMissing = true;
 }
@@ -1212,6 +1215,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 virtual void SetErrorsFromFutureMinibatch(Matrix&) {}
 // indicates whether special handling is needed. The standard handling will just mask the function values after the evaluation and mask the gradient before gradient computation for the children. This is not valid for all criterion nodes whose result is a scalar.
+ // defined by training/eval criteria (and the soon-to-be-deprecated PairNode, LSTMNode) virtual bool UseCustomizedMultiSeqHandling() { return false; } protected: diff --git a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h index aa29b045369a..8913a71f9f94 100644 --- a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h @@ -530,7 +530,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { typedef ComputationNode Base; UsingComputationNodeMembers; void Init(size_t row_size, size_t col_size) { - m_reqMultiSeqHandling = true; + SetReqMultiSeqHandlingTo(true); m_functionValues.Resize(row_size, col_size); } public: diff --git a/MachineLearning/CNTKComputationNetworkLib/NetworkBuilderFromConfig.cpp b/MachineLearning/CNTKComputationNetworkLib/NetworkBuilderFromConfig.cpp index d2d04b7ad0d7..c20b83c758e4 100644 --- a/MachineLearning/CNTKComputationNetworkLib/NetworkBuilderFromConfig.cpp +++ b/MachineLearning/CNTKComputationNetworkLib/NetworkBuilderFromConfig.cpp @@ -677,7 +677,7 @@ namespace Microsoft { namespace MSR { namespace ScriptableObjects { else if (!_wcsnicmp(tag.c_str(), L"eval", 4)) net->EvaluationNodes().push_back(node); // eval* else if (tag == L"output") net->OutputNodes().push_back(node); else if (tag == L"pair") net->PairNodes().push_back(node); // TODO: I made this up; the original code in SynchronousExecutionEngine did not have this - else if (tag == L"multiseq") net->NodesReqMultiSeqHandling().push_back(node); + else if (tag == L"multiseq") net->RequestNodesMultiSeqHandling().push_back(node); else if (!tag.empty()) RuntimeError("ComputationNetwork: unknown tag '%ls'", tag.c_str()); // TODO: are there nodes without tag? Where do they go? 
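The 'multiseq' tag above only records a request; the actual work is done by MaskToZeroWhenLabelAndFeatureMissing() (renamed to MaskMissingColumnsToZero() later in this series). To make the packing scheme concrete, here is a small self-contained sketch of the idea, independent of the classes above: nS parallel sequences are packed as interleaved columns (column index = t * nS + s), and columns flagged as gaps carry garbage that must be zeroed before any reduce-style operation. The flag values and the column-major layout are simplified assumptions chosen to mirror the code in this series:

#include <cstddef>
#include <vector>

enum class PackingFlags : char { None = 0, NoFeature = 1, NoLabel = 2 };

// bitmask test, cf. the MBLayout::Is() functions in this series
inline bool Is(PackingFlags a, PackingFlags b) { return ((char)a & (char)b) != 0; }

// data is column-major with nRows rows and nS * nT columns; flags has one entry per column.
void MaskMissingColumnsToZeroSketch(std::vector<double>& data, size_t nRows,
                                    const std::vector<PackingFlags>& flags, size_t nS, size_t nT)
{
    for (size_t t = 0; t < nT; t++)
        for (size_t s = 0; s < nS; s++)
        {
            size_t col = t * nS + s;                 // column of sequence s at time t
            if (Is(flags[col], PackingFlags::NoLabel) || Is(flags[col], PackingFlags::NoFeature))
                for (size_t r = 0; r < nRows; r++)
                    data[col * nRows + r] = 0;       // zero out the gap column
        }
}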
diff --git a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h index 9cdf2f4accd4..cba78b3c271d 100644 --- a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h @@ -36,7 +36,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { private: void Init(size_t row_size, size_t col_size, ElemType initialActivationValue = (ElemType)DEFAULT_HIDDEN_ACTIVATION) { - m_reqMultiSeqHandling = true; + SetReqMultiSeqHandlingTo(true); m_initialActivationValue = initialActivationValue; m_timeStep = 1; m_functionValues.Resize(row_size, col_size); @@ -517,7 +517,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_use_errors_from_future_minibatch(false), m_DefaultState((ElemType)DEFAULT_HIDDEN_ACTIVATION) { - m_reqMultiSeqHandling = true; + SetReqMultiSeqHandlingTo(true); } virtual const std::wstring OperationName() const { return TypeName(); } @@ -561,7 +561,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { node->m_use_errors_from_future_minibatch = m_use_errors_from_future_minibatch; node->m_DefaultState = m_DefaultState; - node->m_reqMultiSeqHandling = m_reqMultiSeqHandling; } } diff --git a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h index ab04316ca667..c0a64c77743d 100644 --- a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h @@ -1043,7 +1043,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix softMax_t = softMax.ColumnSlice(sz, nbr_wrd); Matrix logSoftMax_t = logSoftmax.ColumnSlice(sz, nbr_wrd); - if (curNode->MaskToZeroWhenLabelAndFeatureMissing(logSoftMax_t, t) == false) + if (!curNode->MaskToZeroWhenLabelAndFeatureMissing(logSoftMax_t, t)) { Matrix obs = inputs.ColumnSlice(t, 1); /// e.g., 200 x 1 obs.Reshape(1, nRow); /// 1 x 200 @@ -1065,7 +1065,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } /// add the class log posterior probability - if (curNode->MaskToZeroWhenLabelAndFeatureMissing(clsLogSoftmax, t) == false) + if (!curNode->MaskToZeroWhenLabelAndFeatureMissing(clsLogSoftmax, t)) { try{ Matrix::AddElementToElement(clsLogSoftmax, c_t, t, functionValues, 0, 0); @@ -1095,7 +1095,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { bool processedExistsNoLabelorFeatureMissing = false; /// set to true if either nolabel or feature missing is processed - if (m_pMBLayout && !m_pMBLayout->IsAllNone()) + if (!m_pMBLayout->IsAllNone()) { // 't' is not a time but rather a column index that encodes (time stamp, stream) size_t nS = m_pMBLayout->GetNumParallelSequences(); diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h index 75c4798eb181..0185c4ebebe9 100644 --- a/Math/Math/Matrix.h +++ b/Math/Math/Matrix.h @@ -503,8 +503,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // ^^ This is some form of aggregate of m_sentenceBoundaryFlags taken over all streams. 
TODO: find out the exact condition public: - bool Is(size_t t, MinibatchPackingFlags f) const { return m_minibatchPackingFlags[t] & f; } - bool Is(size_t id, size_t t, MinibatchPackingFlags f) const { return ((MinibatchPackingFlags)(int)m_sentenceBoundaryFlags(id, t)) & f; } + bool Is(size_t t, MinibatchPackingFlags f) const { return (m_minibatchPackingFlags[t] & f) != 0; } + bool Is(size_t id, size_t t, MinibatchPackingFlags f) const { return (((MinibatchPackingFlags)(int)m_sentenceBoundaryFlags(id, t)) & f) != 0; } // get info for one frame; used in DelayedValueNode // TODO: clean this up, we can do this more nicely From b72a1eb9ee07662d78330bda966b7e62a28a20d9 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Mon, 21 Sep 2015 16:47:41 -0700 Subject: [PATCH 34/44] renamed MaskToZeroWhenLabelAndFeatureMissing() to MaskMissingColumnsToZero() and commented what it does; and merged it with ClassBasedCrossEntropyWithSoftmaxNode::MaskToZeroWhenLabelAndFeatureMissing() which was nearly identical --- .../ComputationNode.h | 47 +++++++++++-------- .../EvaluationCriterionNodes.h | 4 +- .../TrainingCriterionNodes.h | 40 ++++++++-------- Math/Math/Matrix.h | 1 + 4 files changed, 51 insertions(+), 41 deletions(-) diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index 47e798ef783c..09575913da3e 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -127,7 +127,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // evaluate only N frames at time index timeIdxInSeq // Normally, N is 1 or it spans the entire minibatch. virtual void EvaluateThisNode(const FrameRange &) = 0; - // evaluate a node--this calls EvaluateThisNode() and MaskToZeroWhenLabelAndFeatureMissing() if needed + // evaluate a node--this calls EvaluateThisNode() and MaskMissingColumnsToZero() if needed // this is the main entry point for Network; while EvaluateThisNode() is the virtual call into specific node implementation virtual void EvaluateThisNodeGivenInputs() = 0; virtual void EvaluateThisNodeGivenInputs(const size_t timeIdxInSeq) = 0; // TODO: change to FrameRange as well @@ -626,7 +626,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool m_needGradient; //only used for leaf, i.e., learnable parameters, etc. bool m_reqMultiSeqHandling; // indicates whether the results of operation should be masked to handle the cases that the utterances have different lengths when grouped together as a minibatch. 
// ^^ This decides whether the node gets passed the full layout with flags or only the one without flags - // and this is only ever tested in MaskToZeroWhenLabelAndFeatureMissing(), of which two versions exist, one in ComputationNode and one in ClassBasedCrossEntropyWithSoftmaxNode + // and this is only ever tested in MaskMissingColumnsToZero(), of which two versions exist, one in ComputationNode and one in ClassBasedCrossEntropyWithSoftmaxNode // TODO: rename this to reflect that it affects only masking size_t m_inputWidth, m_inputHeight, m_inputChannels; //how to interpret each column in the input as an image size_t m_outputWidth, m_outputHeight, m_outputChannels; //how to interpret each column in the output as an image @@ -810,7 +810,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { EvaluateThisNode(); // this is a call to the virtual function that implements the actual operation if (!UseCustomizedMultiSeqHandling()) // this means the node does it by itself; if not, we do it for the node - MaskToZeroWhenLabelAndFeatureMissing(m_functionValues); + MaskMissingColumnsToZero(m_functionValues); } // TODO: use a FrameRange arg, then unify with above @@ -821,7 +821,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { EvaluateThisNode(FrameRange(timeIdxInSeq, GetNumParallelSequences())); if (!UseCustomizedMultiSeqHandling()) - MaskToZeroWhenLabelAndFeatureMissing(m_functionValues, timeIdxInSeq); + MaskMissingColumnsToZero(m_functionValues, timeIdxInSeq); } #if 0 // (this function cannot be used currently since sentenceBegin is not a Matrix anymore; only affects LSTMNode which is no longer used) @@ -855,30 +855,37 @@ namespace Microsoft { namespace MSR { namespace CNTK { /** reset to error signals to 0 for any elements without labels */ - // TODO: use a FrameRange instead of timeIdxSeq - bool MaskToZeroWhenLabelAndFeatureMissing(Matrix& matrixToBeMasked, const size_t timeIdxInSeq=(size_t)-1) const + // This sets MB columns to 0 that have the NoLabel or NoFeature flag set. + // This happens as a result of packing multiple sequences for parallel processing--there will be some gaps, which are flagged by these flags. + // Nodes that operate in 'map' style (input(j) -> output(j) independently) can ignore this; it will be garbage-in-garbage-out. + // However, nodes that 'reduce' minibatches (e.g. computing the sum of all frames across all sequences) must deal with the garbage. + // This function sets those to 0, assuming that now they can be reduced without affecting the result. + // This function can operate on the whole range or on a selected single frame and/or a single sequence. + bool MaskMissingColumnsToZero(Matrix& matrixToBeMasked, size_t timeIdxInSeq = SIZE_MAX, size_t seqIndex = SIZE_MAX) const { bool processedExistsNoLabelorFeatureMissing = false; /// set to true if either nolabel or feature missing is processed if (!m_pMBLayout->IsAllNone()) { - size_t nT = matrixToBeMasked.GetNumCols(); + size_t nT = m_pMBLayout->GetNumTimeSteps(); size_t nS = m_pMBLayout->GetNumParallelSequences(); - if (m_pMBLayout->GetSize() != nT / nS) - LogicError("MaskToZeroWhenLabelAndFeatureMissing: m_pMBLayout->m_minibatchPackingFlags should have one element for each timestep of all streams. Check feature reader. "); + if (matrixToBeMasked.GetNumCols() != nT * nS) + LogicError("MaskMissingColumnsToZero: m_pMBLayout->m_minibatchPackingFlags should have one element for each timestep of all streams. Check feature reader. "); - size_t startT = (timeIdxInSeq == (size_t)-1) ? 
0 : timeIdxInSeq * nS; // TODO: misnomer; startT, endT, and utt_t are not times but columns in the packed matrix - size_t endT = (timeIdxInSeq == (size_t)-1) ? nT : timeIdxInSeq * nS + nS; - for (size_t utt_t = startT; utt_t < endT; utt_t += nS) - { - size_t t = utt_t / nS; + size_t startT = (timeIdxInSeq == SIZE_MAX) ? 0 : timeIdxInSeq; + size_t endT = (timeIdxInSeq == SIZE_MAX) ? nT : timeIdxInSeq + 1; + + size_t startS = (seqIndex == SIZE_MAX) ? 0 : seqIndex; + size_t endS = (seqIndex == SIZE_MAX) ? nS : seqIndex + 1; - if (m_pMBLayout->Is(t, MinibatchPackingFlags::NoLabel | MinibatchPackingFlags::NoFeatures)) + for (size_t t = startT; t < endT; t++) + { + if (m_pMBLayout->Is(t, MinibatchPackingFlags::NoLabel | MinibatchPackingFlags::NoFeature)) { - for (size_t id = 0; id < nS; id++) - if (m_pMBLayout->Is(id, t, MinibatchPackingFlags::NoLabel | MinibatchPackingFlags::NoFeatures)) - matrixToBeMasked.ColumnSlice(utt_t+id, 1).SetValue(0); + for (size_t id = startS; id < endS; id++) + if (m_pMBLayout->Is(id, t, MinibatchPackingFlags::NoLabel | MinibatchPackingFlags::NoFeature)) + matrixToBeMasked.ColumnSlice(t * nS + id, 1).SetValue(0); processedExistsNoLabelorFeatureMissing = true; } } @@ -1050,7 +1057,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t i=0; iNeedGradient()) @@ -1079,7 +1086,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t i=0; iNeedGradient()) diff --git a/MachineLearning/CNTKComputationNetworkLib/EvaluationCriterionNodes.h b/MachineLearning/CNTKComputationNetworkLib/EvaluationCriterionNodes.h index a60ea5c604e1..a765ca6d4a8c 100644 --- a/MachineLearning/CNTKComputationNetworkLib/EvaluationCriterionNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/EvaluationCriterionNodes.h @@ -48,8 +48,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { inputFunctionValues0.VectorMax(maxIndexes0, maxValues, true); inputFunctionValues1.VectorMax(maxIndexes1, maxValues, true); - curNode->MaskToZeroWhenLabelAndFeatureMissing(maxIndexes0); //we are fine since it will only be called with full minibatch - curNode->MaskToZeroWhenLabelAndFeatureMissing(maxIndexes1); + curNode->MaskMissingColumnsToZero(maxIndexes0); //we are fine since it will only be called with full minibatch + curNode->MaskMissingColumnsToZero(maxIndexes1); functionValues.AssignNumOfDiff(maxIndexes0, maxIndexes1); #if NANCHECK functionValues.HasNan("ErrorPrediction"); diff --git a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h index c0a64c77743d..a13126e33628 100644 --- a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h @@ -60,7 +60,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& inputFunctionValues0, const Matrix& inputFunctionValues1, Matrix& leftMinusRight, ComputationNodePtr curNode) { leftMinusRight.AssignDifferenceOf(inputFunctionValues0, inputFunctionValues1); - curNode->MaskToZeroWhenLabelAndFeatureMissing(leftMinusRight); //we are fine since it will only be called with full minibatch. + curNode->MaskMissingColumnsToZero(leftMinusRight); //we are fine since it will only be called with full minibatch. 
ElemType v = leftMinusRight.FrobeniusNorm(); functionValues.Resize(1,1); functionValues.SetValue(v*v/2); @@ -174,7 +174,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { else { ComputeInputPartialRight(m_softmaxOfRight, Inputs(0)->FunctionValues(), Inputs(inputIndex)->GradientValues(), GradientValues()); - Base::MaskToZeroWhenLabelAndFeatureMissing(Inputs(inputIndex)->GradientValues()); + Base::MaskMissingColumnsToZero(Inputs(inputIndex)->GradientValues()); } } @@ -221,7 +221,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { logSoftmaxOfRight.AssignLogSoftmaxOf(inputFunctionValues1, true); softmaxOfRight.SetValue(logSoftmaxOfRight); softmaxOfRight.InplaceExp(); - curNode->MaskToZeroWhenLabelAndFeatureMissing(logSoftmaxOfRight); //we are fine here since it will be called only with full minibatch + curNode->MaskMissingColumnsToZero(logSoftmaxOfRight); //we are fine here since it will be called only with full minibatch functionValues.AssignInnerProductOfMatrices(inputFunctionValues0, logSoftmaxOfRight); functionValues*=(-1); #if NANCHECK @@ -363,7 +363,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix& inputGradientValues, const Matrix& gradientValues, ComputationNodePtr curNode) { leftDivRight.AssignElementDivisionOf(inputFunctionValues0, inputFunctionValues1); - curNode->MaskToZeroWhenLabelAndFeatureMissing(leftDivRight); + curNode->MaskMissingColumnsToZero(leftDivRight); Matrix::ScaleAndAdd(-gradientValues.Get00Element(), leftDivRight, inputGradientValues); } @@ -377,7 +377,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { logOfRight.SetValue(inputFunctionValues1); logOfRight.InplaceLog(); - curNode->MaskToZeroWhenLabelAndFeatureMissing(logOfRight); + curNode->MaskMissingColumnsToZero(logOfRight); functionValues.AssignInnerProductOfMatrices(inputFunctionValues0, logOfRight); functionValues*=(-1); #if NANCHECK @@ -503,7 +503,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void EvaluateThisNode() { - Base::MaskToZeroWhenLabelAndFeatureMissing(Inputs(0)->FunctionValues()); + Base::MaskMissingColumnsToZero(Inputs(0)->FunctionValues()); EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues()); } @@ -599,7 +599,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void EvaluateThisNode() { - Base::MaskToZeroWhenLabelAndFeatureMissing(Inputs(0)->FunctionValues()); + Base::MaskMissingColumnsToZero(Inputs(0)->FunctionValues()); EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues()); } @@ -1043,7 +1043,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix softMax_t = softMax.ColumnSlice(sz, nbr_wrd); Matrix logSoftMax_t = logSoftmax.ColumnSlice(sz, nbr_wrd); - if (!curNode->MaskToZeroWhenLabelAndFeatureMissing(logSoftMax_t, t)) + if (!curNode->MaskMissingColumnsToZero(logSoftMax_t, t)) { Matrix obs = inputs.ColumnSlice(t, 1); /// e.g., 200 x 1 obs.Reshape(1, nRow); /// 1 x 200 @@ -1065,7 +1065,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } /// add the class log posterior probability - if (!curNode->MaskToZeroWhenLabelAndFeatureMissing(clsLogSoftmax, t)) + if (!curNode->MaskMissingColumnsToZero(clsLogSoftmax, t)) { try{ Matrix::AddElementToElement(clsLogSoftmax, c_t, t, functionValues, 0, 0); @@ -1090,28 +1090,30 @@ namespace Microsoft { namespace MSR { namespace CNTK { /** reset to error signals to 0 for any elements without labels */ - // TODO: This has overlap with ComputationNode::MaskToZeroWhenLabelAndFeatureMissing(), should call that instead. 
Note: This one does only one stream, while Base:: one does all streams. - bool MaskToZeroWhenLabelAndFeatureMissing(Matrix& matrixToBeMasked, const size_t t) const - { + // BUGBUG: the layout should be that of matrixToBeMasked, not of 'this' + bool MaskMissingColumnsToZero(Matrix& matrixToBeMasked, const size_t j) const + { + size_t nS = m_pMBLayout->GetNumParallelSequences(); + size_t t = j / nS; // this is the time stamp + size_t id = j % nS; // this is the stream + return Base::MaskMissingColumnsToZero(matrixToBeMasked, t, id); +#if 0 // old version prior to merging with Base version bool processedExistsNoLabelorFeatureMissing = false; /// set to true if either nolabel or feature missing is processed if (!m_pMBLayout->IsAllNone()) { - // 't' is not a time but rather a column index that encodes (time stamp, stream) - size_t nS = m_pMBLayout->GetNumParallelSequences(); - size_t j = t / nS; // this is the time stamp - size_t i = t % nS; // this is the stream - if (m_pMBLayout->Is(j, MinibatchPackingFlags::NoLabel)) // TODO: this outer test is redundant here, no? + if (m_pMBLayout->Is(t, MinibatchPackingFlags::NoLabel)) // TODO: this outer test is redundant here, no? { - if (m_pMBLayout->Is(i, j, MinibatchPackingFlags::NoLabel)) + if (m_pMBLayout->Is(id, t, MinibatchPackingFlags::NoLabel)) { - matrixToBeMasked.ColumnSlice(t,1).SetValue(0); + matrixToBeMasked.ColumnSlice(t * nS + id,1).SetValue(0); processedExistsNoLabelorFeatureMissing = true; } } } return processedExistsNoLabelorFeatureMissing; +#endif } /** diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h index 0185c4ebebe9..75aba9597a1d 100644 --- a/Math/Math/Matrix.h +++ b/Math/Math/Matrix.h @@ -504,6 +504,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { public: bool Is(size_t t, MinibatchPackingFlags f) const { return (m_minibatchPackingFlags[t] & f) != 0; } + // TODO: swap id and t; t is the more important parameter bool Is(size_t id, size_t t, MinibatchPackingFlags f) const { return (((MinibatchPackingFlags)(int)m_sentenceBoundaryFlags(id, t)) & f) != 0; } // get info for one frame; used in DelayedValueNode From c17079ed5f1bb2bfacfbd287114bcecc1cbaa59d Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Mon, 21 Sep 2015 17:18:09 -0700 Subject: [PATCH 35/44] more renaming for clarity regarding masking: SetReqMultiSeqHandlingTo(true) -> SetMaskMissingColumnsToZero(), ReqMultiSeqHandling() -> NeedToMaskMissingColumnsToZero(), likewise m_maskMissingColumnsToZero --- .../ComputationNetwork.cpp | 13 ++++++------- .../ComputationNetwork.h | 6 +++++- .../ComputationNode.h | 19 +++++++++++-------- .../InputAndParamNodes.h | 2 +- .../RecurrentNodes.h | 4 ++-- 5 files changed, 25 insertions(+), 19 deletions(-) diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp index eb81d56579b7..42c73d02263a 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp @@ -326,10 +326,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { return false; } - // Some nodes always need m_reqMultiSeqHandling; those set it themselves. Basically RecurrentNode only currently (besides PairNode and LSTMNode). - // Some nodes need it to be set xxx. - // TODO: comment on who owns this flag. Is it entirely owned by Network? - // Or should the 4 node types below know? 
+ // transfer user-specified request for masking to the individual nodes
+ // This is only needed if users explicitly perform reduce-like operations.
+ // It makes no sense for some nodes, so we skip those.
 void ComputationNetwork::SetRequestNodesMultiSeqHandling()
 {
 for (auto & node : m_requestNodesMultiSeqHandling) // this set is defined in NDL; here we propagate that into the actual nodes' flags, except for a few where it makes no sense (avoid user error)
@@ -342,18 +341,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 node->OperationName() != OperationNameOf(MeanNode) &&
 node->OperationName() != OperationNameOf(InvStdDevNode)
 )
- node->SetReqMultiSeqHandlingTo(true);
+ node->SetMaskMissingColumnsToZero();
 }

 //if a typical criterion node is used as the training criterion node we assume it requires multiseq handling
 //this is for backward compatibility
 for (auto & node : m_finalCriteria)
 if (IsTypicalCriterionNode(node))
- node->SetReqMultiSeqHandlingTo(true);
+ node->SetMaskMissingColumnsToZero();

 for (auto & node : m_evalNodes)
 if (IsTypicalCriterionNode(node))
- node->SetReqMultiSeqHandlingTo(true);
+ node->SetMaskMissingColumnsToZero();
 }

 template void ComputationNetwork::GetNodesRequiringX(std::list & nodesRequirePreComputation, const ComputationNodeBasePtr rootNode, bool checkComputed)
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
index b679dca2b110..7606784006ed 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
@@ -562,7 +562,11 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
 // TODO: in the future, these will be different on different nodes; and probably should be propagated by nodes themselves, like functionValues
 for (auto nodeIter = allNodes.begin(); nodeIter != allNodes.end(); nodeIter++)
 {
- if ((*nodeIter)->ReqMultiSeqHandling())
+ // TODO: we should just always set the real layout; the nodes themselves should know to ignore it based on NeedToMaskMissingColumnsToZero()
+ // MaskMissingColumnsToZero() will test whether the layout is all none, and then skip.
+ // This is the only place where ResetBound() is ever called on a node. Hence, we could test NeedToMaskMissingColumnsToZero() instead.
+ // Note that NeedToMaskMissingColumnsToZero() is true only where it is necessary; that is, most nodes have it set to false (since most nodes can just map garbage-in-garbage-out).
+ if ((*nodeIter)->NeedToMaskMissingColumnsToZero())
 (*nodeIter)->ResetBound(m_pMBLayout);
 else
 (*nodeIter)->ResetBound(m_pMBNoLayout);

 (*nodeIter)->VerifyNumParallelSequences(GetNumParallelSequences());
 }
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
index 09575913da3e..887da5e18abe 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
@@ -79,7 +79,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 m_indexInLoop(0),
 m_visited(false),
 m_inStack(false),
- m_reqMultiSeqHandling(false),
+ m_maskMissingColumnsToZero(false),
 m_nodeName(name == L"" ?
CreateUniqNodeName() : name) { } @@ -279,8 +279,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool& NeedGradient() { return m_needGradient; } const bool& NeedGradient() const { return m_needGradient; } - void SetReqMultiSeqHandlingTo(const bool v) { m_reqMultiSeqHandling = v; } - bool ReqMultiSeqHandling() const { return m_reqMultiSeqHandling; } + void SetMaskMissingColumnsToZero() { m_maskMissingColumnsToZero = true; } + bool NeedToMaskMissingColumnsToZero() const { return m_maskMissingColumnsToZero; } void InitRecurrentNode() // this initialization says that this node is not inside a loop { @@ -624,10 +624,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { DEVICEID_TYPE m_deviceId; //CPU=-1, >=0 GPU bool m_needGradient; //only used for leaf, i.e., learnable parameters, etc. - bool m_reqMultiSeqHandling; // indicates whether the results of operation should be masked to handle the cases that the utterances have different lengths when grouped together as a minibatch. + bool m_maskMissingColumnsToZero; // indicates whether the results of operation should be masked to handle the cases that the utterances have different lengths when grouped together as a minibatch. // ^^ This decides whether the node gets passed the full layout with flags or only the one without flags // and this is only ever tested in MaskMissingColumnsToZero(), of which two versions exist, one in ComputationNode and one in ClassBasedCrossEntropyWithSoftmaxNode - // TODO: rename this to reflect that it affects only masking + // Pertinent reduction operations (criterion nodes and gradient computation) always perform masking. + // Hence, this flag is only needed for special use cases where regular matrix ops are used for a 'reduce' operation. size_t m_inputWidth, m_inputHeight, m_inputChannels; //how to interpret each column in the input as an image size_t m_outputWidth, m_outputHeight, m_outputChannels; //how to interpret each column in the output as an image @@ -861,6 +862,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // However, nodes that 'reduce' minibatches (e.g. computing the sum of all frames across all sequences) must deal with the garbage. // This function sets those to 0, assuming that now they can be reduced without affecting the result. // This function can operate on the whole range or on a selected single frame and/or a single sequence. + // It is indirectly guarded by the m_maskMissingColumnsToZero flag, which, if false, will install a layout with IsAllNone() to be true. TODO: we better always install the same layout, and instead test m_maskMissingColumnsToZero here. + // Note that existing 'reduce' style operations--the criterion nodes and gradient computation--already call this. 
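The map-versus-reduce distinction in the comment above is easy to see with invented numbers: an elementwise ('map') operation may harmlessly leave garbage in a gap column, but any reduction must see zeros there. A tiny self-contained illustration (all values made up):

#include <cstdio>

int main()
{
    // One row of a packed minibatch: two sequences (lengths 2 and 1) plus one gap column.
    double row[4]   = { 1.0, 2.0, 3.0, 7.5 };   // 7.5 is garbage left in the gap
    bool   isGap[4] = { false, false, false, true };

    double sum = 0;
    for (int j = 0; j < 4; j++)
        sum += isGap[j] ? 0.0 : row[j];         // masked reduce: 6.0, not 13.5
    std::printf("sum = %g\n", sum);
    return 0;
}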
bool MaskMissingColumnsToZero(Matrix& matrixToBeMasked, size_t timeIdxInSeq = SIZE_MAX, size_t seqIndex = SIZE_MAX) const { bool processedExistsNoLabelorFeatureMissing = false; /// set to true if either nolabel or feature missing is processed @@ -1200,7 +1203,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { node->m_functionValues = m_functionValues; node->m_gradientValues = m_gradientValues; - node->m_reqMultiSeqHandling = m_reqMultiSeqHandling; + node->m_maskMissingColumnsToZero = m_maskMissingColumnsToZero; } } @@ -1297,8 +1300,8 @@ protected: \ using Base::m_visitedOrder; using Base::m_index; using Base::m_lowLink; using Base::m_visited; using Base::m_inStack; \ using Base::m_indexInLoop; \ using Base::m_pMBLayout; \ - using Base::m_reqMultiSeqHandling; using Base::UseCustomizedMultiSeqHandling; using Base::GetNumParallelSequences; \ - using Base::DataSlice; using Base::VALUE; using Base::GRADIENT; \ + using Base::m_maskMissingColumnsToZero; using Base::UseCustomizedMultiSeqHandling; using Base::GetNumParallelSequences; \ + using Base::DataSlice; using Base::ValueSlice; using Base::GradientSlice; using Base::SetMaskMissingColumnsToZero; \ using Base::m_children; using Base::m_deviceId; using Base::m_evalTimeStamp; using Base::m_functionValues; using Base::m_gradientValues; \ using Base::m_inputChannels; using Base::m_inputHeight; using Base::m_inputWidth; using Base::m_needGradient; using Base::m_nodeName; \ using Base::m_outputChannels; using Base::m_outputHeight; using Base::m_outputWidth; using Base::s_constOnes; using Base::s_timeStampCounter; \ diff --git a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h index 8913a71f9f94..68a31eec8a61 100644 --- a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h @@ -530,7 +530,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { typedef ComputationNode Base; UsingComputationNodeMembers; void Init(size_t row_size, size_t col_size) { - SetReqMultiSeqHandlingTo(true); + SetMaskMissingColumnsToZero(); m_functionValues.Resize(row_size, col_size); } public: diff --git a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h index cba78b3c271d..bd81077a2ea0 100644 --- a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h @@ -36,7 +36,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { private: void Init(size_t row_size, size_t col_size, ElemType initialActivationValue = (ElemType)DEFAULT_HIDDEN_ACTIVATION) { - SetReqMultiSeqHandlingTo(true); + SetMaskMissingColumnsToZero(); m_initialActivationValue = initialActivationValue; m_timeStep = 1; m_functionValues.Resize(row_size, col_size); @@ -517,7 +517,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_use_errors_from_future_minibatch(false), m_DefaultState((ElemType)DEFAULT_HIDDEN_ACTIVATION) { - SetReqMultiSeqHandlingTo(true); + SetMaskMissingColumnsToZero(); } virtual const std::wstring OperationName() const { return TypeName(); } From a0c4aa9001e4928aaaf6500047696e0549ea7b77 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Mon, 21 Sep 2015 17:27:33 -0700 Subject: [PATCH 36/44] renamed UseCustomizedMultiSeqHandling() to NodeDoesItsOwnCustomizedMissingColumnsMasking() for clarity--this stuff will eventually go away once we allow inconsistent layouts --- 
.../CompositeComputationNodes.h | 2 +- .../ComputationNode.h | 14 +++++++------- .../EvaluationCriterionNodes.h | 2 +- .../InputAndParamNodes.h | 2 +- .../CNTKComputationNetworkLib/RecurrentNodes.h | 2 +- .../TrainingCriterionNodes.h | 18 +++++++++--------- 6 files changed, 20 insertions(+), 20 deletions(-) diff --git a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h index 077daa841cbe..8469e795dfe0 100644 --- a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h @@ -1068,7 +1068,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } protected: - virtual bool UseCustomizedMultiSeqHandling() + virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; } diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index 887da5e18abe..bc111a6d0aa9 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -810,7 +810,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { EvaluateThisNode(); // this is a call to the virtual function that implements the actual operation - if (!UseCustomizedMultiSeqHandling()) // this means the node does it by itself; if not, we do it for the node + if (!NodeDoesItsOwnCustomizedMissingColumnsMasking()) // this means the node does it by itself; if not, we do it for the node MaskMissingColumnsToZero(m_functionValues); } @@ -821,7 +821,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { EvaluateThisNode(FrameRange(timeIdxInSeq, GetNumParallelSequences())); - if (!UseCustomizedMultiSeqHandling()) + if (!NodeDoesItsOwnCustomizedMissingColumnsMasking()) MaskMissingColumnsToZero(m_functionValues, timeIdxInSeq); } @@ -1059,7 +1059,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t i=0; i&) {} // indicatess whether special handling is needed.The standard handleing will be just mask the function values after the evalaution and mask the gradient before gradiant computation for the children. this is not valid for all criterion nodes whose result is a scalar. 
- // defined by training/eval criteria (and the soon-to-be-deprecated PairNode, LSTMNode) - virtual bool UseCustomizedMultiSeqHandling() { return false; } + // overridden to return true by training/eval criteria (and the soon-to-be-deprecated PairNode, LSTMNode) + virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return false; } protected: @@ -1300,7 +1300,7 @@ protected: \ using Base::m_visitedOrder; using Base::m_index; using Base::m_lowLink; using Base::m_visited; using Base::m_inStack; \ using Base::m_indexInLoop; \ using Base::m_pMBLayout; \ - using Base::m_maskMissingColumnsToZero; using Base::UseCustomizedMultiSeqHandling; using Base::GetNumParallelSequences; \ + using Base::m_maskMissingColumnsToZero; using Base::NodeDoesItsOwnCustomizedMissingColumnsMasking; using Base::GetNumParallelSequences; \ using Base::DataSlice; using Base::ValueSlice; using Base::GradientSlice; using Base::SetMaskMissingColumnsToZero; \ using Base::m_children; using Base::m_deviceId; using Base::m_evalTimeStamp; using Base::m_functionValues; using Base::m_gradientValues; \ using Base::m_inputChannels; using Base::m_inputHeight; using Base::m_inputWidth; using Base::m_needGradient; using Base::m_nodeName; \ diff --git a/MachineLearning/CNTKComputationNetworkLib/EvaluationCriterionNodes.h b/MachineLearning/CNTKComputationNetworkLib/EvaluationCriterionNodes.h index a765ca6d4a8c..c44a91986ce0 100644 --- a/MachineLearning/CNTKComputationNetworkLib/EvaluationCriterionNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/EvaluationCriterionNodes.h @@ -141,7 +141,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } protected: - virtual bool UseCustomizedMultiSeqHandling() { return true; } + virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; } private: Matrix m_maxIndexes0, m_maxIndexes1; diff --git a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h index 68a31eec8a61..eed17fbc8c28 100644 --- a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h @@ -634,7 +634,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { static const std::wstring TypeName() { return L"PairNetwork"; } protected: - virtual bool UseCustomizedMultiSeqHandling() { return true; } + virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; } }; diff --git a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h index bd81077a2ea0..3a3aceeb291c 100644 --- a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h @@ -1571,7 +1571,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } protected: - virtual bool UseCustomizedMultiSeqHandling() { return true; } + virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; } protected: size_t m_inputDim; diff --git a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h index a13126e33628..5be3f0cc42f1 100644 --- a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h @@ -138,7 +138,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } protected: - virtual bool UseCustomizedMultiSeqHandling() { return true; } + virtual bool 
NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; } private: Matrix m_leftMinusRight; }; @@ -311,7 +311,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } protected: - virtual bool UseCustomizedMultiSeqHandling() { return true; } + virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; } protected: Matrix m_logSoftmaxOfRight; Matrix m_softmaxOfRight; @@ -461,7 +461,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } protected: - virtual bool UseCustomizedMultiSeqHandling() { return true; } + virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; } private: // matrix value passed from evaluate to computePartial Matrix m_logOfRight; @@ -561,7 +561,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } protected: - virtual bool UseCustomizedMultiSeqHandling() { return true; } + virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; } private: Matrix m_gradientOfL1Norm; // temporary }; @@ -646,7 +646,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_temp.TransferToDeviceIfNotThereAndNotAutoPlace(deviceId, true); } protected: - virtual bool UseCustomizedMultiSeqHandling() { return true; } + virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; } private: Matrix m_temp; }; @@ -825,7 +825,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_grdToSoftMaxInput.TransferToDeviceIfNotThereAndNotAutoPlace(deviceId, true); } protected: - virtual bool UseCustomizedMultiSeqHandling() { return true; } + virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; } protected: Matrix m_logSoftmax; Matrix m_softMax; @@ -1176,7 +1176,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_grdToSoftMaxInput.TransferToDeviceIfNotThereAndNotAutoPlace(deviceId, true); } protected: - virtual bool UseCustomizedMultiSeqHandling() { return true; } + virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; } protected: Matrix m_logSoftmax; Matrix m_softMax; @@ -1494,7 +1494,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } protected: - virtual bool UseCustomizedMultiSeqHandling() { return true; } + virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; } private: Matrix mAlpha; // TODO: m_Alpha etc. 
Matrix mBeta;
@@ -1600,7 +1600,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 m_children[2] = prediction;
 }
 protected:
- virtual bool UseCustomizedMultiSeqHandling() { return true; }
+ virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; }
 };

 template class DummyCriterionNode;

From 9f4fcf036526db87cc0c029b020db1fe63a23abf Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Mon, 21 Sep 2015 17:41:14 -0700
Subject: [PATCH 37/44] included test for NeedToMaskMissingColumnsToZero() into the calls to MaskMissingColumnsToZero() that would not do anything if this flag is set; renamed ResetBound() to SetMBLayout()

---
 .../ComputationNetwork.h | 6 +++---
 .../CNTKComputationNetworkLib/ComputationNode.h | 16 ++++++++--------
 .../CNTKComputationNetworkLib/RecurrentNodes.h | 6 +++---
 .../TrainingCriterionNodes.h | 6 +++---
 4 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
index 7606784006ed..38f8e72cfc1d 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
@@ -564,12 +564,12 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
 {
 // TODO: we should just always set the real layout; the nodes themselves should know to ignore it based on NeedToMaskMissingColumnsToZero()
 // MaskMissingColumnsToZero() will test whether the layout is all none, and then skip.
- // This is the only place where ResetBound() is ever called on a node. Hence, we could test NeedToMaskMissingColumnsToZero() instead.
+ // This is the only place where SetMBLayout() is ever called on a node. Hence, we could test NeedToMaskMissingColumnsToZero() instead.
 // Note that NeedToMaskMissingColumnsToZero() is true only where it is necessary; that is, most nodes have it set to false (since most nodes can just map garbage-in-garbage-out).
if ((*nodeIter)->NeedToMaskMissingColumnsToZero()) - (*nodeIter)->ResetBound(m_pMBLayout); + (*nodeIter)->SetMBLayout(m_pMBLayout); else - (*nodeIter)->ResetBound(m_pMBNoLayout); + (*nodeIter)->SetMBLayout(m_pMBNoLayout); (*nodeIter)->VerifyNumParallelSequences(GetNumParallelSequences()); } diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index bc111a6d0aa9..0f041f709972 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -166,7 +166,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void SetFunctionAndGradientSize(const int numSamples) = 0; - virtual void ResetBound(MBLayoutPtr pMBLayout) + virtual void SetMBLayout(MBLayoutPtr pMBLayout) { assert(pMBLayout->GetNumTimeSteps() == pMBLayout->GetSize()); // TODO: move this check into MBLayout m_pMBLayout = pMBLayout; @@ -810,7 +810,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { EvaluateThisNode(); // this is a call to the virtual function that implements the actual operation - if (!NodeDoesItsOwnCustomizedMissingColumnsMasking()) // this means the node does it by itself; if not, we do it for the node + if (NeedToMaskMissingColumnsToZero() && !NodeDoesItsOwnCustomizedMissingColumnsMasking()) // this means the node does it by itself; if not, we do it for the node MaskMissingColumnsToZero(m_functionValues); } @@ -821,7 +821,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { EvaluateThisNode(FrameRange(timeIdxInSeq, GetNumParallelSequences())); - if (!NodeDoesItsOwnCustomizedMissingColumnsMasking()) + if (NeedToMaskMissingColumnsToZero() && !NodeDoesItsOwnCustomizedMissingColumnsMasking()) MaskMissingColumnsToZero(m_functionValues, timeIdxInSeq); } @@ -866,7 +866,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Note that existing 'reduce' style operations--the criterion nodes and gradient computation--already call this. bool MaskMissingColumnsToZero(Matrix& matrixToBeMasked, size_t timeIdxInSeq = SIZE_MAX, size_t seqIndex = SIZE_MAX) const { - bool processedExistsNoLabelorFeatureMissing = false; /// set to true if either nolabel or feature missing is processed + bool foundLabelOrFeatureMissing = false; /// set to true if either nolabel or feature missing is processed if (!m_pMBLayout->IsAllNone()) { @@ -889,12 +889,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t id = startS; id < endS; id++) if (m_pMBLayout->Is(id, t, MinibatchPackingFlags::NoLabel | MinibatchPackingFlags::NoFeature)) matrixToBeMasked.ColumnSlice(t * nS + id, 1).SetValue(0); - processedExistsNoLabelorFeatureMissing = true; + foundLabelOrFeatureMissing = true; } } } - return processedExistsNoLabelorFeatureMissing; + return foundLabelOrFeatureMissing; } /* @@ -1059,7 +1059,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (size_t i=0; iSet(0, 1, MinibatchPackingFlags::SequenceStart); // TODO: strange--start at frame[1] instead of [0]? 
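As background for the Set()/Is() calls appearing here, a compact sketch of the two-level flag bookkeeping they imply: a per-(stream, time) grid plus a per-timestep aggregate that Set() keeps in sync, so Is(t, f) can cheaply answer whether any stream carries f at time t. The data layout below is an assumption inferred from the signatures in this series, not the actual MBLayout:

#include <cstddef>
#include <vector>

enum class PackFlags : char { None = 0, SequenceStart = 1, SequenceEnd = 2, NoFeature = 4, NoLabel = 8 };

struct LayoutSketch
{
    size_t nS, nT;
    std::vector<char> perColumn;   // flags for column (s, t), stored at [s + t * nS]
    std::vector<char> perTimeStep; // aggregate per t: OR over all streams

    LayoutSketch(size_t s, size_t t) : nS(s), nT(t), perColumn(s * t, 0), perTimeStep(t, 0) {}

    void Set(size_t s, size_t t, PackFlags f)
    {
        perColumn[s + t * nS] |= (char)f;  // per-stream flag
        perTimeStep[t] |= (char)f;         // keep the aggregate consistent
    }
    bool Is(size_t t, PackFlags f) const { return (perTimeStep[t] & (char)f) != 0; }
    bool Is(size_t s, size_t t, PackFlags f) const { return (perColumn[s + t * nS] & (char)f) != 0; }
};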
- Base::ResetBound(pMBLayout); + Base::SetMBLayout(pMBLayout); f0 = Inputs(0)->FunctionValues(); f1 = Inputs(1)->FunctionValues(); diff --git a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h index 5be3f0cc42f1..36017ca63f2f 100644 --- a/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/TrainingCriterionNodes.h @@ -1098,7 +1098,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t id = j % nS; // this is the stream return Base::MaskMissingColumnsToZero(matrixToBeMasked, t, id); #if 0 // old version prior to merging with Base version - bool processedExistsNoLabelorFeatureMissing = false; /// set to true if either nolabel or feature missing is processed + bool foundLabelOrFeatureMissing = false; /// set to true if either nolabel or feature missing is processed if (!m_pMBLayout->IsAllNone()) { @@ -1107,12 +1107,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (m_pMBLayout->Is(id, t, MinibatchPackingFlags::NoLabel)) { matrixToBeMasked.ColumnSlice(t * nS + id,1).SetValue(0); - processedExistsNoLabelorFeatureMissing = true; + foundLabelOrFeatureMissing = true; } } } - return processedExistsNoLabelorFeatureMissing; + return foundLabelOrFeatureMissing; #endif } From cde6220292702e848fbb6484e01207dfc80f6224 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Mon, 21 Sep 2015 18:11:39 -0700 Subject: [PATCH 38/44] (comment) --- MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h index 38f8e72cfc1d..60ab38482e88 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h @@ -1371,6 +1371,8 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb }; // note: this is called to write into our existing MBLayout instance + // TODO: This is broken. Instead, we should pass this from the reader, or better, do batching inside here. + // The problem is that we cannot post-process. E.g. is the layout guaranteed to reflect the minibatch size, in the case of no recurrence?? const MBLayoutPtr & GetMBLayoutPtr() { return m_pMBLayout; } protected: From a2f111a88b342eec9a4b359f32d2fd51ff15233e Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Tue, 22 Sep 2015 10:29:39 -0700 Subject: [PATCH 39/44] made ComputeInputPartialSRP() and EvaluateThisNodeSRP() class members and reduced the parameter list; added a comment that analyzes whether we need that secondary pMBNoLayout--answer is no --- Common/Include/basetypes.h | 22 +++++++ .../RecurrentNodes.h | 64 +++++++++---------- 2 files changed, 53 insertions(+), 33 deletions(-) diff --git a/Common/Include/basetypes.h b/Common/Include/basetypes.h index bfe8a64d1add..013a217ecb10 100644 --- a/Common/Include/basetypes.h +++ b/Common/Include/basetypes.h @@ -1054,6 +1054,28 @@ class RegisterModule // why is this in basetypes.h? // boundary flags for a frame +// (note for refactoring:) This is currently used by +// - RecurrentNodes: SetMBLayout(), ComputeInputPartialSRP(), EvaluateThisNodeSRP(), .. 
check SentenceBegin_or_End +// (plus PastValueNode and FutureValueNode base class template parameter) +// - SimpleEvaluator.h: FindBestPath(), FindBestPathWithVariableLength() --doing a bad hack, pretending MBs of 1 frame +// deprecated: +// - LSTMNode +// through ComputationNode::MaskMissingColumnsToZero(): +// - nodes where the user explicitly requested masking (NeedToMaskMissingColumnsToZero() == true) +// - ComputeGradientForChildren() +// - all training and evaluation criterion nodes .. TODO: double-confirm it's all Training nodes; but those also have NodeDoesItsOwnCustomizedMissingColumnsMasking() == true +// in core classes: +// - ComputationNetwork: GetNumSamplesWithLabel(), MaskMissingColumnsToZero() --both are cheap in case of no flags set +// - Matrix.h +// - SGD::DecimateMinibatchWithSentences() (should be done differently) +// and readers that generate the flags: +// - HTKMLFReader::GetMinibatchToTrainOrTest() +// - BatchLUSequenceReader::EnsureDataAvailable(), GetMinibatch(), DataEnd() +// - EvalReader::CopyMBLayoutTo() +// - BatchSequenceReader::SetSentenceBegin() +// others: +// - MathPerformanceTests.cpp +// ==> conclusion: safe to ALWAYS pass the full layout, will not be inefficient enum class MinibatchPackingFlags : char // (note: not using unsigned char because these go into a matrix, and we use Matrix, since we use it as a data holder) { None = 0, diff --git a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h index 2999806394e1..811dbe03f7ec 100644 --- a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h +++ b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h @@ -161,16 +161,20 @@ namespace Microsoft { namespace MSR { namespace CNTK { InvalidArgument("PastValue and FutureValue operations only take one input."); assert(m_functionValues.GetNumRows() == GradientValues().GetNumRows()); - assert(m_pMBLayout); - const auto colBoundaryFlags = m_pShiftedMBLayout->GetFrame(frameRange.t()); - ComputeInputPartialSRP(frameRange, m_timeStep, Inputs(0)->GradientValues(), GradientValues(), colBoundaryFlags.first, colBoundaryFlags.second); + ComputeInputPartialRP(frameRange); } - static void WINAPI ComputeInputPartialSRP(const FrameRange & frameRange, int timeStep, - Matrix& inputGradientValues, const Matrix& gradientValues, - const Matrix& colBoundaryFlags, MinibatchPackingFlags minibatchPackingFlags) + void ComputeInputPartialRP(const FrameRange & frameRange) { + // this is the result of refactoring; feel free to clean up further: + int timeStep = m_timeStep; + Matrix& inputGradientValues = Inputs(0)->GradientValues(); + const Matrix& gradientValues = GradientValues(); + const auto frameLayout = m_pShiftedMBLayout->GetFrame(frameRange.t()); + const Matrix& colBoundaryFlags = frameLayout.first; + const MinibatchPackingFlags & minibatchPackingFlags = frameLayout.second; + size_t timeIdxInSeq = frameRange.t(); size_t mNbr = frameRange.NumCols(); assert(timeIdxInSeq >= 0); @@ -206,10 +210,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { // this one differs in the starting condition virtual void /*ComputationNode::*/EvaluateThisNode(const FrameRange & frameRange) = 0; - void EvaluateThisNodeSRP(const FrameRange & frameRange, const int timeStep, - Matrix& functionValues, const Matrix& delayedActivation, const Matrix& inputFunctionValues, - const ElemType & initStateValue, const Matrix & colBoundaryFlags, const MinibatchPackingFlags minibatchPackingFlags) + void EvaluateThisNodeRP(const 
     {
+        // this is the result of refactoring; feel free to clean up further
+        const int timeStep = m_timeStep;
+        Matrix<ElemType>& functionValues = m_functionValues;
+        const Matrix<ElemType>& delayedActivation = m_delayedActivation;
+        const Matrix<ElemType>& inputFunctionValues = Inputs(0)->FunctionValues();
+        const ElemType & initStateValue = m_initialActivationValue;
+        const auto colBoundaryFlags1 = m_pShiftedMBLayout->GetFrame(frameRange.t());
+        const Matrix<float> & colBoundaryFlags = colBoundaryFlags1.first;
+        const MinibatchPackingFlags & minibatchPackingFlags = colBoundaryFlags1.second;
+
         size_t timeIdxInSeq = frameRange.t();
         size_t mNbr = frameRange.NumCols();
         assert(timeStep > 0);
@@ -346,7 +358,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 #define UsingDelayedValueNodeMembers UsingComputationNodeMembers; \
     using Base::m_initialActivationValue; using Base::m_delayedActivation; using Base::m_timeStep; \
     using Base::m_pShiftedMBLayout; using Base::m_historyAlreadySet; \
-    using Base::ComputeInputPartialSRP; using Base::EvaluateThisNodeSRP
+    using Base::ComputeInputPartialRP; using Base::EvaluateThisNodeRP

     // =======================================================================
     // PastValueNode -- delay node
     // =======================================================================
@@ -375,12 +387,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             InvalidArgument("PastValue and FutureValue operations only take one input.");

         int nbrSamples = GradientValues().GetNumCols() / GetNumParallelSequences();
+        // TODO: call the looping version below to avoid code dup
         for (int timeIdxInSeq = nbrSamples - 1; timeIdxInSeq >= 0; timeIdxInSeq--)
-        {
-            // TODO: call the looping version below to avoid code dup
-            const auto colBoundaryFlags = m_pShiftedMBLayout->GetFrame(timeIdxInSeq);
-            ComputeInputPartialSRP(FrameRange(timeIdxInSeq, GetNumParallelSequences()), m_timeStep, Inputs(0)->GradientValues(), GradientValues(), colBoundaryFlags.first, colBoundaryFlags.second);
-        }
+            ComputeInputPartialRP(FrameRange(timeIdxInSeq, GetNumParallelSequences()));
     }

     // TODO: why is this loop not in the underlying execution engine? This node should not have to know about this.
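(note on the semantics being refactored here: within one sequence, PastValue computes y[t] = x[t - timeStep], substituting initStateValue wherever the read would cross the sequence start. Below is a minimal stand-alone sketch of just that rule -- illustrative code only, not CNTK API; it ignores the packed multi-sequence layout, the m_delayedActivation carry-over between minibatches, and the boundary-flag masking:)

    #include <vector>

    // y[t] = x[t - timeStep] inside the sequence; positions whose source index
    // would fall before the sequence start read initStateValue instead
    std::vector<float> PastValue(const std::vector<float>& x, int timeStep, float initStateValue)
    {
        std::vector<float> y(x.size());
        for (int t = 0; t < (int)x.size(); t++)
            y[t] = (t >= timeStep) ? x[t - timeStep] : initStateValue;
        return y;
    }

(FutureValue is the mirror image, y[t] = x[t + timeStep], with initStateValue at positions that would read past the sequence end.)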
@@ -389,12 +398,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         assert(m_timeStep > 0);

         int nbrSamples = Inputs(0)->FunctionValues().GetNumCols() / GetNumParallelSequences();
+        // TODO: call the looping version below to avoid code dup
         for (int timeIdxInSeq = 0; timeIdxInSeq < nbrSamples; timeIdxInSeq++)
-        {
-            // TODO: call the looping version below to avoid code dup
-            const auto colBoundaryFlags = m_pShiftedMBLayout->GetFrame(timeIdxInSeq);
-            EvaluateThisNodeSRP(FrameRange(timeIdxInSeq, GetNumParallelSequences()), m_timeStep, m_functionValues, m_delayedActivation, Inputs(0)->FunctionValues(), m_initialActivationValue, colBoundaryFlags.first, colBoundaryFlags.second);
-        }
+            EvaluateThisNodeRP(FrameRange(timeIdxInSeq, GetNumParallelSequences()));

         //set the past activity to be used by next minibatch
         m_delayedActivation = Inputs(0)->FunctionValues();
@@ -409,8 +415,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         if (frameRange.t() == 0 && m_historyAlreadySet == false)
             m_delayedActivation = Inputs(0)->FunctionValues();

-        const auto colBoundaryFlags = m_pShiftedMBLayout->GetFrame(frameRange.t());
-        EvaluateThisNodeSRP(frameRange, m_timeStep, m_functionValues, m_delayedActivation, Inputs(0)->FunctionValues(), m_initialActivationValue, colBoundaryFlags.first, colBoundaryFlags.second);
+        EvaluateThisNodeRP(frameRange);
     }
 };
@@ -445,12 +450,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         InvalidArgument("PastValue and FutureValue operations only take one input.");

         int nbrSamples = GradientValues().GetNumCols() / GetNumParallelSequences();
+        // TODO: call the looping version below to avoid code dup
         for (int timeIdxInSeq = 0; timeIdxInSeq < nbrSamples; timeIdxInSeq++)
-        {
-            // TODO: call the looping version below to avoid code dup
-            const auto colBoundaryFlags = m_pShiftedMBLayout->GetFrame(timeIdxInSeq);
-            ComputeInputPartialSRP(FrameRange(timeIdxInSeq, GetNumParallelSequences()), m_timeStep, Inputs(0)->GradientValues(), GradientValues(), colBoundaryFlags.first, colBoundaryFlags.second);
-        }
+            ComputeInputPartialRP(FrameRange(timeIdxInSeq, GetNumParallelSequences()));
     }

     virtual void EvaluateThisNode()
@@ -459,10 +461,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         int nbrSamples = Inputs(0)->FunctionValues().GetNumCols() / GetNumParallelSequences();
         for (int timeIdxInSeq = nbrSamples - 1; timeIdxInSeq >= 0; timeIdxInSeq--)
-        {
-            const auto colBoundaryFlags = m_pShiftedMBLayout->GetFrame(timeIdxInSeq);
-            EvaluateThisNodeSRP(FrameRange(timeIdxInSeq, GetNumParallelSequences()), m_timeStep, m_functionValues, m_delayedActivation, Inputs(0)->FunctionValues(), m_initialActivationValue, colBoundaryFlags.first, colBoundaryFlags.second);
-        }
+            EvaluateThisNodeRP(FrameRange(timeIdxInSeq, GetNumParallelSequences()));

         //set the future activity to be used by next minibatch
         m_delayedActivation = Inputs(0)->FunctionValues();
@@ -475,8 +474,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         if (frameRange.t() == Inputs(0)->FunctionValues().GetNumCols() / GetNumParallelSequences() - 1)
             m_delayedActivation = Inputs(0)->FunctionValues();

-        const auto colBoundaryFlags = m_pShiftedMBLayout->GetFrame(frameRange.t());
-        EvaluateThisNodeSRP(frameRange, m_timeStep, m_functionValues, m_delayedActivation, Inputs(0)->FunctionValues(), m_initialActivationValue, colBoundaryFlags.first, colBoundaryFlags.second);
+        EvaluateThisNodeRP(frameRange);
     }
 };

From c86a0f12c3bf6b188b53f8ceab320965b4400534 Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Tue, 22 Sep 2015 11:00:50 -0700
Subject: [PATCH 40/44] removed pMBNoLayout, Network now passes the same layout to all nodes (I checked, I think it should not cause inefficiencies)

---
 .../CNTKComputationNetworkLib/ComputationNetwork.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
index 60ab38482e88..037a860788dc 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
@@ -75,7 +75,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
     // -----------------------------------------------------------------------

     ComputationNetwork(DEVICEID_TYPE deviceId = AUTOPLACEMATRIX) :
-        m_deviceId(deviceId), m_pMBLayout(make_shared<MBLayout>()), m_pMBNoLayout(make_shared<MBLayout>())
+        m_deviceId(deviceId), m_pMBLayout(make_shared<MBLayout>())//, m_pMBNoLayout(make_shared<MBLayout>())
     {
         m_randomSeedOffset = 0;
         m_actualMBSize = 0;
@@ -539,7 +539,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
         // We have a matching layout structure that matches pMBLayout in number of sequences while not having any flags set.
         // This is used for nodes that do not need recurrent processing, but can be done in batch.
         // TODO: Does it harm if we have flags, for those that can be done in batch? I.e. why don't we just always provide flags?
-        m_pMBNoLayout->Resize(m_pMBLayout->GetNumParallelSequences(), 0);   // TODO: this is not nice, but we currently have no trigger to detect changes in layout
+        //m_pMBNoLayout->Resize(m_pMBLayout->GetNumParallelSequences(), 0);   // TODO: this is not nice, but we currently have no trigger to detect changes in layout

         // prepare to compute with the subnetwork that this rootNode depends on, including
         // - auto-detecting recurrent loops
@@ -566,10 +566,10 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
         // MaskMissingColumnsToZero() will test whether the layout is all none, and then skip.
         // This is the only place where SetMBLayout() is ever called on a node. Hence, we could test NeedToMaskMissingColumnsToZero() instead.
         // Note that NeedToMaskMissingColumnsToZero() is true only where it is necessary; that is, most nodes have it set to false (since most nodes can just map garbage-in-garbage-out).
-        if ((*nodeIter)->NeedToMaskMissingColumnsToZero())
+        //if ((*nodeIter)->NeedToMaskMissingColumnsToZero())
             (*nodeIter)->SetMBLayout(m_pMBLayout);
-        else
-            (*nodeIter)->SetMBLayout(m_pMBNoLayout);
+        //else
+        //    (*nodeIter)->SetMBLayout(m_pMBNoLayout);
         (*nodeIter)->VerifyNumParallelSequences(GetNumParallelSequences());
     }

From 8b67a801a030b98d88bece4bdb603e86332419ca Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Tue, 22 Sep 2015 11:30:13 -0700
Subject: [PATCH 41/44] SetRequestNodesMultiSeqHandling() changed so that, instead of fixing things for the user under the hood, it instead forces users to fix it on their side through runtime checks

---
 .../ComputationNetwork.cpp | 26 ++++++++++++++++---
 .../ComputationNode.h      |  8 +++---
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp
index 42c73d02263a..1a1cf70bc1f1 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp
@@ -312,6 +312,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         return false;
     }

+    // note: all of these have NodeDoesItsOwnCustomizedMissingColumnsMasking() returning true
     bool ComputationNetwork::IsTypicalCriterionNode(ComputationNodeBasePtr nodePtr)
     {
         if (nodePtr->OperationName() == OperationNameOf(SquareErrorNode) ||
@@ -336,23 +337,40 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         //SumElements node will generate a scalar value and so it should never require special handling
         //TransposeNode will change the size of columns and so it should also not be included for special handling
         //their child node should instead
+#if 0
         if (node->OperationName() != OperationNameOf(SumElementsNode) &&
             node->OperationName() != OperationNameOf(TransposeNode) &&
             node->OperationName() != OperationNameOf(MeanNode) &&
             node->OperationName() != OperationNameOf(InvStdDevNode)
             )
             node->SetMaskMissingColumnsToZero();
+#else
+        if (node->OperationName() == OperationNameOf(SumElementsNode) ||
+            node->OperationName() == OperationNameOf(TransposeNode) ||
+            node->OperationName() == OperationNameOf(MeanNode) ||
+            node->OperationName() == OperationNameOf(InvStdDevNode))
+        {
+            RuntimeError("SetRequestNodesMultiSeqHandling: NodesReqMultiSeqHandling cannot be used with operation '%ls'\nIn the past, CNTK silently fixed this; now please change your NDL instead", node->OperationName().c_str());
+        }
+        node->SetMaskMissingColumnsToZero();
+#endif
     }

-    //if a typical criterion node is used as the training criterion node we assume it requires multiseq handling
-    //this is for backward compatibility
+    // if a typical criterion node is used as the training criterion node we assume it requires multiseq handling
+    // this is for backward compatibility
+    // All of these have NodeDoesItsOwnCustomizedMissingColumnsMasking() return true, i.e. they will not have MaskMissingColumnsToZero() auto-called from Network.
+    // Hence, instead of setting the flag, we just ensure that this is true.
     for (auto & node : m_finalCriteria)
         if (IsTypicalCriterionNode(node))
-            node->SetMaskMissingColumnsToZero();
+            //node->SetMaskMissingColumnsToZero();
+            if (!node->NodeDoesItsOwnCustomizedMissingColumnsMasking())
+                LogicError("criterion %ls's NodeDoesItsOwnCustomizedMissingColumnsMasking() function must return true", node->OperationName().c_str());

     for (auto & node : m_evalNodes)
         if (IsTypicalCriterionNode(node))
-            node->SetMaskMissingColumnsToZero();
+            //node->SetMaskMissingColumnsToZero();
+            if (!node->NodeDoesItsOwnCustomizedMissingColumnsMasking())
+                LogicError("criterion %ls's NodeDoesItsOwnCustomizedMissingColumnsMasking() function must return true", node->OperationName().c_str());
 }

 template <typename N> void ComputationNetwork::GetNodesRequiringX(std::list<ComputationNodeBasePtr> & nodesRequirePreComputation, const ComputationNodeBasePtr rootNode, bool checkComputed)
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
index 0f041f709972..7750460f4602 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
@@ -222,6 +222,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         return m_pMBLayout->GetNumParallelSequences();
     }

+    // indicates whether special handling is needed. The standard handling is to just mask the function values after the evaluation and mask the gradient before gradient computation for the children; this is not valid for all criterion nodes whose result is a scalar.
+    // overridden to return true by training/eval criteria (and the soon-to-be-deprecated PairNode, LSTMNode)
+    virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return false; }
+
     int64_t UpdateEvalTimeStamp()
     {
         m_evalTimeStamp = atomic_fetch_add(&s_timeStampCounter, (unsigned long long int) 1);    // TODO: does this really need to be atomic? We are not multi-threaded
@@ -1224,10 +1228,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     virtual void GetErrorsToPreviousMinibatch(Matrix<ElemType>&) {}
     virtual void SetErrorsFromFutureMinibatch(Matrix<ElemType>&) {}

-    // indicatess whether special handling is needed.The standard handleing will be just mask the function values after the evalaution and mask the gradient before gradiant computation for the children. this is not valid for all criterion nodes whose result is a scalar.
-    // overridden to return true by training/eval criteria (and the soon-to-be-deprecated PairNode, LSTMNode)
-    virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return false; }
-
 protected:

     Matrix<ElemType> m_functionValues, m_gradientValues;

From 51fb96bdf9182205b7efbf21c56047ffef82e060 Mon Sep 17 00:00:00 2001
From: Amit Agarwal
Date: Tue, 22 Sep 2015 12:40:10 -0700
Subject: [PATCH 42/44] Use double precision rsqrt in FSAdagrad kernel when ElemType is double

---
 Math/Math/GPUMatrix.cu            | 14 +++++++++-----
 Math/Math/GPUMatrix.h             |  3 +--
 Math/Math/GPUMatrixCUDAKernels.cu | 11 ++++++++++-
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu
index cf3fbf4bdef6..4c525b92ba15 100755
--- a/Math/Math/GPUMatrix.cu
+++ b/Math/Math/GPUMatrix.cu
@@ -1278,23 +1278,27 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     }

     template<class ElemType>
-    void GPUMatrix<ElemType>::FSAdagrad(GPUMatrix<ElemType>& gradients, GPUMatrix<ElemType>& functionValues,
-                                        ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul)
+    void GPUMatrix<ElemType>::FSAdagrad(GPUMatrix<ElemType>& gradients,
+                                        GPUMatrix<ElemType>& functionValues,
+                                        ElemType learnRatePerSample,
+                                        ElemType momentum,
+                                        ElemType adaWeight,
+                                        ElemType adaMul)
     {
         size_t numColsNeeded = 2 * gradients.GetNumCols();

-        if (IsEmpty() || GetNumCols() < numColsNeeded)
+        if (IsEmpty() || (GetNumCols() < numColsNeeded))
         {
             Resize(gradients.GetNumRows(), numColsNeeded);
             SetValue(0.0);
         }

-        assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == numColsNeeded);
+        assert((GetNumRows() == gradients.GetNumRows()) && (GetNumCols() == numColsNeeded));

         size_t n = gradients.GetNumElements();
         int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
         _fsadagrad<<<blocksPerGrid, threadsPerBlock>>>(n, gradients.m_pArray, m_pArray, m_pArray + n, functionValues.m_pArray,
-                                                       learnRatePerSample, momentum, adaWeight, adaMul);
+                                                       learnRatePerSample, momentum, adaWeight, adaMul);
     }

     template<class ElemType>
diff --git a/Math/Math/GPUMatrix.h b/Math/Math/GPUMatrix.h
index b80900f95ad7..18a9ef0cb05e 100755
--- a/Math/Math/GPUMatrix.h
+++ b/Math/Math/GPUMatrix.h
@@ -124,8 +124,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         ElemType* BufferPointer() const {return m_pArray;}

         ElemType Adagrad(GPUMatrix<ElemType>& gradients, const bool needAveMultiplier);
-        void FSAdagrad(GPUMatrix<ElemType>& gradients, GPUMatrix<ElemType>& functionValues,
-                       ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul);
+        void FSAdagrad(GPUMatrix<ElemType>& gradients, GPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul);
         ElemType RmsProp(GPUMatrix<ElemType>& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier);

         void Reshape(const size_t numRows, const size_t numCols);
diff --git a/Math/Math/GPUMatrixCUDAKernels.cu b/Math/Math/GPUMatrixCUDAKernels.cu
index d45b9e006d87..073fa9d2e1ad 100755
--- a/Math/Math/GPUMatrixCUDAKernels.cu
+++ b/Math/Math/GPUMatrixCUDAKernels.cu
@@ -1123,7 +1123,16 @@ __global__ void _fsadagrad(CUDA_LONG size, ElemType* grad, ElemType* smoothAda,
         smoothAda[idx] = adaSqr;
         if (adaSqr != 0.0f)
         {
-            ElemType w = adaMul * rsqrtf(adaSqr);
+            ElemType w;
+            if (sizeof(ElemType) == sizeof(double))
+            {
+                w = adaMul * rsqrt(adaSqr);
+            }
+            else
+            {
+                w = adaMul * rsqrtf(adaSqr);
+            }
+
             if (w > 10.0f)
                 w = 10.0f;
             g *= w;

From dce8e659bf4182ada5b6fe48238ed09f0b78143f Mon Sep 17 00:00:00 2001
From: Amit Agarwal
Date: Tue, 22 Sep 2015 13:44:12 -0700
Subject: [PATCH 43/44] Added CPU support for FSAdagrad
---
 Math/Math/CPUMatrix.cpp | 51 +++++++++++++++++++++++++++++++++++++++++
 Math/Math/CPUMatrix.h   |  1 +
 Math/Math/Matrix.cpp    |  7 +++---
 3 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/Math/Math/CPUMatrix.cpp b/Math/Math/CPUMatrix.cpp
index 3bac9705dc7e..f5609d449db0 100644
--- a/Math/Math/CPUMatrix.cpp
+++ b/Math/Math/CPUMatrix.cpp
@@ -1117,6 +1117,57 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         return 1;
     }

+    template<class ElemType>
+    void CPUMatrix<ElemType>::FSAdagrad(CPUMatrix<ElemType>& gradients,
+                                        CPUMatrix<ElemType>& functionValues,
+                                        ElemType learnRatePerSample,
+                                        ElemType momentum,
+                                        ElemType adaWeight,
+                                        ElemType adaMul)
+    {
+        size_t numColsNeeded = 2 * gradients.GetNumCols();
+
+        if (IsEmpty() || (GetNumCols() < numColsNeeded))
+        {
+            Resize(gradients.GetNumRows(), numColsNeeded);
+            SetValue(0.0);
+        }
+
+        assert((GetNumRows() == gradients.GetNumRows()) && (GetNumCols() == numColsNeeded));
+
+        size_t n = gradients.GetNumElements();
+        ElemType* grad = gradients.m_pArray;
+        ElemType* smoothAda = m_pArray;
+        ElemType* smoothMom = m_pArray + n;
+        ElemType* val = functionValues.m_pArray;
+#pragma omp parallel for
+        // TODO: Unroll 4-times for better performance leveraging vectorization
+        for (long i = 0; i < n; i++)
+        {
+            ElemType g = grad[i];
+            ElemType adaSqr = adaWeight * smoothAda[i] + (1.0f - adaWeight) * g * g;
+            smoothAda[i] = adaSqr;
+            if (adaSqr != 0.0f)
+            {
+                ElemType ada = sqrt(adaSqr);
+                ElemType w = adaMul * ((ElemType)1.0 / ada);
+
+                if (w > 10.0f)
+                    w = 10.0f;
+                g *= w;
+            }
+
+            if (momentum > 0.0f)
+            {
+                g = momentum * smoothMom[i] + (1.0f - momentum) * g;
+                smoothMom[i] = g;
+            }
+
+            g *= learnRatePerSample;
+            val[i] -= g;
+        }
+    }
+
     template<class ElemType>
     ElemType CPUMatrix<ElemType>::RmsProp(CPUMatrix<ElemType>& gradients,
                                           ElemType RMS_GAMMA,
diff --git a/Math/Math/CPUMatrix.h b/Math/Math/CPUMatrix.h
index f9899ecc2936..11f980e44665 100644
--- a/Math/Math/CPUMatrix.h
+++ b/Math/Math/CPUMatrix.h
@@ -54,6 +54,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         CPUMatrix<ElemType>& SetColumnSlice(const CPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols);

         ElemType Adagrad(CPUMatrix<ElemType>& gradients, const bool needAveMultiplier);
+        void FSAdagrad(CPUMatrix<ElemType>& gradients, CPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul);
         ElemType RmsProp(CPUMatrix<ElemType>& gradients,
                          ElemType RMS_GAMMA,
                          ElemType RMS_WGT_INC,
                          ElemType RMS_WGT_MAX,
                          ElemType RMS_WGT_DEC,
                          ElemType RMS_WGT_MIN,
                          const bool needAveMultiplier);
diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp
index 131b68424e63..870bda9e6821 100644
--- a/Math/Math/Matrix.cpp
+++ b/Math/Math/Matrix.cpp
@@ -1324,18 +1324,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     template<class ElemType>
     void Matrix<ElemType>::FSAdagrad(size_t mbSize, Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, const ElemType learnRatePerSample, const ElemType momentum)
     {
-        // REVEW alexeyk: hardcoded for now, taken from DBN. Naming is the same as in DBN.
+        // TODO: The values of 'adagradT' and 'targetadagradavdenom' are currently hardcoded constants taken from DBN (empirically determined).
+        // These should be made configurable if needed
         const size_t adagradT = 2 * 3600 * 100;
+        const ElemType targetadagradavdenom = 0.0025; // 1/400 magic constant
         const ElemType adagradkeepweight = static_cast<ElemType>(exp(-1.0 * mbSize / adagradT));
-        const ElemType targetadagradavdenom = 0.0025; // 1/400 magic constant

         static ElemType aggadagradsqrframes = 0;
         aggadagradsqrframes = adagradkeepweight * aggadagradsqrframes + (1.0f - adagradkeepweight) * mbSize;
         const ElemType targetadagradavdenom_x_sqrtadagradsqrframes = static_cast<ElemType>(targetadagradavdenom * sqrt(aggadagradsqrframes));

         DISPATCH_MATRIX_ON_FLAG(&gradients,
             &gradients,
-            SetDataLocation(CPU),
+            m_CPUMatrix->FSAdagrad(*gradients.m_CPUMatrix, *functionValues.m_CPUMatrix, learnRatePerSample, momentum, adagradkeepweight, targetadagradavdenom_x_sqrtadagradsqrframes); SetDataLocation(CPU),
             m_GPUMatrix->FSAdagrad(*gradients.m_GPUMatrix, *functionValues.m_GPUMatrix, learnRatePerSample, momentum, adagradkeepweight, targetadagradavdenom_x_sqrtadagradsqrframes); SetDataLocation(GPU),
             NOT_IMPLEMENTED,
             NOT_IMPLEMENTED

From 7777c63919599920a1163fedcf947a0cee45daf5 Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Tue, 22 Sep 2015 16:34:53 -0700
Subject: [PATCH 44/44] added predefined macro RowStack() (was missing); fixed spelling RowStack::m_startRowIndeces -> m_startRowIndices

---
 .../CNTK/ExperimentalNetworkBuilder.cpp |  2 +-
 .../LinearAlgebraNodes.h                | 18 ++++++++++--------
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/MachineLearning/CNTK/ExperimentalNetworkBuilder.cpp b/MachineLearning/CNTK/ExperimentalNetworkBuilder.cpp
index 94eb70f816ed..e94212ce97e9 100644
--- a/MachineLearning/CNTK/ExperimentalNetworkBuilder.cpp
+++ b/MachineLearning/CNTK/ExperimentalNetworkBuilder.cpp
@@ -63,6 +63,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         L"FutureValue(rows, cols, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'FutureValue' ; inputs = input /*plus the function args*/ ]\n"
         L"RowSlice(startIndex, numRows, input, needGradient = false, tag='') = new ComputationNode [ operation = 'RowSlice' ; inputs = input /*plus the function args*/ ]\n"
         L"RowRepeat(input, numRepeats, needGradient = false, tag='') = new ComputationNode [ operation = 'RowRepeat' ; inputs = input /*plus the function args*/ ]\n"
+        L"RowStack(inputs, tag='') = new ComputationNode [ operation = 'RowStack' /*plus the function args*/ ]\n"
         L"Reshape(input, numRows, imageWidth = 0, imageHeight = 0, imageChannels = 0, tag='') = new ComputationNode [ operation = 'Reshape' ; inputs = input /*plus the function args*/ ]\n"
         L"ConvolutionNode(weightNode, inputValueNode, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, zeroPadding = false, maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode) /*plus the function args*/ ]\n"
         L"MaxPoolingNode(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, tag='') = new ComputationNode [ operation = 'MaxPooling' ; inputs = input /*plus the function args*/ ]\n"
@@ -114,7 +115,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         BinaryStandardNode(Plus, leftMatrix, rightMatrix)
         UnaryStandardNode(RectifiedLinear, z)
         //BinaryStandardNode(RowElementTimesNode)
-        //BinaryStandardNode(RowStackNode)
         BinaryStandardNode(Scale, scalarScalingFactor, matrix)
         //BinaryStandardNode(SequenceDecoderNode)
         UnaryStandardNode(Sigmoid, z)
diff --git a/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h b/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h
index a91ed0d81ff6..ddad05026836 100644
--- a/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/LinearAlgebraNodes.h
@@ -447,6 +447,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     // -----------------------------------------------------------------------

     //this node is used to extract part of the input by rows as the output
+    // TODO: Really? RowStack indicates something different.
     //it has to be contiguous segments of rows since each column is treated as one sample
     template<class ElemType>
     class RowStackNode : public ComputationNode<ElemType>
@@ -466,7 +467,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (flags & CopyNodeFlags::copyNodeChildren)
             {
                 node->m_children = m_children;
-                node->m_startRowIndeces = m_startRowIndeces;
+                node->m_startRowIndices = m_startRowIndices;
                 node->m_inputMatrices = m_inputMatrices;
             }
         }
@@ -478,7 +479,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         {
             if (inputIndex >= ChildrenSize())
                 InvalidArgument("RowStack-ComputeInputPartial: inputIndex out of range.");
-            ComputeInputPartialS(Inputs(inputIndex)->GradientValues(), GradientValues(), m_startRowIndeces[inputIndex], m_startRowIndeces[inputIndex + 1] - m_startRowIndeces[inputIndex]);
+            ComputeInputPartialS(Inputs(inputIndex)->GradientValues(), GradientValues(), m_startRowIndices[inputIndex], m_startRowIndices[inputIndex + 1] - m_startRowIndices[inputIndex]);
         }

         virtual void /*ComputationNode::*/ComputeInputPartial(const size_t inputIndex, const FrameRange & frameRange)
@@ -489,7 +490,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             Matrix<ElemType> sliceInputGrad = Inputs(inputIndex)->GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));
             Matrix<ElemType> sliceOutputGrad = GradientSlice(frameRange/*TODO: delete this:*/.Check(frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences(), m_pMBLayout));

-            ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_startRowIndeces[inputIndex], m_startRowIndeces[inputIndex+1] - m_startRowIndeces[inputIndex]);
+            ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_startRowIndices[inputIndex], m_startRowIndices[inputIndex+1] - m_startRowIndices[inputIndex]);
         }

         static void WINAPI ComputeInputPartialS(Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const size_t startIndex, const size_t numRows)
@@ -509,7 +510,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             EvaluateThisNodeS(sliceFunctionValues, m_inputMatrices, frameRange.t() * GetNumParallelSequences(), GetNumParallelSequences());
         }

-        static void WINAPI EvaluateThisNodeS(Matrix<ElemType>& functionValues, const std::vector<const Matrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols)
+        // TODO: change to FrameRange
+        void EvaluateThisNodeS(Matrix<ElemType>& functionValues, const std::vector<const Matrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols)
         {
             functionValues.AssignRowStackValuesOf(inputMatrices, sliceStartCol, sliceNumCols);
 #if NANCHECK
@@ -528,11 +530,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                     LogicError("RowStack operation: the input node is NULL.");

             size_t numCols = Inputs(0)->FunctionValues().GetNumCols();
-            m_startRowIndeces.resize(ChildrenSize()+1);
+            m_startRowIndices.resize(ChildrenSize()+1);
             m_inputMatrices.resize(ChildrenSize());

             size_t totalRows = 0;
-            m_startRowIndeces[0] = 0;
+            m_startRowIndices[0] = 0;

             for (int i = 0; i < ChildrenSize(); i++)
             {
@@ -549,7 +551,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 totalRows += numRows;
                 m_inputMatrices[i] = &childMatrix;

-                m_startRowIndeces[i + 1] = m_startRowIndeces[i] + numRows;
+                m_startRowIndices[i + 1] = m_startRowIndices[i] + numRows;
             }

             FunctionValues().Resize(totalRows, numCols);
@@ -575,7 +577,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }

     private:
-        std::vector<size_t> m_startRowIndeces;  //start row number in the stacked matrix of each input (child)
+        std::vector<size_t> m_startRowIndices;  //start row number in the stacked matrix of each input (child)
         std::vector<const Matrix<ElemType>*> m_inputMatrices;
     };
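(note on the bookkeeping set up in Validate() above: m_startRowIndices is a prefix sum over the children's row counts, so input i occupies output rows [m_startRowIndices[i], m_startRowIndices[i+1]). A minimal stand-alone sketch -- names below are illustrative, not CNTK code:)

    #include <cassert>
    #include <vector>

    // prefix sum over the per-child row counts; the final entry equals the
    // total number of rows of the stacked output
    std::vector<size_t> ComputeStartRowIndices(const std::vector<size_t>& childRows)
    {
        std::vector<size_t> startRowIndices(childRows.size() + 1, 0);
        for (size_t i = 0; i < childRows.size(); i++)
            startRowIndices[i + 1] = startRowIndices[i] + childRows[i];
        return startRowIndices;
    }

    int main()
    {
        // children with 3, 5, and 2 rows stack into a 10-row output;
        // the second child occupies rows [3, 8)
        auto idx = ComputeStartRowIndices({ 3, 5, 2 });
        assert(idx[1] == 3 && idx[2] == 8 && idx[3] == 10);
        return 0;
    }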