From 342b1ba460415fdd2354767a24ad4ced41da8005 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Wed, 20 Jan 2016 21:33:38 -0800 Subject: [PATCH] changed the mapping of tensors onto the Matrix storage objects: In case of no MBLayout, the Matrix objects now always have column dimension 1. Actual matrices (as used by TimesNode) are now 2D tensors. As part of this, GetNumCols()/-Rows() no longer exist, but were split into GetSampleMatrixCols()/-Rows() (minibatch interpretation) and GetAsMatrixCols()/-Rows() (2D matrix interpretation) --- Source/CNTK/ModelEditLanguage.cpp | 4 +- Source/CNTK/NDLUtil.h | 9 - Source/CNTK/SimpleNetworkBuilder.cpp | 16 +- Source/Common/Include/TensorShape.h | 14 + .../CompositeComputationNodes.h | 155 +++--- .../ComputationNetwork.cpp | 42 +- .../ComputationNetwork.h | 53 +-- .../ComputationNetworkEvaluation.cpp | 25 +- .../ComputationNetworkLib/ComputationNode.cpp | 99 ++-- .../ComputationNetworkLib/ComputationNode.h | 357 +++++++------- .../ConvolutionalNodes.h | 54 +-- Source/ComputationNetworkLib/EsotericNodes.h | 146 +++--- .../EvaluationCriterionNodes.h | 7 +- .../InputAndParamNodes.h | 117 ++--- .../LinearAlgebraNodes.h | 440 ++++++------------ .../ComputationNetworkLib/NonlinearityNodes.h | 70 ++- Source/ComputationNetworkLib/RecurrentNodes.h | 8 +- Source/ComputationNetworkLib/ReshapingNodes.h | 112 ++--- .../TrainingCriterionNodes.h | 176 ++++--- Source/EvalDll/CNTKEval.cpp | 6 +- Source/SGDLib/DataReaderHelpers.h | 80 ++-- Source/SGDLib/MultiNetworksEvaluator.h | 8 +- Source/SGDLib/MultiNetworksSGD.h | 8 +- Source/SGDLib/SGD.cpp | 14 +- Source/SGDLib/SimpleEvaluator.h | 3 +- .../Speech/README_Windows_Debug_commands.txt | 6 +- 26 files changed, 887 insertions(+), 1142 deletions(-) diff --git a/Source/CNTK/ModelEditLanguage.cpp b/Source/CNTK/ModelEditLanguage.cpp index a8c2ddc4e72a..35b075102638 100644 --- a/Source/CNTK/ModelEditLanguage.cpp +++ b/Source/CNTK/ModelEditLanguage.cpp @@ -138,12 +138,12 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa std::wstring modelFormat = GetOptionalModelFormat(params, numFixedParams); auto cn = make_shared(CPUDEVICE); -#if 1 // support for a specific kind of legacy format, for the sole purpose of allowing users to convert (=load & save) them +#if 1 // support for a specific kind of legacy format, for the sole purpose of allowing users to convert (=load & save) them if (modelFormat == L"cntk_legacy_no_tensorlib") { cn->Read(params[1]); for (auto node : cn->FeatureNodes()) - node->SetDims(TensorShape(node->GetNumRows()), 0); // pre-tensorlib InputValues had incorrect tensor dimensions + node->SetDims(TensorShape(node->GetSampleMatrixNumRows()), node->HasMBLayout()); // pre-tensorlib InputValues had incorrect tensor dimensions cn->CompileNetwork(); } else diff --git a/Source/CNTK/NDLUtil.h b/Source/CNTK/NDLUtil.h index c0c532f801fe..1a9060362995 100644 --- a/Source/CNTK/NDLUtil.h +++ b/Source/CNTK/NDLUtil.h @@ -31,12 +31,6 @@ class NDLUtil { } - // FixupInputMinibatchSize - go through all the inputs and make sure they have a consistent minibatch size - void FixupInputMinibatchSize() - { - m_net->FixupInputMinibatchSize(); - } - // ProcessNDLConfig - Process the NDL script from a configuration string value // config - configuration string containing script void ProcessNDLConfig(const ConfigValue& config, bool fullValidate = false) @@ -105,10 +99,7 @@ class NDLUtil SynchronousNodeEvaluator ndlEvaluator(m_net); NDLNode* lastNode = script->Evaluate(ndlEvaluator, L"", ndlPass, skipThrough); if (ndlPass == 
ndlPassResolve) - { SetOutputNodes(script); - FixupInputMinibatchSize(); - } return lastNode; } diff --git a/Source/CNTK/SimpleNetworkBuilder.cpp b/Source/CNTK/SimpleNetworkBuilder.cpp index 8870c74d1023..cce402e29c3c 100644 --- a/Source/CNTK/SimpleNetworkBuilder.cpp +++ b/Source/CNTK/SimpleNetworkBuilder.cpp @@ -1753,7 +1753,7 @@ ComputationNetworkPtr SimpleNetworkBuilder::BuildUnidirectionalLSTMNet input = output; } - size_t idim = input->GetNumRows(); + size_t idim = input->GetSampleMatrixNumRows(); assert(m_lookupTabelOrderSizes.size() == m_streamSizes.size()); e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"Embedding%d", idx), m_layerSizes[1], idim / m_lookupTabelOrderSizes[idx]); @@ -2069,7 +2069,7 @@ ComputationNetworkPtr SimpleNetworkBuilder::BuildBiDirectionalLSTMNetw input = output; } - size_t idim = input->GetNumRows(); + size_t idim = input->GetSampleMatrixNumRows(); assert(m_lookupTabelOrderSizes.size() == m_streamSizes.size()); e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"Embedding%d", idx), m_layerSizes[1], idim / m_lookupTabelOrderSizes[idx]); @@ -2295,7 +2295,7 @@ ComputationNetworkPtr SimpleNetworkBuilder::BuildNetworkFromDbnFile(co unsigned long randomSeed = 1; ComputationNodePtr input, w, b, output, label, prior, scaledLogLikelihood; - shared_ptr> pcNodePtr; + shared_ptr> pcNodePtr; File fstream(dbnModelFileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead); @@ -2354,11 +2354,11 @@ ComputationNetworkPtr SimpleNetworkBuilder::BuildNetworkFromDbnFile(co contextStdDev.TransferFromDeviceToDevice(CPUDEVICE, m_deviceId, true, false, false); w = builder.Mean(input, L"MeanOfFeatures"); - static_pointer_cast>(w)->SideLoadFromMatrix(contextMean); + static_pointer_cast>(w)->SideLoadFromMatrix(contextMean); w->SetParameterUpdateRequired(false); b = builder.InvStdDev(input, L"InvStdOfFeatures"); - static_pointer_cast>(b)->SideLoadFromMatrix(contextStdDev); + static_pointer_cast>(b)->SideLoadFromMatrix(contextStdDev); b->SetParameterUpdateRequired(false); output = builder.PerDimMeanVarNormalization(input, w, b, L"MVNormalizedFeatures"); @@ -2418,7 +2418,7 @@ ComputationNetworkPtr SimpleNetworkBuilder::BuildNetworkFromDbnFile(co assert(priorVals.GetNumCols() == 1 && priorVals.GetNumRows() == m_outputLayerSize); prior = builder.Mean(label, L"Prior"); - static_pointer_cast>(prior)->SideLoadFromMatrix(priorVals); + static_pointer_cast>(prior)->SideLoadFromMatrix(priorVals); prior->SetParameterUpdateRequired(false); } else // pretrained network - need to add output layer, initalize @@ -2431,7 +2431,7 @@ ComputationNetworkPtr SimpleNetworkBuilder::BuildNetworkFromDbnFile(co else std::runtime_error("Output layer size must be specified when converting pretrained network, use outputLayerSize="); - size_t penultimateSize = input->GetNumRows(); + size_t penultimateSize = input->GetSampleMatrixNumRows(); wstring nameOfW = msra::strfun::wstrprintf(L"W%d", i); wstring nameOfB = msra::strfun::wstrprintf(L"B%d", i); @@ -2450,7 +2450,7 @@ ComputationNetworkPtr SimpleNetworkBuilder::BuildNetworkFromDbnFile(co { Matrix zeros = Matrix::Zeros(outputLayerSize, 1, m_deviceId); prior = builder.Mean(label, L"Prior"); - static_pointer_cast>(prior)->MarkComputed(false); + static_pointer_cast>(prior)->MarkComputed(false); prior->Value().SetValue(zeros); } } diff --git a/Source/Common/Include/TensorShape.h b/Source/Common/Include/TensorShape.h index 5b8c5fda0142..8d7bc6330e38 100644 --- a/Source/Common/Include/TensorShape.h +++ 
b/Source/Common/Include/TensorShape.h @@ -383,6 +383,7 @@ struct TensorShape { return m_dims == other.m_dims; } + bool operator!=(const TensorShape& other) const { return !operator==(other); } // duh! // verify that this refers to a dense matrix (no strides) void VerifyIsDense() const @@ -622,6 +623,19 @@ struct TensorShape return *this; } + // compare two TensorShapes, whether they are compatible, considering padding and broadcasting + bool IsElementwiseCompatibleWith(const TensorShape & other) const + { + for (size_t i = 0; i < m_dims.size(); i++) + { + size_t dim = m_dims[i]; + size_t otherDim = i < other.size() ? other[i] : 1; + if (dim != otherDim && dim != 1 && otherDim != 1) // dims mismatch, and neither is broadcasting + return false; + } + return true; + } + // pretty-printing. Returns tensor dims in the form "I x J x K". operator std::string() const { diff --git a/Source/ComputationNetworkLib/CompositeComputationNodes.h b/Source/ComputationNetworkLib/CompositeComputationNodes.h index 300537413da9..d1147594f17c 100644 --- a/Source/ComputationNetworkLib/CompositeComputationNodes.h +++ b/Source/ComputationNetworkLib/CompositeComputationNodes.h @@ -19,34 +19,28 @@ //composite nodes can save memory, computation, or both namespace Microsoft { namespace MSR { namespace CNTK { - // ----------------------------------------------------------------------- - // PreComputedNode - // ----------------------------------------------------------------------- +// ----------------------------------------------------------------------- +// PreComputedNodeBase +// base class for nodes requiring pre-computation +// ----------------------------------------------------------------------- - //this is a noninstantiable virtual class, all nodes require precomputation should derive from it template -class PreComputedNode : public ComputationNodeNonLooping /*ComputationNode*/ - { +class PreComputedNodeBase : public ComputationNodeNonLooping /*ComputationNode*/ +{ typedef ComputationNodeNonLooping Base; UsingComputationNodeMembers; using Base::OperationName; public: - //virtual ComputationNodeBase * NewThis(DEVICEID_TYPE deviceId, const wstring & name) = 0; - //DeclareConstructorFromConfigWithNumInputs(PreComputedNode); - PreComputedNode(DEVICEID_TYPE deviceId, const wstring& name) - : Base(deviceId, name), - m_hasComputed(false) - { - } + PreComputedNodeBase(DEVICEID_TYPE deviceId, const wstring& name) + : Base(deviceId, name), m_hasComputed(false) + { + } // interface through which this node is operated on are these two functions // check whether node has already undergone precomputation - virtual bool HasComputed() const - { - return m_hasComputed; - } + virtual bool HasComputed() const { return m_hasComputed; } // call this with 'false' at start and with 'true' at end // This is used for resetting and updating from accumulators. 
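Note on the TensorShape::IsElementwiseCompatibleWith() added above: a minimal standalone sketch of the broadcasting rule it implements (hypothetical free function over plain dimension vectors, mirroring the member's logic):

    #include <cstddef>
    #include <vector>

    // Shapes are compatible if, dimension by dimension, they either match or
    // one side is 1 (broadcasting). Dimensions missing on the right count as 1;
    // like the member above, the loop runs over the left-hand shape only.
    bool IsElementwiseCompatible(const std::vector<std::size_t>& a, const std::vector<std::size_t>& b)
    {
        for (std::size_t i = 0; i < a.size(); i++)
        {
            std::size_t dim = a[i];
            std::size_t otherDim = i < b.size() ? b[i] : 1;
            if (dim != otherDim && dim != 1 && otherDim != 1) // mismatch, and neither broadcasts
                return false;
        }
        return true;
    }
    // e.g. {512} vs {512} -> true; {512} vs {1} -> true (broadcast); {512} vs {256} -> false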
@@ -56,16 +50,13 @@ class PreComputedNode : public ComputationNodeNonLooping /*ComputationNode*/HasMBLayout()) InvalidArgument("%ls %ls operation requires its input to come in minibatches of samples.", NodeName().c_str(), OperationName().c_str()); - m_pMBLayout = nullptr; // this node does not hold mini-batch data + m_pMBLayout = nullptr; // this node does not hold mini-batch data if (!m_hasComputed) // this node retains state, and state gets destroyed by Resize(), so we must be careful - SetDims(Input(0)->GetSampleLayout(), 1); - else - VerifyDims(Input(0)->GetNumRows(), 1); + SetDims(Input(0)->GetSampleLayout(), false); + else if (!GetSampleLayout().IsElementwiseCompatibleWith(Input(0)->GetSampleLayout())) + InvalidArgument("%ls %ls operation: Precomputed parameter does not match input dimensions.", NodeName().c_str(), OperationName().c_str()); } virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override @@ -107,7 +98,7 @@ class PreComputedNode : public ComputationNodeNonLooping /*ComputationNode*/>(nodeP); + auto node = dynamic_pointer_cast>(nodeP); node->m_hasComputed = m_hasComputed; } } @@ -115,10 +106,12 @@ class PreComputedNode : public ComputationNodeNonLooping /*ComputationNode*/& value) { + if (value.GetNumCols() != 1) + InvalidArgument("SideLoadFromMatrix: Side-loading is only supported for column vectors."); CreateMatrixIfNull(m_value); m_value->SetValue(value); m_hasComputed = true; - SetDims(TensorShape(value.GetNumRows()), value.GetNumCols()); + SetDims(TensorShape(value.GetNumRows()), false); } public: @@ -135,15 +128,15 @@ class PreComputedNode : public ComputationNodeNonLooping /*ComputationNode*/ - class MeanInvStdDevNodeBase : public PreComputedNode, public NumInputs<1> - { - typedef PreComputedNode Base; +class MeanInvStdDevNodeBase : public PreComputedNodeBase, public NumInputs<1> +{ + typedef PreComputedNodeBase Base; UsingPreComputedNodeMembers; //static const std::wstring TypeName() { return L"MeanInvStdDev (base)"; } public: //DeclareConstructorFromConfigWithNumInputs(MeanInvStdDevNodeBase); MeanInvStdDevNodeBase(DEVICEID_TYPE deviceId, const wstring& name) - : PreComputedNode(deviceId, name), + : PreComputedNodeBase(deviceId, name), m_numSamples(SIZE_MAX) { } @@ -161,7 +154,7 @@ template m_numSamples = SIZE_MAX; } - virtual void /*PreComputedNode::*/ MarkComputed(const bool hasComputed, size_t numSamples = 0) + virtual void /*PreComputedNodeBase::*/ MarkComputed(const bool hasComputed, size_t numSamples = 0) { Base::MarkComputed(hasComputed); if (!m_hasComputed) // initialize @@ -236,7 +229,7 @@ template : Base(deviceId, name) { } - virtual void /*PreComputedNode::*/ MarkComputed(const bool hasComputed) + virtual void /*PreComputedNodeBase::*/ MarkComputed(const bool hasComputed) { Base::MarkComputed(hasComputed); if (!m_hasComputed) // initialize accumulation @@ -270,7 +263,7 @@ template if (totalNumSamples == 0) totalNumSamples = 1; // 0/0=1 in this context Matrix::MultiplyAndWeightedAdd(1.0f / totalNumSamples, samples, false, - ConstOnes(samples.GetNumCols(), 1, samples.GetDeviceId()), + ConstOnes(Input(0)->Value().GetNumCols(), 1, samples.GetDeviceId()), false, (ElemType) m_numSamples / totalNumSamples, avg); #if NANCHECK avg.HasNan("Mean-avg"); @@ -308,14 +301,14 @@ template { } - virtual void /*PreComputedNode::*/ MarkComputed(const bool hasComputed) override + virtual void /*PreComputedNodeBase::*/ MarkComputed(const bool hasComputed) override { Base::MarkComputed(hasComputed); if 
(!m_hasComputed) // initialize { // reset accumulators - size_t inputDim = Input(0)->GetNumRows(); + size_t inputDim = Input(0)->GetSampleMatrixNumRows(); m_mean.Resize(inputDim, 1); m_var.Resize(inputDim, 1); m_mean.SetValue(0); @@ -366,7 +359,7 @@ template if (totalNumSamples == 0) totalNumSamples = 1; // 0/0=1 in this context Matrix::MultiplyAndWeightedAdd(1.0f / totalNumSamples, samples, false, - ConstOnes(samples.GetNumCols(), 1, samples.GetDeviceId()), + ConstOnes(Input(0)->Value().GetNumCols(), 1, samples.GetDeviceId()), false, (ElemType) m_numSamples / totalNumSamples, m_mean); m_temp -= m_mean; @@ -377,14 +370,18 @@ template m_temp.AssignElementPowerOf(m_temp, 2); Matrix::MultiplyAndWeightedAdd(1.0f / totalNumSamples, m_temp, false, - ConstOnes(samples.GetNumCols(), 1, samples.GetDeviceId()), + ConstOnes(Input(0)->Value().GetNumCols(), 1, samples.GetDeviceId()), false, (ElemType) m_numSamples / totalNumSamples, m_var); #if NANCHECK m_var.HasNan("InvStdDev-m_var"); #endif - m_numSamples += samples.GetNumCols(); +#if 0 // BUGBUG: This is the correct version, but it will break test cases, so do this later. MeanNode does it right already. + m_numSamples += Input(0)->GetMBLayout()->GetActualNumSamples(); +#else + m_numSamples += Input(0)->Value().GetNumCols(); // BUGBUG: Should be -> GetActualNumSamples(). +#endif } virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override @@ -436,9 +433,9 @@ template virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { - //only feature (input0) and output needs to be sliced - Matrix sliceInput0Value = Input(0)->ValueFor(fr); - Matrix sliceOutputValue = ValueFor(fr); + // only feature (input0) and output needs to be sliced + auto sliceInput0Value = Input(0)->ValueFor(fr); + auto sliceOutputValue = ValueFor(fr); ForwardPropS(sliceOutputValue, sliceInput0Value, Input(1)->Value(), Input(2)->Value()); } @@ -490,32 +487,20 @@ template "type or (Mean, InvStdDev) so that the values will be saved."); } - { - size_t rows = (Input(1)->GetNumRows() == 0) ? Input(0)->GetNumRows() : Input(1)->GetNumRows(); - ValidateInferInputDims(1, rows, 1); - } - - { - size_t rows = (Input(2)->GetNumRows() == 0) ? Input(0)->GetNumRows() : Input(2)->GetNumRows(); - ValidateInferInputDims(2, rows, 1); - } + Input(1)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout()); + Input(2)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout()); if (isFinalValidationPass) { - //match rows - if (!(Input(0)->GetNumRows() == Input(1)->GetNumRows() && - Input(2)->GetNumRows() == Input(1)->GetNumRows())) - { - LogicError("PerDimMeanVarNormalizationNode: All inputs should have same number of rows."); - } - - if (!(Input(1)->GetNumCols() == 1 && Input(2)->GetNumCols() == 1)) - LogicError("PerDimMeanVarNormalizationNode: Mean and InvStdDev should be a colum vector."); + if (!Input(0)->HasMBLayout() || Input(1)->HasMBLayout() || Input(2)->HasMBLayout()) + InvalidArgument("PerDimMeanVarNormalizationNode: Inputs must be data, while mean and InvStdDev must be column vectors."); + if (!Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout()) || !Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(2)->GetSampleLayout())) + InvalidArgument("PerDimMeanVarNormalizationNode: All inputs should have same sample layout."); } // TODO: Is this correct? Why not just skip propagating a gradient into these? We should not poke around in our children. 
- Input(1)->SetParameterUpdateRequired(false); - Input(2)->SetParameterUpdateRequired(false); //prevent learning + Input(1)->SetParameterUpdateRequired(false); // prevent learning + Input(2)->SetParameterUpdateRequired(false); SetDims(Input(0)); } @@ -553,9 +538,9 @@ template //(feature-mean).*InvStdDev virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { - //only feature (input0) and output needs to be sliced - Matrix sliceInput0Value = Input(0)->ValueFor(fr); - Matrix sliceOutputValue = ValueFor(fr); + // only feature (input0) and output needs to be sliced + auto sliceInput0Value = Input(0)->ValueFor(fr); + auto sliceOutputValue = ValueFor(fr); ForwardPropS(sliceOutputValue, sliceInput0Value, Input(1)->Value(), Input(2)->Value()); } @@ -614,33 +599,19 @@ template "LearnableParameter type or (Mean, InvStdDev) so that the values will be saved."); } - { - size_t rows = Input(1)->GetNumRows() == 0 ? Input(0)->GetNumRows() : Input(1)->GetNumRows(); - ValidateInferInputDims(1, rows, 1); - } - - { - size_t rows = Input(2)->GetNumRows() == 0 ? Input(0)->GetNumRows() : Input(2)->GetNumRows(); - ValidateInferInputDims(2, rows, 1); - } + Input(1)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout()); + Input(2)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout()); if (isFinalValidationPass) { - if (!(Input(0)->GetNumRows() == Input(1)->GetNumRows() && //match rows - Input(2)->GetNumRows() == Input(1)->GetNumRows())) - { - LogicError("PerDimMeanVarDeNormalizationNode: All inputs should have same number of rows."); - } - - if (!(Input(1)->GetNumCols() == 1 && Input(2)->GetNumCols() == 1)) - { - LogicError("PerDimMeanVarDeNormalizationNode: Mean and InvStdDev should be a colum vector."); - } + if (!Input(0)->HasMBLayout() || Input(1)->HasMBLayout() || Input(2)->HasMBLayout()) + InvalidArgument("PerDimMeanVarDeNormalizationNode: Inputs must be data, while mean and InvStdDev must be column vectors."); + if (!Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout()) || !Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(2)->GetSampleLayout())) + InvalidArgument("PerDimMeanVarDeNormalizationNode: All inputs should have same sample layout."); } - //prevent learning - // TODO: Is this correct? Why not just skip propagating a gradient into these? - Input(1)->SetParameterUpdateRequired(false); + // TODO: Is this correct? Why not just skip propagating a gradient into these? We should not poke around in our children. 
+ Input(1)->SetParameterUpdateRequired(false); // prevent learning Input(2)->SetParameterUpdateRequired(false); SetDims(Input(0)); @@ -700,7 +671,7 @@ class BatchModeNode : public ComputationNodeNonLooping /*ComputationNode*/>(pNode)->Value().SetValue((double)value); else if (pNode->RequiresPreCompute()) { - if (IsNodePtr>(pNode)) + if (IsNodePtr>(pNode)) { - auto preComputedNode = AsNodePtr>(pNode); + auto preComputedNode = AsNodePtr>(pNode); preComputedNode->Value().SetValue((float)value); preComputedNode->MarkComputed(true); } else { - auto preComputedNode = AsNodePtr>(pNode); + auto preComputedNode = AsNodePtr>(pNode); preComputedNode->Value().SetValue((double)value); preComputedNode->MarkComputed(true); } @@ -408,30 +408,6 @@ void ComputationNetwork::InitLearnableParameters(const ComputationNodeBasePtr& n learnableParameterNode->InitRandom(uniformInit, randomSeed + GetRandomSeedOffset(), initValueScale, initOnCPUOnly); } -// FixupInputMinibatchSize - go through all the inputs and make sure they have a consistent minibatch size (after creation) -void ComputationNetwork::FixupInputMinibatchSize() -{ - list inputs = GetNodesWithType(OperationNameOf(InputValue)); - int minibatchMax = 0; - bool minibatchDifferent = false; // flag to see if all the values are already the same - for (ComputationNodeBasePtr node : inputs) - { - size_t cols = node->GetNumCols(); - if (cols != minibatchMax) - { - if (minibatchMax != 0) - minibatchDifferent = true; - if (minibatchMax < cols) - minibatchMax = cols; - } - } - if (minibatchDifferent) - { - for (ComputationNodeBasePtr node : inputs) - node->SetNumCols(minibatchMax); - } -} - bool ComputationNetwork::IsTypicalCriterionNode(ComputationNodeBasePtr nodePtr) { // TODO: just use return! @@ -485,8 +461,8 @@ void ComputationNetwork::GetNodesRequiringX(list& nodesR list ComputationNetwork::GetNodesRequiringPreComputation(const ComputationNodeBasePtr& rootNode, bool checkComputed) { list nodesRequiringX; - GetNodesRequiringX>(nodesRequiringX, rootNode, checkComputed); - GetNodesRequiringX>(nodesRequiringX, rootNode, checkComputed); + GetNodesRequiringX>(nodesRequiringX, rootNode, checkComputed); + GetNodesRequiringX>(nodesRequiringX, rootNode, checkComputed); return nodesRequiringX; } @@ -794,10 +770,8 @@ void ComputationNetwork::DescribeNetworkUsingDot(list& arcs, for (const auto& x : allnodes) { line.clear(); - size_t nrows = x->GetNumRows(); - size_t ncols = x->GetNumCols(); - line = msra::strfun::wstrprintf(L" \"%ls\" [ label = \"%ls [%d,%d]\\n%ls\" ] ;\n", - x->GetName().c_str(), x->GetName().c_str(), nrows, ncols, + line = msra::strfun::wstrprintf(L" \"%ls\" [ label = \"%ls [%s%s]\\n%ls\" ] ;\n", + x->GetName().c_str(), x->GetName().c_str(), string(x->GetSampleLayout()).c_str(), x->HasMBLayout() ? " x *" : "", x->OperationName().c_str()); fstream << line; } diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index a41294b70820..34d82c412d43 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -287,13 +287,14 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // determine the actual MB size from the feature nodes // This returns max number of columns over the feature nodes. // Note that if we have multiple slices, MB size != #frames. + // BUGBUG: This will break once we have inconsistent layouts. 
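The function below takes the maximum of GetMBLayout()->GetNumCols() over the feature nodes. For orientation, a hedged sketch of the bookkeeping this relies on (hypothetical struct, not the real MBLayout class):

    #include <cstddef>

    // An MBLayout describes S parallel sequences laid out over T time steps;
    // the matrix behind a minibatch node holds one column per (sequence, time
    // step) pair, so the column count -- the actual MB size -- is S * T.
    struct MBLayoutSketch
    {
        std::size_t numParallelSequences; // S
        std::size_t numTimeSteps;         // T
        std::size_t GetNumCols() const { return numParallelSequences * numTimeSteps; }
    };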
size_t DetermineActualMBSizeFromFeatures() const { size_t actualMBSize = 0; const auto& featureNodes = FeatureNodes(); // TODO: a getter; should be called GetFeatureNodes() for (auto& nodeIter : featureNodes) - actualMBSize = max(actualMBSize, nodeIter->GetNumCols()); + actualMBSize = max(actualMBSize, nodeIter->GetMBLayout()->GetNumCols()); return actualMBSize; } @@ -463,45 +464,23 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb } // these are specified as such by the user - inline std::vector& FeatureNodes() - { - return m_features; - } - inline const std::vector& FeatureNodes() const - { - return m_features; - } - inline std::vector& LabelNodes() - { - return m_labels; - } - inline std::vector& FinalCriterionNodes() - { - return m_finalCriteria; - } + inline std::vector& FeatureNodes() { return m_features; } + inline const std::vector& FeatureNodes() const { return m_features; } + inline std::vector& LabelNodes() { return m_labels; } + inline std::vector& FinalCriterionNodes() { return m_finalCriteria; } inline std::vector CriterionNodesFrom(const wstring& criterionNodeName) { ComputationNodeBasePtr node = GetNodeFromName(criterionNodeName); ValidateSubNetwork(node); - if (node->GetNumRows() != 1 || node->GetNumCols() != 1) - InvalidArgument("the criterionNodeName specified in the config file is not a valid training or eval criterion node."); - // TODO: test this, then remove this comment - return std::vector{node}; + if (node->HasMBLayout() || node->GetSampleLayout().GetNumElements() != 1) + InvalidArgument("%ls %ls operation is not a valid training or eval criterion node.", node->NodeName().c_str(), node->OperationName().c_str()); + return std::vector{ node }; } - inline std::vector& EvaluationNodes() - { - return m_evalNodes; - } - inline std::vector& OutputNodes() - { - return m_outputNodes; - } - inline std::vector& PairNodes() - { - return m_pairNodes; - } + inline std::vector& EvaluationNodes() { return m_evalNodes; } + inline std::vector& OutputNodes() { return m_outputNodes; } + inline std::vector& PairNodes() { return m_pairNodes; } // ----------------------------------------------------------------------- // node access @@ -685,9 +664,6 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb node->ZeroGradientsOfInputs(); } - // FixupInputMinibatchSize - go through all the inputs and make sure they have a consistent minibatch size (after creation) - void FixupInputMinibatchSize(); - private: bool IsTypicalCriterionNode(ComputationNodeBasePtr nodePtr); void PrintComputationTree(const ComputationNodeBasePtr& rootNode, const bool forwardCompute, const bool printMatrices = false); @@ -833,10 +809,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb virtual void ForwardProp(const FrameRange&) override; virtual void EndForwardProp() override; virtual void BeginBackprop() override; - virtual void BackpropTo(const size_t inputIndex, const FrameRange&) override - { - NOT_IMPLEMENTED; - } // ugh, call Backprop() instead + virtual void BackpropTo(const size_t inputIndex, const FrameRange&) override { NOT_IMPLEMENTED; } virtual void EndBackprop() override; virtual void Backprop(const FrameRange& fr, bool childrenInThisLoop, bool childrenInOuterLoop) override; virtual void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool); diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index 
326b302fe751..1720aa1bd317 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -52,7 +52,7 @@ static bool SetGradientToScalarOne(ComputationNodeBasePtr nodep) bool hasMatchingType = (node != nullptr); if (hasMatchingType) { - node->VerifyDims(1, 1); + node->Value().VerifySize(1, 1); node->Gradient().Resize(1, 1); node->Gradient().SetValue((ElemType) 1.0); } @@ -432,7 +432,6 @@ void ComputationNetwork::CompileNetwork() // :) // STEP: Some final details. - FixupInputMinibatchSize(); // post-fix MB sizes in InputValues(). Will not be needed with next-gen reader. ResetEvalTimeStamps(); // invalidate all m_value fields. Really belongs into StartEvaluateMinibatchLoop() fprintf(stderr, "\nPost-processing network complete.\n"); @@ -612,7 +611,7 @@ void ComputationNetwork::ValidateSubNetwork(const ComputationNodeBasePtr& rootNo for (auto& node : nodes) { // nodes must output non-zero dimensional data, otherwise assume user error - if (node->GetNumRows() == 0 && (node->GetMBLayout() || node->GetNumCols() == 0)) + if (node->GetSampleLayout().GetNumElements() == 0) RuntimeError("%ls operation has 0 elements", node->NodeName().c_str()); } fprintf(stderr, "\n\n"); @@ -633,6 +632,12 @@ void ComputationNetwork::ValidateSubNetwork(const ComputationNodeBasePtr& rootNo } } +// helper to discover dimension changes +static pair GetDims(const ComputationNodeBasePtr & node) +{ + return make_pair(node->GetSampleLayout(), node->HasMBLayout()); +} + void ComputationNetwork::ValidateNodes(list nodes, bool isFinalValidationPass, size_t& todo) { todo = 0; // returns how many nodes are to be redone @@ -655,15 +660,15 @@ void ComputationNetwork::ValidateNodes(list nodes, bool // got at least one child: it makes sense to call Validate() // keep state MBLayoutPtr oldMBLayoutPtr = node->GetMBLayout(); - auto dim = node->GetDims(); - vector> childDims; + auto dim = GetDims(node); + vector> childDims; for (auto& child : children) - childDims.push_back(child->GetDims()); + childDims.push_back(GetDims(child)); auto sampleLayout = node->GetSampleLayout(); // We do call validate(final) as many times as needed, since stuff may have changed underneath. node->PrintSelfBeforeValidation(); node->Validate(isFinalValidationPass /*final*/); // all nodes have been visited: do verification instead of just inference - fprintf(stderr, " -> [%lu [%s], %s%lu]", node->GetNumRows(), string(node->GetSampleLayout()).c_str(), node->HasMBLayout() ? "MBSize " : "", node->GetNumCols()); + fprintf(stderr, " -> [%s%s]", string(node->GetSampleLayout()).c_str(), node->HasMBLayout() ? 
" x *" : ""); node->m_visited = true; // also take the opportunity to propagate m_needsGradient auto needsGradient = node->m_needsGradient; @@ -672,10 +677,10 @@ void ComputationNetwork::ValidateNodes(list nodes, bool // check state --node will be valid if all nodes have been visited and node has not been updated bool unchanged = true; unchanged &= (oldMBLayoutPtr == node->GetMBLayout()); - unchanged &= (dim == node->GetDims()); - vector> newChildDims; + unchanged &= (dim == GetDims(node)); + vector> newChildDims; for (auto& child : children) - newChildDims.push_back(child->GetDims()); + newChildDims.push_back(GetDims(child)); unchanged &= (childDims == newChildDims); unchanged &= (sampleLayout == node->GetSampleLayout()); unchanged &= (needsGradient == node->m_needsGradient); diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp index 86abe774dc5f..3444ca412696 100644 --- a/Source/ComputationNetworkLib/ComputationNode.cpp +++ b/Source/ComputationNetworkLib/ComputationNode.cpp @@ -66,21 +66,11 @@ void ComputationNodeBase::ValidateBinaryZip(bool isFinalValidationPass, bool all ValidateInferBinaryInputDims(); - size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols(); - size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols(); - -#if 1 //ndef ENABLE_TENSORVIEW - // TODO: This test will go away once we switch to full tensor lib. - if (isFinalValidationPass && !((rows0 == rows1 && (Input(0)->GetMBLayout() == Input(1)->GetMBLayout() || cols0 == cols1)) || // matching size (obvious case) - (allowMultiples && (rows0 == 1 || rows1 == 1) && (Input(0)->GetMBLayout() == Input(1)->GetMBLayout() || cols0 == cols1)) || // one is row vec - (allowMultiples && ((!HasMBLayout() && cols0 > cols1 && cols0 % cols1 == 0) || (cols0 == 1 && rows1 % rows0 == 0) || (cols1 == 1 && rows0 % rows1 == 0))))) // TODO: ^^ I don't understand the asymmetry of this last one + if (isFinalValidationPass && + Input(0)->GetMBLayout() != Input(1)->GetMBLayout() && Input(0)->HasMBLayout() && Input(1)->HasMBLayout()) { - LogicError("The Matrix dimensions in the %ls %ls operation do not match.", NodeName().c_str(), OperationName().c_str()); + LogicError("MB layouts in the %ls %ls operation do not match.", NodeName().c_str(), OperationName().c_str()); } -#else - rows0; - rows1; -#endif // result has tensor shape with dimensions being the max over both let shape0 = GetInputSampleLayout(0); @@ -103,16 +93,18 @@ void ComputationNodeBase::ValidateBinaryZip(bool isFinalValidationPass, bool all NodeName().c_str(), OperationName().c_str(), string(shape0).c_str(), string(shape1).c_str()); } - SetDims(TensorShape(dims), GetMBLayout() ? GetMBLayout()->GetNumCols() : max(cols0, cols1)); + SetDims(TensorShape(dims), HasMBLayout()); } + // unary reduce-to-(1,1) operation, e.g. MatrixL1RegNode void ComputationNodeBase::ValidateUnaryReduce(bool isFinalValidationPass) { assert(m_inputs.size() == 1); ComputationNodeBase::Validate(isFinalValidationPass); m_pMBLayout = nullptr; // this node does not hold mini-batch data - SetDims(TensorShape(1), 1); + SetDims(TensorShape(1), false); } + // binary reduce-to-(1,1) operation, e.g. CrossEntropyWithSoftmaxNode // Currently only called by criterion nodes. // This function also infers child LearnableParameters. In case you wonder why this is needed for criterion nodes, there are edge cases, e.g. 
a @@ -123,15 +115,16 @@ void ComputationNodeBase::ValidateBinaryReduce(bool isFinalValidationPass) m_pMBLayout = nullptr; // this node does not hold mini-batch data ValidateInferBinaryInputDims(); if (isFinalValidationPass && - !(Input(0)->GetNumRows() == Input(1)->GetNumRows() && - (Input(0)->HasMBLayout() || (Input(0)->GetNumCols() == Input(1)->GetNumCols())))) - LogicError("The Matrix dimensions in the %ls %ls operation do not match.", NodeName().c_str(), OperationName().c_str()); - SetDims(TensorShape(1), 1); + !(Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout()) && // TODO: Do we need broadcasting for these cases? + (Input(0)->GetMBLayout() == Input(1)->GetMBLayout() || !Input(0)->HasMBLayout() || !Input(1)->HasMBLayout()))) + LogicError("The Matrix dimensions or MB layout in the %ls %ls operation do not match.", NodeName().c_str(), OperationName().c_str()); + SetDims(TensorShape(1), false); } + // helper function for validation -// In bad cases of convolution, dimensions are quite complex to know. -// This is a feature that allows a node to help resizing its input node to the expected value. -// TODO: This is shaky by design. +// In complex cases of convolution, dimensions are quite difficult for a user to know/derive. +// This is a feature that allows a node to help resizing its input node to the expected value +// iff that input must be a learnable parameter. void ComputationNodeBase::ValidateInferBinaryInputDims() { // limited inference of children dimensions @@ -144,27 +137,44 @@ void ComputationNodeBase::ValidateInferBinaryInputDims() auto in = Input(index); auto other = Input(1 - index); // borrow any unset dimension on one input from the other input - size_t rows = in->GetNumRows() == 0 ? other->GetNumRows() /*borrow from peer*/ : in->GetNumRows() /*keep as is*/; - size_t cols = (!in->HasMBLayout() && in->GetNumCols() == 0) ? other->GetNumCols() /*borrow from peer*/ : in->GetNumCols() /*keep as is*/; - ValidateInferInputDims(index, rows, cols); + in->ValidateInferInputDimsFrom(other->GetSampleLayout()); } } -// BUGBUG: Change this to take a TensorShape. + +// in case of an error, we just back out, and leave it to outside code to detect errors template -void ComputationNode::ValidateInferInputDims(size_t i, size_t rows, size_t cols) //override final +void ComputationNode::ValidateInferInputDimsFrom(const TensorShape & otherShape) { - if (Input(i)->OperationName() == OperationNameOf(LearnableParameter) && Input(i)->GetNumRows() == 0) + if (OperationName() != OperationNameOf(LearnableParameter)) // only infer LearnableParameters (we can't propagate further) + return; + + // see where we stand with our shape + bool hasMissingDims = m_sampleLayout.GetRank() == 0 || m_sampleLayout.GetNumElements() == 0; + if (!hasMissingDims) // all there--nothing to infer + return; + + // infer at least one dimension + if (otherShape.GetRank() == 0 || otherShape.GetNumElements() == 0) + return;// LogicError("ValidateInferInputDimsFrom: Inferred dimensions must not be empty."); + + // if no dimensions have been set at all, copy otherShape + // Don't verify dimensions in this case, because the node may have explicitly been defined as a vector of 0 elements. 
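Restating the fill-in rule implemented here as a self-contained sketch over plain dimension vectors (function name hypothetical; the real code backs out silently rather than returning false):

    #include <cstddef>
    #include <vector>

    // Infer zero dimensions of 'dims' from 'other': copy wholesale if nothing
    // is set; fill in the zeroes if the ranks match; otherwise leave 'dims' alone.
    bool InferDimsFrom(std::vector<std::size_t>& dims, const std::vector<std::size_t>& other)
    {
        if (other.empty())
            return false; // nothing to infer from
        bool hasAnyDim = false;
        for (std::size_t d : dims)
            hasAnyDim |= (d != 0);
        if (!hasAnyDim) // fully unspecified: copy the peer's shape
            dims = other;
        else if (dims.size() == other.size()) // partially specified: fill in the zeroes
        {
            for (std::size_t i = 0; i < dims.size(); i++)
                if (dims[i] == 0)
                    dims[i] = other[i];
        }
        else
            return false; // rank mismatch: leave error detection to the final validation pass
        return true;
    }
    // e.g. {0, 32} + {512, 32} -> {512, 32};  {} + {512} -> {512}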
+ bool hasAnyDim = false; + for (auto dim : m_sampleLayout.GetDims()) + hasAnyDim |= dim != 0; + if (!hasAnyDim) + m_sampleLayout = otherShape; + else if (hasMissingDims) // we got a pre-existing shape: If it has zeroes, we fill them in from otherShape { - if (rows == 0 || cols == 0) - LogicError("ValidateInferInputDims: Inferred matrix must not be empty."); - Input(i)->SetDims(rows == Input(i)->GetNumRows() ? Input(i)->GetSampleLayout() : TensorShape(rows), cols); - // BUGBUG: This will loose tensor shape. - Input(i)->Validate(true); // validate it properly - // BUGBUG: ^^ Validate() calls are under the control of ValidateSubNetwork(). E.g. it checks whether something has changed & re-validates until there is no change. If we validate here, the change goes unnoticed. - // big BUGBUG: This should do random initialization as requested by user in the first place. - Input(i)->Value().SetValue(0); - fprintf(stderr, "ValidateInferInputDims: %ls %ls operation inferred, resized to (%d x %d), and (incorrectly) initialized to 0.\n", Input(i)->NodeName().c_str(), Input(i)->OperationName().c_str(), (int) rows, (int) cols); + if (m_sampleLayout.GetRank() != 0 && m_sampleLayout.GetRank() != otherShape.GetRank()) + return;// LogicError("ValidateInferInputDimsFrom: Inferred dimensions must match in rank."); + SmallVector newDims = m_sampleLayout.GetDims(); + for (size_t i = 0; i < m_sampleLayout.GetRank(); i++) + if (newDims[i] == 0) + newDims[i] = otherShape[i]; + m_sampleLayout = TensorShape(newDims); } + fprintf(stderr, "Tensor shape of %ls %ls operation was inferred as [%s].\n", NodeName().c_str(), OperationName().c_str(), string(m_sampleLayout).c_str()); } // ----------------------------------------------------------------------- @@ -180,8 +190,6 @@ size_t ComputationNodeBase::DetermineElementwiseTensorRank() const for (size_t i = 0; i < GetNumInputs(); i++) { size_t rank = Input(i)->GetSampleLayout().GetRank(); - if (!HasMBLayout()) // no MBLayout: last dim is column dimension - rank++; if (maxRank < rank) maxRank = rank; } @@ -192,12 +200,13 @@ size_t ComputationNodeBase::DetermineElementwiseTensorRank() const TensorShape ComputationNodeBase::GetTensorShape(size_t rank) const { // If we have an MB layout then add the necessary dimensions. If we have none, then absorb the column dimension. - TensorShape tensorShape = GetSampleLayout(); // TODO: Can this tensor arbitrary strides? In case it came out of a Slice, Reshape, or Transpose op in-place - if (!HasMBLayout()) - tensorShape.AppendInPlace(tensorShape.GetRank(), GetNumCols()); // last dim is column dimension - // TODO: This is not nice! Instead, if no MBLayout then have sample layout explain whole matrix. - else - tensorShape.AppendInPlace(rank, GetMBLayout()->GetNumParallelSequences()).AppendInPlace(rank + 1, GetMBLayout()->GetNumTimeSteps()); + TensorShape tensorShape = GetSampleLayout(); // TODO: Can this tensor have arbitrary strides? 
In case it came out of a Slice, Reshape, or Transpose op in-place + if (HasMBLayout()) + { + size_t i = rank; + tensorShape.AppendInPlace(i++, GetMBLayout()->GetNumParallelSequences()); + tensorShape.AppendInPlace(i++, GetMBLayout()->GetNumTimeSteps()); + } return tensorShape; } diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 15801df2a126..04d3d163fe78 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -332,44 +332,113 @@ class ComputationNodeBase : public IComputationNode, // dimensions - size_t GetNumRows() const { return m_sampleLayout.GetNumElements(); } - size_t GetNumCols() const + // The value of a node is a tensor in one of two variants: + // + // - single matrix, vector, tensor + // - m_sampleLayout contains the shape. Accessed through GetSampleLayout(). + // - m_pMBLayout is null + // - minibatch data + // - consists of many samples which are all tensors of m_sampleLayout + // - adds two additional tensor dimensions, time step and parallel sequence + // These change for each minibatch and are unknown during validation. + // - m_sampleLayout is the tensor shape of the samples + // - m_pMBLayout defines the number of time steps and parallel sequences (="tensor shape" of the minibatch) + // Accessed through GetMBLayout(); test for through HasMBLayout(). + // + // The values can be accessed in three ways: + // + // - as a tensor + // - GetTensorShape() forms the joint tensor that incorporates both m_sampleLayout and, if present, m_pMBLayout + // - Elementwise tensor operations operate on these. + // - If no MBLayout is present in one of multiple elementwise operands, it will be interpreted as a one-sample minibatch that broadcasts to all samples. + // - learnable parameters hold tensors that are not minibatches + // - as a sample matrix + // - many nodes do not care about the specific sample-tensor dimensions + // - but may care about selecting a single time step out of a minibatch + // - minibatch: each matrix column contains a sample tensor flattened, with one column per time step and parallel sequence + // - tensor: one column containing the sample tensor flattened + // - GetSampleMatrixNumRows(), GetSampleMatrixNumCols() + // - this is how it is stored + // - as a Matrix reference + // - actual object is a 2D tensor without MB Layout + // - ValueAsMatrix(), GradientAsMatrix() returns tensor as a 2D Matrix object + // - nodes that do this are: TimesNode, DiagTimesNode, ConvolutionNode, NoiseContrastiveEstimationNode, ClassBasedCrossEntropyWithSoftmaxNode, TransposeNode, DiagonalNode + + // interpretation as a set of samples + const TensorShape& GetSampleLayout() const { return m_sampleLayout; } + bool HasSampleLayout() const { return m_sampleLayout.GetRank() != 1; } // does it have a layout that is not just a vector? 
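To make the three views concrete, a worked example with hypothetical dimensions -- a node with sample layout [3 x 4] and an MBLayout of S=2 parallel sequences over T=5 time steps (so the layout has 2*5=10 columns):

    // tensor view:        GetTensorShape(rank) -> [3 x 4 x 2 x 5]
    //                     (sample dims, then parallel sequences, then time steps)
    // sample matrix view: GetSampleMatrixNumRows() -> 12  (3*4, sample flattened)
    //                     GetSampleMatrixNumCols() -> 10  (one column per sample)
    // Matrix view:        not available here: ValueAsMatrix() requires no MBLayout.
    //                     A parameter with sample layout [512 x 128] would instead
    //                     give GetAsMatrixNumRows()=512, GetAsMatrixNumCols()=128.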
+ + // interpretation as sample matrix (each column is a sample, individual sample tensor dimensions do not matter for the operation) + size_t GetSampleMatrixNumRows() const + { + return m_sampleLayout.GetNumElements(); + } + size_t GetSampleMatrixNumCols() const + { + if (HasMBLayout()) + return GetMBLayout()->GetNumCols(); + else + return 1; // no layout: treat as 1-sample minibatch that is meant to broadcast + } + // determine if we are the output of an op over 'other', whether that would be a reduction, so that we need to mask + bool ReducesInTimeWrt(const ComputationNodeBasePtr & other) const { - if (HasMBLayout() && GetMBLayout()->GetNumCols() != m_numCols) - LogicError("GetNumCols: %ls %ls operation: Inconsistency between m_numCols and MBLayout", NodeName().c_str(), OperationName().c_str()); - return m_numCols; + return GetSampleMatrixNumCols() < other->GetSampleMatrixNumCols(); + } + + // interpretation as a Matrix reference + private: + void CheckTensorIsMatrix() const + { + if (HasMBLayout()) + LogicError("CheckTensorIsMatrix: Minibatch data cannot be interpreted as a single 2D tensor."); + else if (m_sampleLayout.GetRank() < 1 || m_sampleLayout.GetRank() > 2) // note: scalars are not stored as tensors of rank 0, but rather as 1-dim vectors. TODO: clean this up some day + LogicError("CheckTensorIsMatrix: Sample is not a 2D tensor."); } - pair<size_t, size_t> GetDims() { return make_pair(GetNumRows(), GetNumCols()); } - // TODO: add an overload SetDims(TensorShape, cols) - // Currently called from: - // - Validate() --intended - // - LearnableParameterNode (init, load) - // - InputValue (init, load) - // - DelayedValueNodeBase (Init()) - // use a different name for these: - // - various unit tests - // deprecated ones: - // - TimeReverseNode (first step--deprecate and/or move to UpdateMB... function) - // - StrideTimesNode - // - PairNetworkNode - // - LSTMNode - // set our dimensions (rows, cols, sample layout) - // TODO: Separate SetDims() into version with and without MBLayout. - void SetDims(const TensorShape& sampleLayout, size_t cols) + public: + size_t GetAsMatrixNumRows() const + { + CheckTensorIsMatrix(); + return m_sampleLayout[0]; + } + size_t GetAsMatrixNumCols() const + { + CheckTensorIsMatrix(); + return m_sampleLayout.GetRank() > 1 ? m_sampleLayout[1] : 1; // a column vector is also a Matrix + } + + // set dimensions of the node + // The MBLayout must be set first, and 'isMinibatch' will be checked against it. + void SetDims(const TensorShape& sampleLayout, bool isMinibatch) { + if (HasMBLayout() != isMinibatch) + LogicError("SetDims: MBLayout must be set first, before calling this function."); m_sampleLayout = sampleLayout; - m_numCols = cols; } // copy dimensions (rows, cols, sample layout) from another node - void SetDims(const ComputationNodeBasePtr& node) + void SetDims(const ComputationNodeBasePtr& node) { - SetDims(node->GetSampleLayout(), node->GetNumCols()); + SetDims(node->GetSampleLayout(), node->HasMBLayout()); } // use this only for testing code. Everywhere else, be explicit on the TensorShape.
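A hedged sketch of the calling convention the new SetDims() enforces, using the two patterns that recur in this patch (the minibatch flag must agree with a previously set m_pMBLayout):

    // minibatch-producing node (sketch; the layout is inherited from an input first):
    SetDims(Input(0)->GetSampleLayout(), true);  // 'true' must agree with HasMBLayout()

    // single-tensor node, as in ValidateUnaryReduce() above:
    m_pMBLayout = nullptr;           // this node does not hold minibatch data
    SetDims(TensorShape(1), false);  // scalar output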
void SetDims1(size_t rows, size_t cols) { - SetDims(TensorShape(rows), cols); + SetDims(TensorShape(rows, cols), false); } +#if 0 + // deprecated functions that did not distinguish the purpose + size_t GetNumRows() const { return GetSampleMatrixNumRows(); } + size_t GetNumCols() const + { + if (HasMBLayout() && GetNumMBCols() != m_numCols) + LogicError("GetNumCols: %ls %ls operation: Inconsistency between m_numCols (%d) and MBLayout (%d)", NodeName().c_str(), OperationName().c_str(), m_numCols, (int)GetNumMBCols()); + else if (!HasMBLayout() && m_sampleLayout.GetRank() == 0 && m_numCols != 0) + LogicError("GetNumCols: %ls %ls operation: Inconsistency between m_numCols (%d) and sample layout (empty)", NodeName().c_str(), OperationName().c_str(), (int)m_numCols); + else if (!HasMBLayout() && m_sampleLayout.GetRank() > 0 && m_numCols != m_sampleLayout.GetDims().back()) + LogicError("GetNumCols: %ls %ls operation: Inconsistency between m_numCols (%d) and last dim of sample layout [%s]", NodeName().c_str(), OperationName().c_str(), (int)m_numCols, string(m_sampleLayout).c_str()); + return m_numCols; + } + size_t GetNumCols1() const { return m_numCols; } // update number of columns (in response to MB size) // TODO: this should go away, as m_numCols should be derived from MBLayout each time void SetNumCols(size_t cols) @@ -382,34 +451,28 @@ class ComputationNodeBase : public IComputationNode, m_numCols = cols; // actual memory allocation happens elsewhere } - virtual void NotifyFunctionValuesMBSizeModified() - { - } // someone outside changed our m_value--update our internal dimensions - void VerifyDims(size_t rows, size_t cols) +#endif + // get number of underlying matrix columns for test code only which does not create MBLayouts + size_t GetNumCols1() const { return m_numCols; } + virtual void NotifyFunctionValuesMBSizeModified() = 0; + void VerifyDims(const TensorShape & shape, bool isMinibatch) { - if (rows != GetNumRows() || cols != GetNumCols()) + if (m_sampleLayout.GetDims() != shape.GetDims() || HasMBLayout() != isMinibatch) { - LogicError("VerifyDims: %ls %ls operation expected size %d x %d, but it is %d x %d", + LogicError("VerifyDims: %ls %ls operation expected a %s of [%s], but it is a %s of [%s]", NodeName().c_str(), OperationName().c_str(), - (int) rows, (int) cols, (int) GetNumRows(), (int) GetNumCols()); + isMinibatch ? "minibatch" : "tensor", string(shape).c_str(), + HasMBLayout() ? 
"minibatch" : "tensor", string(m_sampleLayout).c_str()); } } - virtual void VerifyDims(ComputationNodeBasePtr node) - { - VerifyDims(node->GetNumRows(), node->GetNumCols()); - } - virtual void VerifyDimsMatch() const = 0; // verify that m_value dimensions match ours + virtual void VerifyDims(ComputationNodeBasePtr node) + { + VerifyDims(node->GetSampleLayout(), node->HasMBLayout()); + } + virtual void VerifyValueDims() const = 0; // verify that m_value dimensions match ours - const TensorShape& GetSampleLayout() const - { - return m_sampleLayout; - } - bool HasSampleLayout() const - { - return m_sampleLayout.GetRank() != 1; - } // meaning does it have a layout that is not just a vector TensorShape GetTensorShape(size_t rank) const; // form the actual tensor that describes the full object - protected: +protected: size_t DetermineElementwiseTensorRank() const; // determine tensor rank when considering all inputs with padding TensorShape GetTensorSliceFor(size_t rank, const FrameRange& fr) const; // form tensor shape of the slice referenced by FrameRange public: @@ -417,24 +480,21 @@ class ComputationNodeBase : public IComputationNode, virtual double Get00Element() const = 0; // validation - // This is overridden by every node. This base class just checks for unconnected and empty inputs. - virtual void Validate(bool isFinalValidationPass) // main base validation function + // This is overridden by every node. This base class just checks for unconnected and empty inputs. Overrides must call their base version first. + virtual void Validate(bool isFinalValidationPass) // main base validation function { // check for NULL pointers for (size_t i = 0; i < m_inputs.size(); i++) { if (!m_inputs[i]) - RuntimeError("Validate: Input [%d] of %ls node '%ls' is empty (NULL, not connected).", (int) i, OperationName().c_str(), NodeName().c_str()); + RuntimeError("Validate: Input [%d] of %ls node '%ls' is empty (NULL, not connected).", (int) i, OperationName().c_str(), NodeName().c_str()); } // check for empty inputs if (isFinalValidationPass) { - for (const auto& child : m_inputs) - { - if (child->GetNumRows() == 0 || (!child->HasMBLayout() && child->GetNumCols() == 0)) - RuntimeError("%ls %ls operation: input %ls %ls has 0 elements.", - NodeName().c_str(), OperationName().c_str(), child->NodeName().c_str(), child->OperationName().c_str()); - } + for (const auto& child : m_inputs) + if (child->GetSampleMatrixNumRows() == 0) + RuntimeError("%ls %ls operation: input %ls %ls has 0 elements.", NodeName().c_str(), OperationName().c_str(), child->NodeName().c_str(), child->OperationName().c_str()); } } // helper functions for common cases @@ -593,7 +653,7 @@ class ComputationNodeBase : public IComputationNode, virtual void PrintSelf(bool printMatrices = false) const = 0; // called in validation loop right before Validate() - virtual void /*IComputationNode::*/ PrintSelfBeforeValidation() const + virtual void /*IComputationNode::*/ PrintSelfBeforeValidation() const { fprintf(stderr, "\nValidating --> %ls = %ls", NodeName().c_str(), OperationName().c_str()); @@ -612,27 +672,16 @@ class ComputationNodeBase : public IComputationNode, continue; } - const char* mbSizeMark = child->m_pMBLayout ? 
"MBSize " : ""; - if (child->m_sampleLayout.GetRank() == 3 && (child->m_sampleLayout[1] != 1 || child->m_sampleLayout[0] != 1)) // looks like an image: use WHC notation - fprintf(stderr, "%ls[%lu [%s] {W=%lu, H=%lu, C=%lu}, %s%lu]", child->NodeName().c_str(), child->GetNumRows(), string(child->m_sampleLayout).c_str(), - child->m_sampleLayout[1], child->m_sampleLayout[2], child->m_sampleLayout[0], mbSizeMark, child->GetNumCols()); - //BUGBUG: This ^^ will print based on the old legacy layout, and we have no way of knowing here whether that is correct. - else if (child->m_sampleLayout.GetRank() > 1) // tensor: output the tensor dimensions - fprintf(stderr, "%ls[%lu [%s], %s%lu]", child->NodeName().c_str(), child->GetNumRows(), string(child->m_sampleLayout).c_str(), mbSizeMark, child->GetNumCols()); + const char* mbSizeMark = child->m_pMBLayout ? " x *" : ""; + if (child->m_sampleLayout.GetRank() == 3 && (child->m_sampleLayout[1] != 1 || child->m_sampleLayout[0] != 1)) // looks like an image: use WHC notation + fprintf(stderr, "%ls[%s%s {W=%lu, H=%lu, C=%lu}]", child->NodeName().c_str(), string(child->m_sampleLayout).c_str(), mbSizeMark, + child->m_sampleLayout[1], child->m_sampleLayout[2], child->m_sampleLayout[0]); + // BUGBUG: This ^^ will print based on the old legacy layout, and we have no way of knowing here whether that is correct. else - fprintf(stderr, "%ls[%lu, %s%lu]", child->NodeName().c_str(), child->GetNumRows(), mbSizeMark, child->GetNumCols()); + fprintf(stderr, "%ls[%s%s]", child->NodeName().c_str(), string(child->m_sampleLayout).c_str(), mbSizeMark); } fprintf(stderr, ")"); } -#if 0 - else - { - if (m_pMBLayout) - fprintf(stderr, "[%lu, MBSize]", GetNumRows()); - else - fprintf(stderr, "[%lu, %lu]", GetNumRows(), GetNumCols()); - } -#endif } const std::wstring& NodeName() const @@ -735,7 +784,7 @@ class ComputationNodeBase : public IComputationNode, } public: - virtual void ValidateInferInputDims(size_t i, size_t rows, size_t cols) = 0; + virtual void ValidateInferInputDimsFrom(const TensorShape &) = 0; protected: const TensorShape& GetInputSampleLayout(const size_t index) const @@ -969,33 +1018,30 @@ template // helper to load m_value from a stream // This function updates the dimensions to a 2D matrix. // If a different tensor layout is associated with this, it must be implanted afterwards. + // Nodes that call this never have an MB layout. void LoadValue(File& fstream) { CreateMatrixIfNull(m_value); fstream >> Value(); // above reads dimensions, so we must update our own dimensions - SetDims(TensorShape(Value().GetNumRows()), Value().GetNumCols()); + SetDims(TensorShape(Value().GetNumRows(), Value().GetNumCols()), false); } - // reader updated m_functionValue--update our internal state, i.e. m_numCols - // This is meant for the case when a new minibatch was read. Hence, the only change that is allowed if for column dimension. - // TODO: Redundant with the MBLayout. Just verify here. Update comment above if this works. 
+ // reader updated m_functionValue and MBLayout--ensure our internal state is consistent virtual void NotifyFunctionValuesMBSizeModified() override final { if (!HasMBLayout()) - LogicError("NotifyFunctionValuesMBSizeModified: %ls %ls operation does not have an MBLayout.", NodeName().c_str(), OperationName().c_str()); - if (GetNumRows() != Value().GetNumRows()) - LogicError("NotifyFunctionValuesMBSizeModified: %ls %ls operation had its row dimension %d changed by the reader to %d.", NodeName().c_str(), OperationName().c_str(), (int)GetNumRows(), (int)Value().GetNumRows()); + LogicError("NotifyFunctionValuesMBSizeModified: Must only be called on nodes with MBLayout."); + if (GetSampleMatrixNumRows() != Value().GetNumRows()) + LogicError("NotifyFunctionValuesMBSizeModified: %ls %ls operation had its row dimension %d changed by the reader to %d.", NodeName().c_str(), OperationName().c_str(), (int)GetSampleMatrixNumRows(), (int)Value().GetNumRows()); if (GetMBLayout()->GetNumCols() != Value().GetNumCols()) - LogicError("NotifyFunctionValuesMBSizeModified: %ls %ls operation had its col dimension %d changed by the reader to %d, but different from MBLayout.", NodeName().c_str(), OperationName().c_str(), (int)GetNumCols(), (int)Value().GetNumCols()); - m_numCols = Value().GetNumCols(); - if (GetNumCols() != Value().GetNumCols()) - LogicError("NotifyFunctionValuesMBSizeModified: %ls %ls operation had its col dimension %d changed by the reader to %d, MBLayout was not updated.", NodeName().c_str(), OperationName().c_str(), (int)GetNumCols(), (int)Value().GetNumCols()); + LogicError("NotifyFunctionValuesMBSizeModified: %ls %ls operation had its col dimension %d changed by the reader to %d, but different from MBLayout.", NodeName().c_str(), OperationName().c_str(), (int)GetMBLayout()->GetNumCols(), (int)Value().GetNumCols()); + } + virtual double Get00Element() const override final + { + // TODO: Are all these meant to read out a scalar? Then rename and verify dimensions. + return Value().Get00Element(); } - virtual double Get00Element() const override final - { - return Value().Get00Element(); - } // recover a shared_ptr from ourselves if given a naked pointer ComputationNodePtr shared_from_this() @@ -1108,61 +1154,40 @@ template /*HasToString::*/ wstring ToString() const { // we format it like "name : type rows x cols ( args )" - wstring result = /*TidyName*/ (NodeName()) + L" : " + OperationName(); - result.append(msra::strfun::wstrprintf(L" %d x %d", (int) GetNumRows(), (int) GetNumCols())); - if (m_inputs.empty()) - result.append(L" ()"); + wstring result = /*TidyName*/ (NodeName()) + L" : " + OperationName(); + result.append(msra::strfun::wstrprintf(L" [%s%s]", string(GetSampleLayout()).c_str(), HasMBLayout() ? " x *" : "")); + if (m_inputs.empty()) + result.append(L" ()"); else { wstring args; bool first = true; - for (auto& child : m_inputs) + for (auto& child : m_inputs) { if (first) first = false; else args.append(L"\n"); - args.append(/*TidyName*/ (child->NodeName())); + args.append(/*TidyName*/ (child->NodeName())); } result += L" " + NestString(args, L'(', true, ')'); } return result; } - // update size (m_numCols) of node to match MBLayout (but does not do the actual Resize()) - // This must be called right before ForwardProp() the first time for a given minibatch. - // Currently overridden by - // - InputValue, which verifies instead of resizing (since Resize() is specified to be destructive, it should not call it). 
- // - LearnableParameters - // - GMMLogLikelihoodNode (which allocates some internal temp memory). - // Note: This only updates the dimensions but does not actually allocate anything. - // The actual allocation happens later, in BeginForwardProp(). - // TODO: How is this function different from BeginForwardProp()? --> answer: it will be called from there some day - virtual void UpdateFunctionMBSize() override - { - // TODO: just remove this - if (m_pMBLayout) // if no layout, this node contains parameters independent of MB size, don't resize - SetNumCols(m_pMBLayout->GetNumCols()); - } - virtual void VerifyDimsMatch() const override final + // update temporary variables of a node to match MBLayout + virtual void UpdateFunctionMBSize() override { } + virtual void VerifyValueDims() const override final { if (!m_value) return; - auto f_numRows = m_value->GetNumRows(); // variables for easy inspection in debugger + auto f_numRows = m_value->GetNumRows(); // variables for easy inspection in debugger auto f_numCols = m_value->GetNumCols(); - if (f_numRows != GetNumRows() || f_numCols != GetNumCols()) - LogicError("VerifyDimsMatch: m_value out of sync with GetNumRows()/GetNumCols()"); - -#ifdef SHOW_MATRIX_TYPE - fprintf(stderr, "MatrixType %ls: %ls(%ls %ls)\n", - NodeName().c_str(), - OperationName().c_str(), - Value().GetMatrixType() == MatrixType::DENSE ? L"Dense" : L"Sparse", - Value().GetCurrentMatrixLocation() == GPU ? L"GPU" : Value().GetCurrentMatrixLocation() == CPU ? L"CPU" : L"BOTH"); -#endif + if (f_numRows != GetSampleMatrixNumRows() || f_numCols != GetSampleMatrixNumCols()) + LogicError("VerifyValueDims: m_value out of sync with GetSampleMatrixNumRows()/GetSampleMatrixNumCols()"); } - void ValidateInferInputDims(size_t i, size_t rows, size_t cols) override final; + void ValidateInferInputDimsFrom(const TensorShape & otherShape); public: static void MaskMissingColumnsToZero(Matrix& matrixToBeMasked, const MBLayoutPtr& pMBLayout, const FrameRange& fr) @@ -1197,7 +1222,7 @@ template // for debugging purposes void /*ComputationNodeBase::*/ PrintSelf(bool printMatrices = false) const { - fprintf(stderr, "\n%ls[%lu, %lu] = %ls", NodeName().c_str(), GetNumRows(), GetNumCols(), OperationName().c_str()); + fprintf(stderr, "\n%ls[%s%s] = %ls", NodeName().c_str(), string(GetSampleLayout()).c_str(), HasMBLayout() ? " x *" : "", OperationName().c_str()); if (!IsLeaf()) { @@ -1206,7 +1231,7 @@ template { if (i > 0) fprintf(stderr, ", "); - fprintf(stderr, "%ls[%lu, %lu]", m_inputs[i] ? m_inputs[i]->NodeName().c_str() : L"NULL", m_inputs[i]->GetNumRows(), m_inputs[i]->GetNumCols()); + fprintf(stderr, "%ls[%s%s] = %ls", m_inputs[i] ? m_inputs[i]->NodeName().c_str() : L"NULL", string(m_inputs[i]->GetSampleLayout()).c_str(), m_inputs[i]->HasMBLayout() ? " x *" : "", OperationName().c_str()); } fprintf(stderr, ")"); } @@ -1270,20 +1295,38 @@ template { return *m_gradient; } +private: + // map a tensor to a matrix + // Tensors are stored as column vectors. This function reshapes that vector into a Matrix (ref) object. 
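For the Matrix-reference view introduced below (ValueAsMatrix()/GradientAsMatrix()), a hedged usage sketch; 'pW' is a hypothetical node holding a [512 x 128] parameter:

    // Non-minibatch storage is a column vector (column dimension 1, per this
    // patch); Reshaped() reinterprets that storage as the 2D matrix that
    // consumers such as TimesNode operate on.
    auto W = pW->ValueAsMatrix(); // 512 x 128 Matrix (ref) over the same storage
    // pW->GetAsMatrixNumRows() == 512, pW->GetAsMatrixNumCols() == 128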
+ Matrix TensorAsMatrix(Matrix & data) + { + return data.Reshaped(GetAsMatrixNumRows(), GetAsMatrixNumCols()); + } +public: + Matrix ValueAsMatrix() + { + return TensorAsMatrix(*m_value); + } + Matrix GradientAsMatrix() + { + return TensorAsMatrix(*m_gradient); + } public: +#if 0 // only used for old implementation of PlusNode // Function to return the number of columns for whole batch or single frame size_t GetNumColsFor(const FrameRange& fr /*select frame or entire batch*/) { try { - return ColumnRangeWithMBLayoutFor(GetNumCols(), fr, m_pMBLayout).second; + return ColumnRangeWithMBLayoutFor(Value().GetNumCols(), fr, m_pMBLayout).second; } catch (const logic_error& e) // catch the error and rethrow it with the node name attached { LogicError("%s, for %ls %ls operation.", e.what(), NodeName().c_str(), OperationName().c_str()); } } +#endif // function to access any input and output, value and gradient, whole batch or single frame // Note: This returns a reference into 'data' in the form of a column slice, i.e. a small matrix object that just points into 'data'. @@ -1342,26 +1385,26 @@ template // update the actual matrix allocation for m_value based on the node dimension void UpdateFunctionValuesSize() { - Value().Resize(GetNumRows(), GetNumCols()); + Value().Resize(GetSampleMatrixNumRows(), GetSampleMatrixNumCols()); } // this is called before a node's ForwardProp() function is called (in loops: for the first time) // This is where we // - update the node dimension based on actual MB size // - (re-)allocate the m_value matrix, which may be shared across nodes and thus have changed dimensions - virtual void /*IComputationNode::*/ BeginForwardProp() override // called before first iteration step of ForwardProp() + virtual void /*IComputationNode::*/ BeginForwardProp() override // called before first iteration step of ForwardProp() { Base::BeginForwardProp(); - // update m_numCols based on MB size - UpdateFunctionMBSize(); - // update the actual m_value allocation - if (!IsLeaf() && !RequiresPreCompute()) // TODO: guard this through overrides instead + if (!IsLeaf() && !RequiresPreCompute()) // TODO: guard this through overrides instead UpdateFunctionValuesSize(); + // give nodes a chance to update their internal state that may also have to match MB size + UpdateFunctionMBSize(); + // and make sure dimensions are what we expect - VerifyDimsMatch(); + VerifyValueDims(); } #ifdef _DEBUG @@ -1464,7 +1507,7 @@ template if (m_gradientInitialized) return; - Gradient().Resize(GetNumRows(), GetNumCols()); + Gradient().Resize(Value().GetNumRows(), Value().GetNumCols()); Gradient().SetValue(0); m_gradientInitialized = true; @@ -1526,7 +1569,7 @@ template matrixPool.Release(matrixPtr); } - //to be called by derived classed if that class needs to print node values + // print node values void PrintNodeValuesToFile(const bool printValues, File& fstream) const { if (printValues) @@ -1687,7 +1730,7 @@ template { NOT_IMPLEMENTED; } - virtual void VerifyDimsMatch() const override + virtual void VerifyValueDims() const override { NOT_IMPLEMENTED; } @@ -1699,7 +1742,7 @@ template { NOT_IMPLEMENTED; } - virtual void ValidateInferInputDims(size_t, size_t, size_t) override + virtual void ValidateInferInputDimsFrom(const TensorShape &) override { NOT_IMPLEMENTED; } @@ -1727,6 +1770,7 @@ template { NOT_IMPLEMENTED; } + virtual void NotifyFunctionValuesMBSizeModified(void) override { NOT_IMPLEMENTED; } virtual std::wstring ToString(void) const override { NOT_IMPLEMENTED; @@ -1792,18 +1836,18 @@ struct IRecurrentNode 
virtual int GetRecurrenceSteppingDirection() const = 0;
};

-    // =======================================================================
-    // helper macro to ease access to base members in presence of C++ two-phase name lookup
-    // =======================================================================
-
-    // Add 'typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;' at the start of each derived class
-    // (some derived classes define a similar macro; there please modify the typedef for Base accordingly.)
-    // This macro imports, one by one, every member of ComputationNode into the name space of the derived class.
-    // Without this, one would have to use the name prefix, or alternatively this->, in front of all base member,
-    // because the standard does not allow the compiler to do that for you (as MSVC still kindly does).
-    // If you add new members to ComputationNode, please also add them here.
-    // This macro expects 'Base' to be the name of the base class. Please also use 'Base' outside this macro to make it less likely to accidentally call the wrong base class members.
-    // Note: Whoever invented that C++ insanity called two-phase name lookup shall rot in hell, for the crime of causing infinite pain on unsuspecting programmers. [fseide]
+// =======================================================================
+// helper macro to ease access to base members in presence of C++ two-phase name lookup
+// =======================================================================
+
+// Add 'typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;' at the start of each derived class
+// (some derived classes define a similar macro; there please modify the typedef for Base accordingly.)
+// This macro imports, one by one, every member of ComputationNode into the name space of the derived class.
+// Without this, one would have to use the name prefix, or alternatively this->, in front of all base members,
+// because the standard does not allow the compiler to do that for you (as MSVC still kindly does).
+// If you add new members to ComputationNode, please also add them here.
+// This macro expects 'Base' to be the name of the base class. Please also use 'Base' outside this macro to make it less likely to accidentally call the wrong base class members.
+// Note: Whoever invented that C++ insanity called two-phase name lookup shall rot in hell, for the crime of causing infinite pain on unsuspecting programmers. [fseide]

#define UsingComputationNodeMembers /*without OperationName; needed to support inconsistent pattern of InputValue--TODO: this comment is out of date.
*/ \ \ protected: \ @@ -1813,9 +1857,8 @@ protected: using Base::GetDeviceId; \ using Base::SetDims; \ using Base::SetDims1; \ - using Base::SetNumCols; \ - using Base::GetNumRows; \ - using Base::GetNumCols; \ + using Base::GetSampleMatrixNumRows; \ + using Base::GetSampleMatrixNumCols; \ using Base::GetTensorShape; \ using Base::UpdateFunctionValuesSize; \ using Base::LoadValue; \ @@ -1829,7 +1872,7 @@ protected: using Base::InvalidateMissingGradientColumns; \ using Base::DataFor; \ using Base::ValueFor; \ - using Base::Gradient; \ + using Base::GradientAsMatrix; using Base::Gradient; \ using Base::GradientFor; \ using Base::MaskedValueFor; \ using Base::MaskedGradientFor; \ @@ -1881,21 +1924,21 @@ protected: using Base::RequestMatricesBeforeBackprop; \ using Base::ReleaseMatricesAfterBackprop; \ using Base::InputUsedInComputingInputNodesGradients; \ - using Base::OutputUsedInComputingInputNodesGradients; \ + using Base::OutputUsedInComputingInputNodesGradients; using Base::m_valueSharable; \ using Base::Validate; \ using Base::ValidateUnaryMap; \ using Base::ValidateBinaryZip; \ using Base::ValidateUnaryReduce; \ using Base::ValidateBinaryReduce; \ using Base::ValidateInferBinaryInputDims; \ - using Base::ValidateInferInputDims; \ + using Base::ValidateInferInputDimsFrom; \ \ public: \ using Base::RequiresPreCompute; \ using Base::AttachInputs; \ using Base::CreateGradientMatrixIfNull; \ using Base::NodeName; \ - using Base::Value; + using Base::ValueAsMatrix; using Base::Value; #define ComputationNodeBoilerplate \ \ diff --git a/Source/ComputationNetworkLib/ConvolutionalNodes.h b/Source/ComputationNetworkLib/ConvolutionalNodes.h index ce3c2eccf305..6687aa1dab4c 100644 --- a/Source/ComputationNetworkLib/ConvolutionalNodes.h +++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h @@ -92,7 +92,7 @@ class ConvolutionNode : public ComputationNode, public NumInputs<2> m_imageLayoutKind(imageLayoutKind) { SetDims(ImageDimensions::AsTensorShape(1, 1, m_outputChannels, m_imageLayoutKind), 0); // TODO: necessary? 
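+        // Note on the SetDims() call above: with this change its second argument is
+        // the HasMBLayout flag, no longer a column count. Sketch of the two cases
+        // (shapes are hypothetical):
+        //     SetDims(TensorShape(256), true);       // minibatch data: [256 x *]
+        //     SetDims(TensorShape(256, 128), false); // plain 2D tensor, e.g. a weight matrix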
- m_factory = ConvolutionEngineFactory::Create(GetDeviceId(), ConvolutionEngineFactory::EngineType::Auto, m_imageLayoutKind); + m_factory = ConvolutionEngineFactory::Create(deviceId, ConvolutionEngineFactory::EngineType::Auto, m_imageLayoutKind); } ConvolutionNode(const ScriptableObjects::IConfigRecordPtr configp) : ConvolutionNode(configp->Get(L"deviceId"), L"", configp->Get(L"kernelWidth"), configp->Get(L"kernelHeight"), configp->Get(L"outputChannels"), @@ -150,8 +150,8 @@ class ConvolutionNode : public ComputationNode, public NumInputs<2> void BackpropTo(const size_t inputIndex, const FrameRange& fr) override { - Matrix sliceOutputGrad = GradientFor(fr); - Matrix sliceInput1Value = Input(1)->ValueFor(fr); + auto sliceOutputGrad = GradientFor(fr); + auto sliceInput1Value = Input(1)->ValueFor(fr); size_t batchSize = sliceInput1Value.GetNumCols(); m_inT->setN(batchSize); @@ -159,13 +159,13 @@ class ConvolutionNode : public ComputationNode, public NumInputs<2> assert(m_convEng != nullptr); if (inputIndex == 0) // derivative with respect to the weight matrix { - Matrix& grad = Input(0)->Gradient(); + auto grad = Input(0)->GradientAsMatrix(); m_convEng->BackwardFilter(*m_outT, sliceOutputGrad, *m_inT, sliceInput1Value, *m_convDesc, *m_filterT, grad, fr.IsAllFrames(), *m_tempMatrix); } else if (inputIndex == 1) // derivative with respect to the input feature { - const Matrix& input0 = Input(0)->Value(); - Matrix sliceInput1Grad = Input(1)->GradientFor(fr); + auto input0 = Input(0)->ValueAsMatrix(); + auto sliceInput1Grad = Input(1)->GradientFor(fr); m_convEng->BackwardData(*m_outT, sliceOutputGrad, *m_filterT, input0, *m_convDesc, *m_inT, sliceInput1Grad, *m_tempMatrix); } } @@ -179,7 +179,7 @@ class ConvolutionNode : public ComputationNode, public NumInputs<2> void ForwardProp(const FrameRange& fr) override { - const Matrix& input0 = Input(0)->Value(); + const Matrix& input0 = Input(0)->ValueAsMatrix(); Matrix sliceInput1Value = Input(1)->ValueFor(fr); Matrix sliceOutputValue = ValueFor(fr); @@ -234,22 +234,14 @@ class ConvolutionNode : public ComputationNode, public NumInputs<2> size_t weightCols = m_kernelWidth * m_kernelHeight * inDims.m_numChannels; // check/infer input [0] (weights) - if (Input(0)->Value().HasNoElements()) - ValidateInferInputDims(0, m_outputChannels, weightCols); + // BUGBUG: For now, we treat the weights as a 2D matrix. They should be a tensor proper. 
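+            // ValidateInferInputDimsFrom() only fills in dimensions that the input
+            // left unspecified (e.g. a LearnableParameter declared without explicit
+            // dimensions); fully specified inputs are left untouched, and the
+            // isFinalValidationPass check below verifies the outcome. Sketch: a
+            // weight declared as [0 x 0] would here be inferred as
+            // [m_outputChannels x weightCols].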
+ Input(0)->ValidateInferInputDimsFrom(TensorShape(m_outputChannels, weightCols)); - if (isFinalValidationPass && (Input(0)->GetNumCols() != weightCols || Input(0)->GetNumRows() != m_outputChannels)) + if (isFinalValidationPass && (Input(0)->GetAsMatrixNumCols() != weightCols || Input(0)->GetAsMatrixNumRows() != m_outputChannels)) LogicError("convolutionWeight matrix %ls should have dimension [%d, %d] which is [outputChannels, kernelWidth * kernelHeight * inputChannels]", Input(0)->NodeName().c_str(), (int) m_outputChannels, (int) weightCols); - // check/infer input [1] (data) - size_t inputDim = inDims.m_width * inDims.m_height * inDims.m_numChannels; - if (Input(1)->GetNumRows() == 0) - ValidateInferInputDims(1, inputDim, Input(1)->GetNumCols()); - - if (isFinalValidationPass && Input(1)->GetNumRows() != inputDim) - LogicError("Each column of inDims to the convolution node %ls is a sample and should have dimension %d, which is inputWidth * inputHeight * inputChannels.", NodeName().c_str(), (int) inputDim); - // that's our dimension - SetDims(outDims.AsTensorShape(m_imageLayoutKind), Input(1)->GetNumCols()); + SetDims(outDims.AsTensorShape(m_imageLayoutKind), true); if (isFinalValidationPass) { @@ -366,7 +358,7 @@ class PoolingNodeBase : public ComputationNode, public NumInputs<1> m_verticalSubsample(verticalSubsample), m_imageLayoutKind(imageLayoutKind) { - m_factory = ConvolutionEngineFactory::Create(GetDeviceId(), ConvolutionEngineFactory::EngineType::Auto, m_imageLayoutKind); + m_factory = ConvolutionEngineFactory::Create(deviceId, ConvolutionEngineFactory::EngineType::Auto, m_imageLayoutKind); } PoolingNodeBase(const ScriptableObjects::IConfigRecordPtr configp) : PoolingNodeBase(configp->Get(L"deviceId"), L"", configp->Get(L"windowWidth"), configp->Get(L"windowHeight"), configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"), ImageLayoutKindFrom(configp->Get(L"imageLayout"))) @@ -461,13 +453,7 @@ class PoolingNodeBase : public ComputationNode, public NumInputs<1> m_inputSizePerSample = inDims.m_width * inDims.m_height * inDims.m_numChannels; - if (Input(0)->GetNumRows() == 0) - ValidateInferInputDims(0, m_inputSizePerSample, Input(0)->GetNumCols()); // TODO: We should infer a tensor dimension for the input instead. - - if (isFinalValidationPass && Input(0)->GetNumRows() != m_inputSizePerSample) // TODO: Can be removed once tensor shape and numRows are perfectly in sync. - LogicError("each column of input to the MaxPooling node %ls is a sample and should have dimension %d, which is inputWidth * inputHeight * inputChannels", NodeName().c_str(), (int) m_inputSizePerSample); - - SetDims(outDims.AsTensorShape(m_imageLayoutKind), Input(0)->GetNumCols()); + SetDims(outDims.AsTensorShape(m_imageLayoutKind), true); if (isFinalValidationPass) { @@ -612,6 +598,10 @@ class AveragePoolingNode : public PoolingNodeBase template class AveragePoolingNode; template class AveragePoolingNode; +// ----------------------------------------------------------------------- +// BatchNormalizationNode (...) --TODO: document inputs +// ----------------------------------------------------------------------- + // Implements batch normalization technique as described in: // Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift [S. Ioffe, C. Szegedy] // http://arxiv.org/abs/1502.03167 @@ -703,8 +693,8 @@ class BatchNormalizationNode : public ComputationNode, public NumInput if (inputIndex == 0) // derivative with respect to the input. 
{ - Matrix sliceOutputGrad = GradientFor(fr); - Matrix sliceInputValue = Input(0)->ValueFor(fr); + auto sliceOutputGrad = GradientFor(fr); + auto sliceInputValue = Input(0)->ValueFor(fr); const Matrix& scale = Input(1)->Value(); const Matrix& bias = Input(2)->Value(); @@ -712,9 +702,9 @@ class BatchNormalizationNode : public ComputationNode, public NumInput m_inT->setN(batchSize); assert(m_convEng != nullptr); - Matrix sliceInputGrad = Input(0)->GradientFor(fr); - m_dScale->Resize(scale.GetNumRows(), scale.GetNumCols()); - m_dBias->Resize(bias.GetNumRows(), bias.GetNumCols()); + auto sliceInputGrad = Input(0)->GradientFor(fr); + m_dScale->Resize(scale); + m_dBias->Resize(bias); // Compute all derivatives in one step. Save derivatives with respect to scale and bias in temp matrices. m_convEng->BackwardNormalizeBatch(*m_inT, sliceInputValue, sliceOutputGrad, sliceInputGrad, *m_scaleBiasT, scale, m_spatial, *m_saveMean, *m_saveInvStdDev, *m_dScale, *m_dBias); diff --git a/Source/ComputationNetworkLib/EsotericNodes.h b/Source/ComputationNetworkLib/EsotericNodes.h index a72f2f068b22..1ca60d180767 100644 --- a/Source/ComputationNetworkLib/EsotericNodes.h +++ b/Source/ComputationNetworkLib/EsotericNodes.h @@ -697,7 +697,7 @@ template { size_t rows = Input(index)->GetNumRows() == 0 ? Input(1 - index)->GetNumRows() : Input(index)->GetNumRows(); size_t cols = Input(index)->GetNumCols() == 0 ? Input(1 - index)->GetNumCols() : Input(index)->GetNumCols(); - ValidateInferInputDims(index, rows, cols); + ValidateInferInputDimsFrom(index, rows, cols); } size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols(); @@ -1092,15 +1092,15 @@ class DummyCriterionNode : public ComputationNodeNonLooping /*ComputationNode*/< LogicError("DummyCriterionNode criterion requires the first input to be computed derivatives."); if (isFinalValidationPass) { - if (Input(0)->GetNumRows() != 1) - LogicError("DummyCriterionNode criterion requires the first input to have dimension 1."); - if (Input(0)->GetNumRows() == 0 || Input(1)->GetNumRows() == 0 || Input(2)->GetNumRows() == 0) + if (Input(0)->GetSampleMatrixNumRows() != 1) + LogicError("DummyCriterionNode criterion requires the first input to have dimension 1."); + if (Input(0)->GetSampleMatrixNumRows() == 0 || Input(1)->GetSampleMatrixNumRows() == 0 || Input(2)->GetSampleMatrixNumRows() == 0) LogicError("DummyCriterionNode operation: one of the operands has 0 elements."); - if (Input(1)->GetNumRows() != Input(2)->GetNumRows()) + if (Input(1)->GetSampleMatrixNumRows() != Input(2)->GetSampleMatrixNumRows()) LogicError("The Matrix dimension in the DummyCriterionNode operation does not match."); } - SetDims(TensorShape(1), 1); + SetDims(TensorShape(1), false); } }; @@ -1293,10 +1293,10 @@ class SequenceDecoderNode : public ComputationNodeNonLooping /*ComputationNode*/ InferMBLayoutFromInputsForStandardCase(); if (isFinalValidationPass) - if (!(Input(1)->GetNumRows() == Input(2)->GetNumRows() && // position dependent and pair scores have same number of labels - Input(0)->GetNumRows() == Input(1)->GetNumRows() && - Input(0)->GetNumCols() == Input(1)->GetNumCols() && // position dependent and pair scores have the same observation numbers - Input(2)->GetNumCols() == Input(2)->GetNumRows())) + if (!(Input(1)->GetSampleMatrixNumRows() == Input(2)->GetSampleMatrixNumRows() && // position dependent and pair scores have same number of labels + Input(0)->GetSampleMatrixNumRows() == Input(1)->GetSampleMatrixNumRows() && + Input(0)->GetSampleMatrixNumCols() == 
Input(1)->GetSampleMatrixNumCols() && // position dependent and pair scores have the same observation numbers + Input(2)->GetSampleMatrixNumCols() == Input(2)->GetSampleMatrixNumRows())) { LogicError("The Matrix dimension in the SequenceDecoderNode operation does not match."); } @@ -1380,8 +1380,8 @@ template //BackpropToLeft1(sliceInput1Value, Input(0)->Gradient(), sliceOutputGrad); - size_t r = Input(0)->GetNumRows(); - size_t T1 = Input(0)->GetNumCols() / GetNumParallelSequences(); // TODO: if T1 == GetNumTimeSteps() then we can simplify code below. + size_t r = Input(0)->GetSampleMatrixNumRows(); + size_t T1 = Input(0)->GetSampleMatrixNumCols() / GetNumParallelSequences(); // TODO: if T1 == GetNumTimeSteps() then we can simplify code below. Matrix mTmp1(r, T1, sliceInput1Value.GetDeviceId()); // process sequence by sequence @@ -1408,8 +1408,8 @@ template // process sequence by sequence for (size_t k = 0; k < GetNumParallelSequences(); k++) { - size_t r = Input(0)->GetNumRows(); - size_t T1 = Input(0)->GetNumCols() / GetNumParallelSequences(); // TODO: if T1 == GetNumTimeSteps() then we can simplify code below. + size_t r = Input(0)->GetSampleMatrixNumRows(); + size_t T1 = Input(0)->GetSampleMatrixNumCols() / GetNumParallelSequences(); // TODO: if T1 == GetNumTimeSteps() then we can simplify code below. Matrix mTmp1(r, T1, sliceOutputGrad.GetDeviceId()); for (size_t t = 0; t < T1; t++) { @@ -1430,8 +1430,8 @@ template for (size_t k = 0; k < GetNumParallelSequences(); k++) { - size_t d = Input(1)->GetNumRows(); - size_t T1 = Input(0)->GetNumRows() / GetNumParallelSequences(); + size_t d = Input(1)->GetSampleMatrixNumRows(); + size_t T1 = Input(0)->GetSampleMatrixNumRows() / GetNumParallelSequences(); Matrix mTmp1(sliceInput1Value.GetDeviceId()); mTmp1.Resize(d, T1); Matrix mTmp2 = sliceInput1Value.ColumnSlice(k, 1); @@ -1453,8 +1453,8 @@ template for (size_t k = 0; k < GetNumParallelSequences(); k++) { - size_t d = Input(1)->GetNumRows(); - size_t T1 = Input(0)->GetNumRows() / GetNumParallelSequences(); + size_t d = Input(1)->GetSampleMatrixNumRows(); + size_t T1 = Input(0)->GetSampleMatrixNumRows() / GetNumParallelSequences(); Matrix mTmp0(sliceOutputGrad.GetDeviceId()); mTmp0.Resize(1, d); @@ -1532,14 +1532,14 @@ template virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { - size_t rows0 = Input(0)->GetNumRows(), cols1 = Input(1)->GetNumCols(); + size_t rows0 = Input(0)->GetSampleMatrixNumRows(); Matrix sliceInput1Value = Input(1)->ValueFor(fr); UpdateStride(sliceInput1Value); if (m_strideDim == 0) - SetDims(TensorShape(rows0 / GetNumParallelSequences()), cols1); + SetDims(TensorShape(rows0 / GetNumParallelSequences()), HasMBLayout()); else - SetDims(Input(0)->GetSampleLayout(), cols1); + SetDims(Input(0)->GetSampleLayout(), HasMBLayout()); Matrix sliceOutputValue = ValueFor(fr); @@ -1637,15 +1637,15 @@ template //if (Input(2)->m_needGradient) // disabled because this is a flag that belongs to Network. 
Node should simply not propagate anything into it // RuntimeError("StrideTimes: No gradient update should be on input(2)."); - size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols(); - size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols(); + size_t rows0 = Input(0)->GetSampleMatrixNumRows(), cols0 = Input(0)->GetSampleMatrixNumCols(); + size_t rows1 = Input(1)->GetSampleMatrixNumRows(); if (m_strideDim == 0) // by row { if (isFinalValidationPass && rows1 != cols0) RuntimeError("The Matrix dimension in the StrideTimes operation in dim %d does not match for cols %d in A and rows %d in B.", (int) m_strideDim, (int) cols0, (int) rows1); size_t T1 = rows0 / m_stride; - SetDims(TensorShape(T1), cols1); + SetDims(TensorShape(T1), HasMBLayout()); //after multiplication the structure is lost } @@ -1653,7 +1653,7 @@ template { if (isFinalValidationPass && cols0 != rows1 * m_stride) RuntimeError("The Matrix dimension in the StrideTimes operation in dim %d does not match for cols %d in A and row number %d in B.", (int) m_strideDim, (int) cols0, (int) rows1); - SetDims(TensorShape(rows0), cols1); + SetDims(TensorShape(rows0), HasMBLayout()); //after multiplication the structure is lost } } @@ -1680,10 +1680,10 @@ template return L"PairNetwork"; } - void Init(size_t row_size, size_t col_size) + void Init(size_t row_size, size_t /*col_size*/) { CreateMatrixIfNull(m_value); - SetDims(TensorShape(row_size), col_size); + SetDims(TensorShape(row_size), HasMBLayout()); UpdateFunctionValuesSize(); } @@ -1720,7 +1720,7 @@ template BackpropToMap(inputIndex); return; } // TODO: remove these one by one - assert(GetNumRows() == Gradient().GetNumRows()); // original used m_value->GetNumRows() for loop dimension + assert(GetSampleMatrixNumRows() == Gradient().GetNumRows()); // original used m_value->GetNumRows() for loop dimension assert(m_pMBLayout); Matrix mTmp = Input(inputIndex)->GradientFor(fr); @@ -1747,11 +1747,7 @@ template Base::Validate(isFinalValidationPass); InferMBLayoutFromInputsForStandardCase(); - size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols(); - if (rows0 > 0 && cols0 > 0) // TODO: is this check needed? - SetDims(Input(0)); - else - SetDims(Input(0)->GetSampleLayout(), 0); + SetDims(Input(0)); } }; @@ -1787,13 +1783,13 @@ template if (inputIndex > 1) InvalidArgument("Parallel operation only takes two input."); ComputationNodePtr child = Input(inputIndex); - size_t startidx = (inputIndex == 0) ? 0 : Input(0)->GetNumRows(); - size_t nrows = child->GetNumRows(); + size_t startidx = (inputIndex == 0) ? 0 : Input(0)->GetSampleMatrixNumRows(); + size_t nrows = child->GetSampleMatrixNumRows(); // TODO: why is this needed? If it is, it should be solved more centrally. 
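+        // (The check below lazily (re-)sizes the child's gradient: before we can
+        // accumulate into a slice of it, its Matrix must match the child's sample
+        // dimensions exactly and start out zeroed, e.g. -- dimensions illustrative
+        // only -- a child emitting 512-dim samples over T frames needs a zeroed
+        // 512 x T gradient.)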
- if (child->Gradient().GetNumRows() != child->GetNumRows() || child->Gradient().GetNumCols() != GetNumCols()) + if (child->Gradient().GetNumRows() != child->GetSampleMatrixNumRows() || child->Gradient().GetNumCols() != GetSampleMatrixNumCols()) { - child->Gradient().Resize(child->GetNumRows(), child->GetNumCols()); + child->Gradient().Resize(child->GetSampleMatrixNumRows(), child->GetSampleMatrixNumCols()); child->Gradient().SetValue(0); } @@ -1851,21 +1847,13 @@ template Base::Validate(isFinalValidationPass); InferMBLayoutFromInputsForStandardCase(); - size_t rows1, cols1; - rows1 = Input(1)->GetNumRows(); - cols1 = Input(1)->GetNumCols(); + size_t rows1 = Input(1)->GetSampleMatrixNumRows(); - size_t rows0, cols0; - rows0 = Input(0)->GetNumRows(); - cols0 = Input(0)->GetNumCols(); - - if (isFinalValidationPass && cols0 != cols1) - LogicError("ParallelNode: column dimension mismatched!"); + size_t rows0 = Input(0)->GetSampleMatrixNumRows(); size_t rows = rows0 + rows1; - size_t cols = cols0; - SetDims(TensorShape(rows), cols); + SetDims(TensorShape(rows), HasMBLayout()); m_sampleLayout = GetInputSampleLayout(0); // BUGBUG: Inconsistent with 'rows' } @@ -2072,14 +2060,14 @@ class LSTMNode : public ComputationNodeNonLooping /*ComputationNode*/, if (inputIndex > 4) InvalidArgument("LSTM operation only takes five inputs."); - size_t nT = Input(0)->GetNumCols(); - size_t inputDim = Input(0)->GetNumRows(); - size_t outputDim = Input(1)->GetNumRows(); + size_t nT = Input(0)->GetSampleMatrixNumCols(); + size_t inputDim = Input(0)->GetSampleMatrixNumRows(); + size_t outputDim = Input(1)->GetSampleMatrixNumRows(); if (m_GradientComputed == false) { - if (GetNumCols() != Gradient().GetNumCols() || - GetNumRows() != Gradient().GetNumRows()) + if (GetSampleMatrixNumCols() != Gradient().GetNumCols() || + GetSampleMatrixNumRows() != Gradient().GetNumRows()) { RuntimeError("LSTMNode::GradientValue size doesn't match to the function value size"); } @@ -2087,13 +2075,13 @@ class LSTMNode : public ComputationNodeNonLooping /*ComputationNode*/, // reset gradients grdToObs.Resize(inputDim, nT); grdToObs.SetValue(0); - grdToInputGate.Resize(Input(1)->GetNumRows(), Input(1)->GetNumCols()); + grdToInputGate.Resize(Input(1)->GetSampleMatrixNumRows(), Input(1)->GetSampleMatrixNumCols()); grdToInputGate.SetValue(0); - grdToForgetGate.Resize(Input(2)->GetNumRows(), Input(2)->GetNumCols()); + grdToForgetGate.Resize(Input(2)->GetSampleMatrixNumRows(), Input(2)->GetSampleMatrixNumCols()); grdToForgetGate.SetValue(0); - grdToOutputGate.Resize(Input(3)->GetNumRows(), Input(3)->GetNumCols()); + grdToOutputGate.Resize(Input(3)->GetSampleMatrixNumRows(), Input(3)->GetSampleMatrixNumCols()); grdToOutputGate.SetValue(0); - grdToCellWgt.Resize(Input(4)->GetNumRows(), Input(4)->GetNumCols()); + grdToCellWgt.Resize(Input(4)->GetSampleMatrixNumRows(), Input(4)->GetSampleMatrixNumCols()); grdToCellWgt.SetValue(0); Matrix slicePrevOutput(m_deviceId), slicePrevState(m_deviceId); @@ -2422,7 +2410,7 @@ class LSTMNode : public ComputationNodeNonLooping /*ComputationNode*/, LogicError("GetSegInfo: stream id %d is larger than the number of streams %d", (int) streamid, (int) GetNumParallelSequences()); Matrix thisCol; // BUGBUG: These flags no longer exist. This code is no longer functional. 
- //size_t nT = Input(0)->GetNumCols(); + //size_t nT = Input(0)->GetSampleMatrixNumCols(); //if (t >= nT) // LogicError("GetSegInfo: time %d times is larger than the total number of observations %d", (int)t, (int)nT); //int utt_t = (int)t / GetNumParallelSequences(); @@ -2436,8 +2424,8 @@ class LSTMNode : public ComputationNodeNonLooping /*ComputationNode*/, */ void SaveLastStateActity() { - size_t nT = Input(0)->GetNumCols(); - size_t outputDim = Input(1)->GetNumRows(); + size_t nT = Input(0)->GetSampleMatrixNumCols(); + size_t outputDim = Input(1)->GetSampleMatrixNumRows(); // save the hidden activities and output for the next minibatch mLastOutput.Resize(outputDim, GetNumParallelSequences()); @@ -2459,8 +2447,8 @@ class LSTMNode : public ComputationNodeNonLooping /*ComputationNode*/, virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override { - size_t nT = Input(0)->GetNumCols(); - size_t outputDim = Input(1)->GetNumRows(); + size_t nT = Input(0)->GetSampleMatrixNumCols(); + size_t outputDim = Input(1)->GetSampleMatrixNumRows(); { SetDims1(outputDim, nT); @@ -2799,44 +2787,44 @@ class LSTMNode : public ComputationNodeNonLooping /*ComputationNode*/, LogicError("LSTM validation: need to have learnable parameters "); #endif - //if (Input(0)->GetNumRows() == 0) + //if (Input(0)->GetSampleMatrixNumRows() == 0) // LogicError("LSTM validation: input size is zero!"); - //if (Input(1)->GetNumRows() == 0 || - // Input(2)->GetNumRows() == 0 || - // Input(3)->GetNumRows() == 0 || - // Input(4)->GetNumRows() == 0) + //if (Input(1)->GetSampleMatrixNumRows() == 0 || + // Input(2)->GetSampleMatrixNumRows() == 0 || + // Input(3)->GetSampleMatrixNumRows() == 0 || + // Input(4)->GetSampleMatrixNumRows() == 0) // LogicError("LSTM validation : parameter size is zero!"); - size_t nindim = Input(0)->GetNumRows(); - size_t noutdim = Input(1)->GetNumRows(); - size_t nT = Input(0)->GetNumCols(); + size_t nindim = Input(0)->GetSampleMatrixNumRows(); + size_t noutdim = Input(1)->GetSampleMatrixNumRows(); + //size_t nT = Input(0)->GetSampleMatrixNumCols(); size_t nCol = nindim + noutdim + 2; if (isFinalValidationPass) { - if (Input(1)->GetNumCols() != nCol) + if (Input(1)->GetSampleMatrixNumCols() != nCol) { LogicError("LSTM validation : dimension mismatched between child and inputGate"); } - if (Input(2)->GetNumCols() != nCol) + if (Input(2)->GetSampleMatrixNumCols() != nCol) { LogicError("LSTM validation : dimension mismatched between child and forgetGate"); } - if (Input(3)->GetNumCols() != nCol) + if (Input(3)->GetSampleMatrixNumCols() != nCol) { LogicError("LSTM validation : dimension mismatched between child and outputGate"); } - if (noutdim != Input(2)->GetNumRows() || - noutdim != Input(3)->GetNumRows() || - noutdim != Input(4)->GetNumRows()) + if (noutdim != Input(2)->GetSampleMatrixNumRows() || + noutdim != Input(3)->GetSampleMatrixNumRows() || + noutdim != Input(4)->GetSampleMatrixNumRows()) { LogicError("LSTM validation: output dimension mismatched!"); } } - SetDims(TensorShape(noutdim), nT); - Value().SetValue(NAN); // set to this extrem value so, if anything wrong in later procedure, problems can be easily spotted. + SetDims(TensorShape(noutdim), true); + Value().SetValue(NAN); // set to this extrem value so, if anything wrong in later procedure, problems can be easily spotted. 
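+        // (NaN works as a sentinel because it propagates through arithmetic: any
+        // frame that is read before being written surfaces as NaN in the criterion
+        // instead of as a silently wrong number.)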
} bool UnitTest() @@ -2901,7 +2889,7 @@ class LSTMNode : public ComputationNodeNonLooping /*ComputationNode*/, Gradient().SetValue(1.0); for (size_t i = 0; i < 5; i++) { - Input(i)->Gradient().Resize(Input(i)->GetNumRows(), Input(i)->GetNumCols()); + Input(i)->Gradient().Resize(Input(i)->GetSampleMatrixNumRows(), Input(i)->GetSampleMatrixNumCols()); Input(i)->Gradient().SetValue(0); } for (size_t i = 0; i < 5; i++) diff --git a/Source/ComputationNetworkLib/EvaluationCriterionNodes.h b/Source/ComputationNetworkLib/EvaluationCriterionNodes.h index 1f4755524ec1..6f91e125293c 100644 --- a/Source/ComputationNetworkLib/EvaluationCriterionNodes.h +++ b/Source/ComputationNetworkLib/EvaluationCriterionNodes.h @@ -75,8 +75,8 @@ class ErrorPredictionNode : public ComputationNodeNonLooping /*ComputationNode*/ // TODO: Make topK a constructor parameter if (m_inputs.size() == 3) { - if (Input(2)->GetNumRows() != 1 || Input(2)->GetNumCols() != 1) - throw std::logic_error("TopK in ErrorPredictionNode must be a scalar value."); + if (Input(2)->GetSampleLayout().GetNumElements() != 1) + InvalidArgument("%ls %ls operation requires TopK to be a scalar value.", NodeName().c_str(), OperationName().c_str()); m_topK = static_cast(Input(2)->Get00Element()); } } @@ -86,7 +86,7 @@ class ErrorPredictionNode : public ComputationNodeNonLooping /*ComputationNode*/ Base::UpdateFunctionMBSize(); // resize the temporaries to their proper size - size_t cols = Input(0)->GetNumCols(); + size_t cols = Input(0)->Value().GetNumCols(); m_maxIndexes0->Resize(m_topK, cols); m_maxIndexes1->Resize(m_topK, cols); m_maxValues->Resize(m_topK, cols); @@ -130,4 +130,5 @@ class ErrorPredictionNode : public ComputationNodeNonLooping /*ComputationNode*/ template class ErrorPredictionNode; template class ErrorPredictionNode; + } } } diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index b89b142d7664..8643769011dc 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -46,24 +46,14 @@ class LearnableParameter : public ComputationNode, public NumInputs<0> { m_parameterUpdateRequired = true; this->m_valueSharable = false; - SetDims(TensorShape(), 0); } LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& shape) : Base(deviceId, name) { m_parameterUpdateRequired = true; CreateMatrixIfNull(m_value); - this->m_valueSharable = false; - // for now we split off the trailing dimension into the matrix column dimension - // TODO: This is for compat, but is is inconsistent. Decide what a sample layout means for a node without MBLayout w.r.t. non-tensor ops. 
- auto dims = shape.GetDims(); - size_t cols = 1; - if (dims.size() > 1) - { - cols = dims.back(); - dims.resize(dims.size() - 1); - } - SetDims(TensorShape(dims), cols); + m_valueSharable = false; + SetDims(shape, false); UpdateFunctionValuesSize(); // this allocates the matrix Value().SetValue(0); } @@ -104,7 +94,7 @@ class LearnableParameter : public ComputationNode, public NumInputs<0> { Base::Save(fstream); fstream << m_parameterUpdateRequired; - fstream << (size_t) 0 /*#rows in a legacy file format*/ << GetNumCols(); + fstream << (size_t) 0 /*#rows in a legacy file format*/ << (size_t) 0 /*#cols in a legacy file format*/; m_sampleLayout.Save(fstream); fstream << Value(); } @@ -119,11 +109,15 @@ class LearnableParameter : public ComputationNode, public NumInputs<0> TensorShape sampleLayout; if (rows != 0) // legacy file format - sampleLayout = TensorShape(rows); + sampleLayout = TensorShape(rows, cols); else + { sampleLayout.Load(fstream, /*acceptLegacyFormat=*/true); + if (cols > 1) // in some legacy format, last tensor dimension was split off as an explicit column dimension + sampleLayout.AppendInPlace(sampleLayout.GetRank(), cols); + } LoadValue(fstream); - SetDims(sampleLayout, cols); // note: call this after LoadValue() since LoadValue() overwrites m_sampleLayout + SetDims(sampleLayout, false); // note: call this after LoadValue() since LoadValue() overwrites m_sampleLayout } // initialize with random numbers @@ -132,7 +126,6 @@ class LearnableParameter : public ComputationNode, public NumInputs<0> const ElemType initValueScale, bool initOnCPUOnly) // if true then always init on CPU, making initialization consistent across both (for testing) { - size_t inputSize = GetNumCols(); //fprintf(stderr, "%d x %d: %d %ls\n", (int)GetNumRows(), (int)GetNumCols(), (int)randomSeed, NodeName().c_str()); // the random seed offset is set via the "randomSeedOffset" parameter in config @@ -140,12 +133,13 @@ class LearnableParameter : public ComputationNode, public NumInputs<0> m_value->TransferToDeviceIfNotThereAndNotAutoPlace(CPUDEVICE, true); if (uniformInit) { - // TODO: move these crazy extra factors out from here and into NDL, and make them visible in BS + // TODO: move these hidden extra factors out from here and into NDL, and make them visible in BS ElemType randRange = 0.05f * initValueScale; Value().SetUniformRandomValue(-randRange, randRange, randomSeed); } else { + size_t inputSize = GetAsMatrixNumCols(); ElemType randInitstd = 0.2f * initValueScale / sqrt(ElemType(inputSize)); Value().SetGaussianRandomValue(0, randInitstd, randomSeed); } @@ -162,6 +156,7 @@ class LearnableParameter : public ComputationNode, public NumInputs<0> Value().SetValue(numRows, numCols, m_deviceId, array.data(), matrixFlagNormal); } + // TODO: share code with InitFromFile() void ReviseFromFile(const std::wstring& reviseFromFilePath) { size_t numRows = 0; @@ -202,7 +197,7 @@ class LearnableParameter : public ComputationNode, public NumInputs<0> Base::DumpNodeInfo(printValues, fstream); char str[4096]; - sprintf(str, "[%lu,%lu] ", GetNumRows(), GetNumCols()); + sprintf(str, "[%lu,%lu] ", GetAsMatrixNumRows(), GetAsMatrixNumCols()); fstream << string(str); sprintf(str, "NeedGradient=%s", m_parameterUpdateRequired ? 
"true" : "false"); // TODO: update NDL to accept a better matching name as well fstream << string(str); @@ -211,46 +206,6 @@ class LearnableParameter : public ComputationNode, public NumInputs<0> } }; -#if 0 - // ----------------------------------------------------------------------- - // SparseLearnableParameter (/*no input*/) - // ----------------------------------------------------------------------- - - // WARNING: Don't use SparseLearnableParameter yet since the current version assumes the parameter is dense instead of sparse - // WARNING: After the right implementation is put here we need to turn it on in NetworkDescriptionLangauge.cpp - template - class SparseLearnableParameter : public LearnableParameter - { - typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"SparseLearnableParameter"; } - public: - DeclareConstructorFromConfigWithNumInputs(SparseLearnableParameter); - SparseLearnableParameter(DEVICEID_TYPE deviceId, const wstring & name) : - LearnableParameter(deviceId, name) - { - CreateMatrixIfNull(m_gradient); - m_gradient->SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseBlockCol, false); - } - SparseLearnableParameter(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols, size_t size) : - LearnableParameter(deviceId, name, rows, cols) - { - CreateMatrixIfNull(m_gradient); - m_gradient->SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseBlockCol, false); - m_gradient->Resize(rows, cols, size); - } - - virtual void Load(File& fstream, size_t modelVersion) override - { - LearnableParameter::Load(fstream, modelVersion); - CreateMatrixIfNull(m_gradient); - m_gradient->Resize(GetNumRows(), GetNumCols()); - } - }; - - template class SparseLearnableParameter; - template class SparseLearnableParameter; -#endif - // ----------------------------------------------------------------------- // InputValueBase (/*no input*/) // Base class for InputValue and SparseInputValue (typically fed by a DataReader) @@ -306,9 +261,9 @@ class InputValueBase : public ComputationNode, public NumInputs<0> virtual void Save(File& fstream) const override { Base::Save(fstream); - size_t rows = GetNumRows(); // using explicitly typed variables to be 100% symmetrical to Load() - size_t colsDummy = 0; // This should not be saved. InputValues always are minibatches. - fstream << rows << colsDummy; + size_t rowsDummy = 0; // compat with old file format + size_t colsDummy = 0; + fstream << rowsDummy << colsDummy; m_sampleLayout.Save(fstream); } @@ -321,7 +276,7 @@ class InputValueBase : public ComputationNode, public NumInputs<0> TensorShape sampleLayout; sampleLayout.Load(fstream, /*acceptLegacyFormat=*/true); // some older files may have inconsistent tensor information - if (rows != sampleLayout.GetNumElements()) + if (rows != 0/*old file*/ && rows != sampleLayout.GetNumElements()/*even older file*/) { fprintf(stderr, "WARNING: %ls InputValue has inconsistent serialized sample layout %s vs. number of rows %d. Resetting sample layout to vector.\n", NodeName().c_str(), string(sampleLayout).c_str(), (int) rows); @@ -333,8 +288,10 @@ class InputValueBase : public ComputationNode, public NumInputs<0> // InputValue must not resize its inputs because that might destroy it. It should already have the correct size. 
virtual void UpdateFunctionMBSize() override { - if (!m_pMBLayout) // if no layout, this node contains parameters independent of MB size, don't resize - VerifyDims(GetNumRows(), m_pMBLayout->GetNumCols()); + // don't touch our values + // But take the opportunity for an additional check. Why not. + if (Value().GetNumRows() != GetSampleLayout().GetNumElements()) + LogicError("UpdateFunctionMBSize: m_value not matching m_sampleLayout"); } virtual void /*ComputationNode::*/ ForwardProp(const FrameRange&) override @@ -342,15 +299,13 @@ class InputValueBase : public ComputationNode, public NumInputs<0> } virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) { + LogicError("InputValueBase::BackpropTo() should never be called."); } virtual void DumpNodeInfo(const bool printValues, File& fstream) const override { Base::DumpNodeInfo(printValues, fstream); - - char str[4096]; - sprintf(str, "[%lu,%lu]", GetNumRows(), GetNumCols()); - fstream << string(str); // TODO: string(.) necessary? + fstream << "[" << string(GetSampleLayout()) << "]"; } private: @@ -468,18 +423,18 @@ class LookupTableNode : public ComputationNode, public NumInputs<2> Matrix sliceInput1Value = Input(1)->MaskedValueFor(t); Matrix sliceOutputGrad = MaskedGradientFor(t); - BackpropToLeft(sliceInput1Value, Input(0)->Gradient(), sliceOutputGrad); + BackpropToLeft(sliceInput1Value, Input(0)->GradientAsMatrix(), sliceOutputGrad); } else if (inputIndex == 1) // right derivative (input) { Matrix sliceInput1Grad = Input(1)->GradientFor(t); Matrix sliceOutputGrad = GradientFor(t); - BackpropToRight(Input(0)->Value(), sliceInput1Grad, sliceOutputGrad); + BackpropToRight(Input(0)->ValueAsMatrix(), sliceInput1Grad, sliceOutputGrad); } } - /*TODO: merge with call site*/ void BackpropToLeft(Matrix& inputFunctionValues, Matrix& inputGradientValues, Matrix& gradientValues) + /*TODO: merge with call site*/ void BackpropToLeft(Matrix& inputFunctionValues, Matrix/*&*/ inputGradientValues, Matrix& gradientValues) { size_t rows1 = inputFunctionValues.GetNumRows(), cols1 = inputFunctionValues.GetNumCols(); size_t rowsp = gradientValues.GetNumRows(), colsp = gradientValues.GetNumCols(); @@ -494,7 +449,7 @@ class LookupTableNode : public ComputationNode, public NumInputs<2> gradientValues.Reshape(rowsp, colsp); } - /*TODO: merge with call site*/ void BackpropToRight(Matrix& inputFunctionValues, Matrix& inputGradientValues, Matrix& gradientValues) + /*TODO: merge with call site*/ void BackpropToRight(Matrix/*&*/ inputFunctionValues, Matrix& inputGradientValues, Matrix& gradientValues) { size_t rows1 = inputGradientValues.GetNumRows(), cols1 = inputGradientValues.GetNumCols(); size_t rowsp = gradientValues.GetNumRows(), colsp = gradientValues.GetNumCols(); @@ -513,17 +468,17 @@ class LookupTableNode : public ComputationNode, public NumInputs<2> { // input0 is the weight (each column is an embedding of one word), input 1 contains m_bnrLooked words in each column (sample) Matrix functionValues = ValueFor(t); - const Matrix& input0 = Input(0)->Value(); + const Matrix& input0 = Input(0)->ValueAsMatrix(); Matrix input1 = Input(1)->ValueFor(t); size_t rows1 = input1.GetNumRows(), cols1 = input1.GetNumCols(); size_t cols0 = input0.GetNumCols(); - if (rows1 % cols0 != 0) - LogicError("LookupTableNode: rows of input 1 and cols of input 0 are not modular. e.g., rows1 = 0.9 cols and this is not allowed. Check feature reader and network definition. 
This usually happens when the feature dimension is not specified as that in the network definition of look-up-table dimension size."); - int wordsInEachSample = rows1 / cols0; + if (cols0 * wordsInEachSample != rows1) + LogicError("LookupTableNode: rows of input 1 is not a multiple of cols of input 0. This usually happens when the feature dimension is not specified as that in the network definition of look-up-table dimension size."); + auto input1Reshaped = input1.Reshaped(rows1 / wordsInEachSample, cols1 * wordsInEachSample); auto functionValuesReshaped = functionValues.Reshaped(input0.GetNumRows(), input1Reshaped.GetNumCols()); @@ -535,13 +490,15 @@ class LookupTableNode : public ComputationNode, public NumInputs<2> Base::Validate(isFinalValidationPass); InferMBLayoutFromInputsForStandardCase(); - if (isFinalValidationPass && Input(1)->GetNumRows() % Input(0)->GetNumCols() != 0) + if (isFinalValidationPass && !HasMBLayout()) + InvalidArgument("%ls %ls operation can only operate on minibatches.", NodeName().c_str(), OperationName().c_str()); + if (isFinalValidationPass && Input(1)->GetAsMatrixNumRows() % Input(0)->GetAsMatrixNumCols() != 0) InvalidArgument("Mismatched dimension. Rows in input1 must be multiples of cols in input0."); - int wordsInEachSample = Input(1)->GetNumRows() / Input(0)->GetNumCols(); + int wordsInEachSample = Input(1)->GetAsMatrixNumRows() / Input(0)->GetAsMatrixNumCols(); // TODO: Should this add a tensor dimension? - SetDims(TensorShape(Input(0)->GetNumRows() * wordsInEachSample), Input(1)->GetNumCols()); + SetDims(TensorShape(Input(0)->GetSampleMatrixNumRows() * wordsInEachSample), true); } bool UnitTest() @@ -582,7 +539,7 @@ class LookupTableNode : public ComputationNode, public NumInputs<2> Gradient().SetValue(1.0); for (size_t i = 0; i < 2; i++) { - Input(i)->Gradient().Resize(Input(i)->GetNumRows(), Input(i)->GetNumCols()); + Input(i)->Gradient().Resize(Input(i)->Value().GetNumRows(), Input(i)->Value().GetNumCols()); Input(i)->Gradient().SetValue(0); } for (size_t i = 0; i < 2; i++) diff --git a/Source/ComputationNetworkLib/LinearAlgebraNodes.h b/Source/ComputationNetworkLib/LinearAlgebraNodes.h index db993afc8b26..5e2be69dd4fb 100644 --- a/Source/ComputationNetworkLib/LinearAlgebraNodes.h +++ b/Source/ComputationNetworkLib/LinearAlgebraNodes.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -56,7 +57,7 @@ class PlusNode : public BinaryElementWiseNode auto inputGradient = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast()); // if reduction then mask the respective input(s) (zero out the gaps) - if (Input(inputIndex)->GetNumCols() < GetNumCols()) + if (Input(inputIndex)->ReducesInTimeWrt(shared_from_this())) MaskMissingGradientColumnsToZero(fr); inputGradient.AddCopyOf(gradient); @@ -105,7 +106,7 @@ class MinusNode : public BinaryElementWiseNode auto inputGradient = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast()); // if reduction then mask the respective input(s) (zero out the gaps) - if (Input(inputIndex)->GetNumCols() < GetNumCols()) + if (Input(inputIndex)->ReducesInTimeWrt(shared_from_this())) MaskMissingGradientColumnsToZero(fr); inputGradient.AddCopyOf(gradient, sign); @@ -184,47 +185,48 @@ template class NegateNode; template class NegateNode; // ----------------------------------------------------------------------- -// TimesNode (A, B) +// TimesNodeBase (A, B) +// shared code of TimesNode and TransposeTimesNode (which transposes A) // right operand and output can have MB layout, while 
left operand cannot // ----------------------------------------------------------------------- -template -class TimesNode : public ComputationNode, public NumInputs<2> +template +class TimesNodeBase : public ComputationNode, public NumInputs<2> { typedef ComputationNode Base; - UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() - { - return L"Times"; - } + UsingComputationNodeMembers; public: - DeclareConstructorFromConfigWithNumInputs(TimesNode); - TimesNode(DEVICEID_TYPE deviceId, const wstring& name) + TimesNodeBase(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name) { } virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override { - if (inputIndex == 0) // left derivative + if (inputIndex == 0) // left derivative { // this potentially computes inner products over time, so we use the Masked- variants - Matrix sliceOutputGrad = MaskedGradientFor(fr); - Matrix sliceInput1Value = Input(1)->MaskedValueFor(fr); + auto sliceOutputGrad = MaskedGradientFor(fr); + auto sliceInput1Value = Input(1)->MaskedValueFor(fr); + auto input0Grad = Input(0)->GradientAsMatrix(); // currently we only support one combination when the input is sparse. if (sliceInput1Value.GetMatrixType() == SPARSE && Input(0)->Gradient().GetMatrixType() == DENSE && sliceOutputGrad.GetMatrixType() == DENSE) Input(0)->Gradient().SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol, false); - Matrix::MultiplyAndAdd(sliceOutputGrad, false, sliceInput1Value, true, Input(0)->Gradient()); + bool transpose = m_transpose; // (assigning to a non-const variable avoids a compiler warning C4127: conditional expression is constant) + if (!transpose) + Matrix::MultiplyAndAdd(sliceOutputGrad, false, sliceInput1Value, true, input0Grad); + else + Matrix::MultiplyAndAdd(sliceInput1Value, false, sliceOutputGrad, true, input0Grad); } - else // right derivative + else // right derivative { - Matrix sliceInput1Grad = Input(1)->GradientFor(fr); - Matrix sliceOutputGrad = GradientFor(fr); + auto sliceInput1Grad = Input(1)->GradientFor(fr); + auto sliceOutputGrad = GradientFor(fr); - Matrix::MultiplyAndAdd(Input(0)->Value(), true, sliceOutputGrad, false, sliceInput1Grad); + Matrix::MultiplyAndAdd(Input(0)->ValueAsMatrix(), !m_transpose, sliceOutputGrad, false, sliceInput1Grad); } } @@ -237,16 +239,14 @@ class TimesNode : public ComputationNode, public NumInputs<2> virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { - size_t rows0 = Input(0)->GetNumRows(), cols1 = Input(1)->GetNumCols(); - VerifyDims(rows0, cols1); - // right operand and output can have MB layout, while left operand cannot - Matrix sliceInput1Value = Input(1)->ValueFor(fr); - Matrix sliceOutputValue = ValueFor(fr); + auto sliceInput1Value = Input(1)->ValueFor(fr); + auto sliceOutputValue = ValueFor(fr); #if DUMPOUTPUT - Input(0)->Value().Print("TimesNode - Input0"); + Input(0)->ValueAsMatrix().Print("TimesNode - Input0"); #endif - sliceOutputValue.AssignProductOf(Input(0)->Value(), false, sliceInput1Value, false); + // BUGBUG: This uses incorrect Matrix dimensions when multiplying with a non-minibatch. To be fixed when we allow to apply TimesNode to a subset of tensor dimensions. 
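+        // Dimension sketch for the product below (sizes hypothetical): with
+        // A = Input(0) read as a 2D matrix of [m x k] (A' is used instead when
+        // m_transpose is set, as in TransposeTimes) and the value slice
+        // B = [k x cols(fr)], the output slice computed here is [m x cols(fr)].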
+ sliceOutputValue.AssignProductOf(Input(0)->ValueAsMatrix(), m_transpose, sliceInput1Value, false); #if NANCHECK sliceOutputValue.HasNan("Times"); #endif @@ -259,39 +259,47 @@ class TimesNode : public ComputationNode, public NumInputs<2> { Base::Validate(isFinalValidationPass); if (isFinalValidationPass && Input(0)->HasMBLayout()) - InvalidArgument("%ls %ls operation requires the first factor to not be minibatch data (must not have an MBLayout).", NodeName().c_str(), OperationName().c_str()); + InvalidArgument("%ls Times operation requires the first factor to not be minibatch data (must not have an MBLayout).", NodeName().c_str()); InferMBLayoutFromInputsForStandardCase(); - //support automatic dimension inference for learnable parameters - size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols(); - size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols(); - - if (isFinalValidationPass && (rows0 == 0 || (cols1 == 0 && !Input(1)->GetMBLayout()))) - RuntimeError("Times operation: Input(0)->GetNumRows() and Input(1)->GetNumCols() should not be 0 since it cannot be automatically inferred"); + // support automatic dimension inference for learnable parameters + size_t rows0 = Input(0)->GetAsMatrixNumRows(), cols0 = Input(0)->GetAsMatrixNumCols(); + bool transpose = m_transpose; // (assigning to a non-const variable avoids a compiler warning C4127: conditional expression is constant) + if (transpose) + std::swap(rows0, cols0); + size_t rows1 = Input(1)->HasMBLayout() ? Input(1)->GetSampleMatrixNumRows() : Input(1)->GetAsMatrixNumRows(); // limited automatic dimension inference for *children*, useful for CNN since it can be hard to know the size of each input parameter without deep knowledge how CNN is implemented (padding, stride) - // TODO: ^^ There must be a better solution. Maybe MBLayout as well? - // TODO: use dynamic_pointer_cast // infer cols0 as rows1 - if (cols0 == 0 && !Input(0)->GetMBLayout() && rows1 != 0 && isFinalValidationPass) - ValidateInferInputDims(0, rows0, rows1); - - // infer rows1 as cols0 - if (cols0 != 0 && rows1 == 0) - ValidateInferInputDims(1, cols0, cols1); - - if (isFinalValidationPass && Input(1)->GetNumRows() != Input(0)->GetNumCols()) - LogicError("The inner matrix dimension in the %ls %ls operation does not match (%d vs. %d).", NodeName().c_str(), OperationName().c_str(), (int) Input(1)->GetNumRows(), (int) Input(0)->GetNumCols()); + Input(0)->ValidateInferInputDimsFrom(m_transpose ? TensorShape(rows1, rows0) : TensorShape(rows0, rows1)); // TODO: With tensors, inner dimensions must match. - // after multiplication the structure is lost - SetDims(TensorShape(rows0), cols1); + // after multiplication the tensor structure is lost + if (Input(1)->HasMBLayout()) + { + // infer rows1 as cols0 + Input(1)->ValidateInferInputDimsFrom(TensorShape(cols0)); + SetDims(TensorShape(rows0), true); + } + else // multiplying two straight matrices + { + size_t cols1 = Input(1)->GetAsMatrixNumCols(); + // infer rows1 as cols0 + Input(1)->ValidateInferInputDimsFrom(TensorShape(cols0, cols1)); + SetDims(TensorShape(rows0, cols1), false); + } + + // update after inference + cols0 = m_transpose ? Input(0)->GetAsMatrixNumRows() : Input(0)->GetAsMatrixNumCols(); + rows1 = Input(1)->HasMBLayout() ? Input(1)->GetSampleMatrixNumRows() : Input(1)->GetAsMatrixNumRows(); + if (isFinalValidationPass && cols0 != rows1) + InvalidArgument("The inner matrix dimension in the %ls Times operation does not match (%d vs. 
%d).", NodeName().c_str(), (int)rows1, (int)cols0); } virtual void AllocateGradientMatricesForInputs(MatrixPool& matrixPool) override { // this is a special handling case. We need to allocate sparse matrix directly instead of from pool. - if (m_inputs[0]->NeedGradient() && Input(1)->Value().GetMatrixType() == SPARSE) + if (Input(0)->NeedGradient() && Input(1)->Value().GetMatrixType() == SPARSE) { Input(0)->CreateGradientMatrixIfNull(); Input(0)->Gradient().SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol, false); @@ -303,123 +311,51 @@ class TimesNode : public ComputationNode, public NumInputs<2> } }; -template class TimesNode; -template class TimesNode; - // ----------------------------------------------------------------------- -// TransposeTimesNode (A', B) +// TimesNode (A, B) // right operand and output can have MB layout, while left operand cannot -// TODO: merge with TimesNode? // ----------------------------------------------------------------------- template -class TransposeTimesNode : public ComputationNode, public NumInputs<2> +class TimesNode : public TimesNodeBase { - typedef ComputationNode Base; + typedef TimesNodeBase Base; UsingComputationNodeMembersBoilerplate; static const std::wstring TypeName() { - return L"TransposeTimes"; + return L"Times"; } - public: - DeclareConstructorFromConfigWithNumInputs(TransposeTimesNode); - TransposeTimesNode(DEVICEID_TYPE deviceId, const wstring& name) + DeclareConstructorFromConfigWithNumInputs(TimesNode); + TimesNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name) { } +}; - virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override - { - if (inputIndex == 0) //left derivative - { - // this potentially computes inner products over time, so we use the Masked- variants - Matrix sliceOutputGrad = MaskedGradientFor(fr); - Matrix sliceInput1Value = Input(1)->MaskedValueFor(fr); - - BackpropToLeft(sliceInput1Value, Input(0)->Gradient(), sliceOutputGrad); - } - else //right derivative - { - Matrix sliceInput1Grad = Input(1)->GradientFor(fr); - Matrix sliceOutputGrad = GradientFor(fr); - - BackpropToRight(Input(0)->Value(), sliceInput1Grad, sliceOutputGrad); - } - } - - virtual bool OutputUsedInComputingInputNodesGradients() const override - { - // The TransposeTimesNode does not require its output value for computing - // the gradients of its input nodes - return false; - } - - /*TODO: merge with call site*/ void BackpropToLeft(Matrix& inputFunctionValues, Matrix& inputGradientValues, const Matrix& gradientValues) - { -#if DUMPOUTPUT - gradientValues.Print("Gradient-in"); - inputGradientValues.Print("child Gradient-in/out"); - inputFunctionValues.Print("child Function values"); -#endif - //currently we only support one combination when the input is sparse. 
- if (inputFunctionValues.GetMatrixType() == SPARSE && inputGradientValues.GetMatrixType() == DENSE && gradientValues.GetMatrixType() == DENSE) - inputGradientValues.SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol, false); - - Matrix::MultiplyAndAdd(inputFunctionValues, false, gradientValues, true, inputGradientValues); - -#if DUMPOUTPUT - inputGradientValues.Print("child Gradient-out"); -#endif - } - - /*TODO: merge with call site*/ void BackpropToRight(Matrix& inputFunctionValues, Matrix& inputGradientValues, const Matrix& gradientValues) - { -#if DUMPOUTPUT - gradientValues.Print("Gradient-in"); - inputGradientValues.Print("child Gradient-in/out"); - inputFunctionValues.Print("child Function values"); -#endif - Matrix::MultiplyAndAdd(inputFunctionValues, false, gradientValues, false, inputGradientValues); +template class TimesNode; +template class TimesNode; -#if DUMPOUTPUT - inputGradientValues.Print("child Gradient-out"); -#endif - } +// ----------------------------------------------------------------------- +// TransposeTimesNode (A', B) +// right operand and output can have MB layout, while left operand cannot +// TODO: merge with TimesNode +// ----------------------------------------------------------------------- - virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override +template +class TransposeTimesNode : public TimesNodeBase +{ + typedef TimesNodeBase Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() { - Matrix sliceInput1Value = Input(1)->ValueFor(fr); - Matrix sliceOutputValue = ValueFor(fr); - - sliceOutputValue.AssignProductOf(Input(0)->Value(), true, sliceInput1Value, false); + return L"TransposeTimes"; } - - virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override +public: + DeclareConstructorFromConfigWithNumInputs(TransposeTimesNode); + TransposeTimesNode(DEVICEID_TYPE deviceId, const wstring& name) + : Base(deviceId, name) { - Base::Validate(isFinalValidationPass); - InferMBLayoutFromInputsForStandardCase(); - - //support automatic dimension inference for learnable parameters - size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols(); - size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols(); - - if (isFinalValidationPass && (rows0 == 0 || (!Input(1)->HasMBLayout() && cols1 == 0))) - RuntimeError("TransposeTimes operation: Input(0)->GetNumRows() and Input(1)->GetNumCols() should not be 0 since it cannot be automatically inferred"); - - if (cols0 == 0 && rows1 != 0 && isFinalValidationPass) - ValidateInferInputDims(0, rows0, rows1); - - if (cols0 != 0 && rows1 == 0) - ValidateInferInputDims(1, cols0, cols1); - - //cols0 and rows1 may have been changed so don't use them in the following check - if (isFinalValidationPass && Input(1)->GetNumRows() != Input(0)->GetNumRows()) - LogicError("The Matrix dimension in the TransposeTimes operation does not match."); - - // TODO: What should the tensor story be? 
- //after multiplication the structure is lost - SetDims(TensorShape(cols0), cols1); } }; @@ -459,9 +395,9 @@ class ElementTimesNode : public BinaryElementWiseNode auto otherInputValue = Input(1 - inputIndex)->ValueTensorFor(rank, fr.AllowBroadcast()); // if reduction then mask the respective input(s) (zero out the gaps) - if (Input(inputIndex)->GetNumCols() < GetNumCols()) + if (Input(inputIndex)->ReducesInTimeWrt(shared_from_this())) MaskMissingGradientColumnsToZero(fr); - if (Input(inputIndex)->GetNumCols() < Input(1 - inputIndex)->GetNumCols()) + if (Input(inputIndex)->ReducesInTimeWrt(Input(1 - inputIndex))) Input(1 - inputIndex)->MaskMissingValueColumnsToZero(fr); inputGradient.AddElementwiseProductOf(gradient, otherInputValue); @@ -516,14 +452,14 @@ class DiagTimesNode : public ComputationNode, public NumInputs<2> Matrix sliceOutputGrad = MaskedGradientFor(fr); // use Masked- version since this is reducing over frames Matrix sliceInput1Value = Input(1)->MaskedValueFor(fr); m_innerproduct->AssignInnerProductOf(sliceOutputGrad, sliceInput1Value, false); - Input(0)->Gradient() += *m_innerproduct; + Input(0)->GradientAsMatrix() += *m_innerproduct; } else // right derivative { Matrix sliceOutputGrad = GradientFor(fr); Matrix sliceInput1Grad = Input(1)->GradientFor(fr); m_rightGradient->SetValue(sliceOutputGrad); - m_rightGradient->ColumnElementMultiplyWith(Input(0)->Value()); + m_rightGradient->ColumnElementMultiplyWith(Input(0)->ValueAsMatrix()); sliceInput1Grad += *m_rightGradient; } } @@ -541,7 +477,7 @@ class DiagTimesNode : public ComputationNode, public NumInputs<2> Matrix sliceOutputValue = ValueFor(fr); sliceOutputValue.SetValue(sliceInput1Value); - sliceOutputValue.ColumnElementMultiplyWith(Input(0)->Value()); + sliceOutputValue.ColumnElementMultiplyWith(Input(0)->ValueAsMatrix()); } virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override @@ -549,23 +485,35 @@ class DiagTimesNode : public ComputationNode, public NumInputs<2> Base::Validate(isFinalValidationPass); InferMBLayoutFromInputsForStandardCase(); - //if dimension not specified we assume two operands' dimensions should match - if (Input(0)->GetNumRows() == 0 && Input(1)->GetNumRows() != 0) - ValidateInferInputDims(0, Input(1)->GetNumRows(), 1); + size_t rows0 = Input(0)->GetAsMatrixNumRows(); + size_t rows1 = Input(1)->HasMBLayout() ? Input(1)->GetSampleMatrixNumRows() : Input(1)->GetAsMatrixNumRows(); - if (Input(0)->GetNumRows() != 0 && Input(1)->GetNumRows() == 0) - ValidateInferInputDims(1, Input(0)->GetNumRows(), Input(1)->GetNumCols()); + // if dimension not specified we assume two operands' dimensions should match + Input(0)->ValidateInferInputDimsFrom(TensorShape(rows1)); - if (isFinalValidationPass) + if (Input(1)->HasMBLayout()) { - if (Input(1)->GetNumRows() != Input(0)->GetNumRows()) - LogicError("The Matrix dimension in the DiagTimes operation does not match."); - - if (Input(0)->GetNumCols() != 1) - LogicError("The first matrix should be a vector representing the diagonal of a square matrix in the DiagTimes operation."); + // infer rows1 as rows0 + Input(1)->ValidateInferInputDimsFrom(TensorShape(rows0)); + SetDims(TensorShape(rows0), true); + } + else // multiplying two straight matrices + { + size_t cols1 = Input(1)->GetAsMatrixNumCols(); + // infer rows1 as rows0 + Input(1)->ValidateInferInputDimsFrom(TensorShape(rows0, cols1)); + SetDims(TensorShape(rows0, cols1), false); } - // TODO: Should Input(0) have a specific tensor structure? E.g. match Input(1)? 
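For reference, the computation this validation guards: DiagTimes scales row i of the right operand by element i of the diagonal vector, which is what the ColumnElementMultiplyWith() calls above implement. A minimal standalone sketch of that semantics, assuming dense column-major storage and illustrative names (this is not the CNTK Matrix API):

#include <cassert>
#include <cstddef>
#include <vector>

// out(i,t) = diag(i) * in(i,t), for a [rows x 1] diagonal vector and a
// [rows x cols] right operand stored column-major.
void DiagTimesForward(const std::vector<double>& diag,
                      const std::vector<double>& in, size_t rows, size_t cols,
                      std::vector<double>& out)
{
    assert(diag.size() == rows && in.size() == rows * cols);
    out.resize(rows * cols);
    for (size_t t = 0; t < cols; t++)
        for (size_t i = 0; i < rows; i++)
            out[t * rows + i] = diag[i] * in[t * rows + i];
}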
+ // update after inference + rows0 = Input(0)->GetAsMatrixNumRows(); + rows1 = Input(1)->HasMBLayout() ? Input(1)->GetSampleMatrixNumRows() : Input(1)->GetAsMatrixNumRows(); + if (isFinalValidationPass && rows0 != rows1) + InvalidArgument("The inner matrix dimension in the %ls %ls operation does not match (%d vs. %d).", NodeName().c_str(), OperationName().c_str(), (int)rows1, (int)rows0); + size_t cols0 = Input(0)->GetAsMatrixNumCols(); + if (isFinalValidationPass && cols0 != 1) + InvalidArgument("The first matrix should be a column vector representing the diagonal of a square matrix in the DiagTimes operation."); + SetDims(Input(1)); } @@ -654,7 +602,7 @@ class SumElementsNode : public ComputationNode, public NumInputs<1> { Base::Validate(isFinalValidationPass); m_pMBLayout = nullptr; // this node does not hold mini-batch data - SetDims(TensorShape(1), 1); + SetDims(TensorShape(1), false); } }; @@ -663,7 +611,7 @@ template class SumElementsNode; // ----------------------------------------------------------------------- // SumColumnElementsNode (input) -// sums up each column of the input +// sums up all elements in each column of the input, reducing each column to a scalar // TODO: This should be deprecated, in favor of a reduce node. // TODO: Implement this with the tensor library. // ----------------------------------------------------------------------- @@ -687,10 +635,10 @@ class SumColumnElementsNode : public ComputationNode, public NumInputs virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange& fr) override { - Matrix sliceInputGrad = Input(0)->GradientFor(fr); - Matrix sliceOutputGrad = GradientFor(fr); + auto sliceInputGrad = Input(0)->GradientFor(fr); + auto sliceOutputGrad = GradientFor(fr); - sliceInputGrad += sliceOutputGrad; // here the assumption is that gradientValues is a row vector + sliceInputGrad += sliceOutputGrad; // here the assumption is that sliceOutputGrad is a row vector } virtual bool OutputUsedInComputingInputNodesGradients() const override @@ -710,10 +658,9 @@ class SumColumnElementsNode : public ComputationNode, public NumInputs virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { - Matrix sliceInputValue = Input(0)->ValueFor(fr); - Matrix sliceOutputValue = ValueFor(fr); + auto sliceInputValue = Input(0)->ValueFor(fr); + auto sliceOutputValue = ValueFor(fr); // row vector - //ForwardPropS(sliceOutputValue, sliceInputValue); Matrix::VectorSum(sliceInputValue, sliceOutputValue, true); } @@ -722,7 +669,7 @@ class SumColumnElementsNode : public ComputationNode, public NumInputs Base::Validate(isFinalValidationPass); InferMBLayoutFromInputsForStandardCase(); - SetDims(TensorShape(1), Input(0)->GetNumCols()); // each column is reduced to a scalar + SetDims(TensorShape(1), Input(0)->HasMBLayout()); // each column is reduced to a scalar } }; @@ -731,7 +678,7 @@ template class SumColumnElementsNode; // ----------------------------------------------------------------------- // TransposeNode (input matrix) -// TODO: extend towards tensor transpose (swap 2 dimensions) +// TODO: extend towards tensor transpose (swap 2 dimensions, incl. 
time) // ----------------------------------------------------------------------- template @@ -753,18 +700,20 @@ class TransposeNode : public ComputationNodeNonLooping /*ComputationNode*/& inputGradientValues = Input(0)->Gradient(); - const Matrix& gradientValues = Gradient(); + auto inputGradientValues = Input(0)->GradientAsMatrix(); + auto gradientValues = GradientAsMatrix(); #if DUMPOUTPUT gradientValues.Print("Gradient-in"); inputGradientValues.Print("child Gradient-in/out"); inputFunctionValues.Print("child Function values"); #endif const Matrix& ones = ConstOnes(inputGradientValues.GetNumRows(), inputGradientValues.GetNumRows(), inputGradientValues.GetDeviceId()); + // BUGBUG: This should be ^^ Identity(). This will be fixed once we switch to the more generic tensor Transpose operation, which can handle this easily. Matrix::MultiplyAndAdd(ones, false, gradientValues, true, inputGradientValues); #if DUMPOUTPUT inputGradientValues.Print("child Gradient-out"); #endif + InvalidArgument("TransposeNode::BackpropTo() has a known bug. It is not functional."); } virtual bool OutputUsedInComputingInputNodesGradients() const override @@ -789,14 +738,14 @@ class TransposeNode : public ComputationNodeNonLooping /*ComputationNode*/Value().Print("TransposeNode- Input0"); + Input(0)->ValueAsMatrix().Print("TransposeNode- Input0"); #endif - Value().AssignTransposeOf(Input(0)->Value()); + ValueAsMatrix().AssignTransposeOf(Input(0)->ValueAsMatrix()); #if NANCHECK Value().HasNan("Transpose"); #endif #if DUMPOUTPUT - Value().Print("TransposeNode"); + ValueAsMatrix().Print("TransposeNode"); #endif } @@ -807,10 +756,8 @@ class TransposeNode : public ComputationNodeNonLooping /*ComputationNode*/HasSampleLayout()) // must be a plain matrix without tensor substructure - InvalidArgument("%ls %ls operation cannot operate on input tensors", NodeName().c_str(), OperationName().c_str()); - size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols(); - SetDims(TensorShape(cols0), rows0); + size_t rows0 = Input(0)->GetAsMatrixNumRows(), cols0 = Input(0)->GetAsMatrixNumCols(); + SetDims(TensorShape(cols0, rows0), false); } }; @@ -818,7 +765,7 @@ template class TransposeNode; template class TransposeNode; // ----------------------------------------------------------------------- -// DiagonalNode -- extract diagonal elements of a square matrix +// DiagonalNode -- extract diagonal elements of a square matrix into a row vector // ----------------------------------------------------------------------- template @@ -838,45 +785,6 @@ class DiagonalNode : public ComputationNodeNonLooping, public NumInput { } - virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override - { - Base::CopyTo(nodeP, newName, flags); - if (flags & CopyNodeFlags::copyNodeValue) - { - auto node = dynamic_pointer_cast>(nodeP); - } - } - - virtual void PrintSelfBeforeValidation(bool allowNulls = false) const - { - fprintf(stderr, "\nValidating --> %ls = %ls", NodeName().c_str(), OperationName().c_str()); - - if (!IsLeaf()) - { - fprintf(stderr, "("); - for (size_t i = 0; i < GetNumInputs(); i++) - { - ComputationNodePtr child = Input(i); - if (i > 0) - fprintf(stderr, ", "); - - if (child == nullptr) - { - if (allowNulls) - { - fprintf(stderr, "NULL"); - continue; - } - RuntimeError("One of the children is missing."); - } - - fprintf(stderr, "%ls[%lu, %lu]", child->NodeName().c_str(), child->Value().GetNumRows(), child->Value().GetNumCols()); - } - - fprintf(stderr, ")"); - } - } - 
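To spell out the BUGBUG noted in TransposeNode::BackpropTo() above: the gradient of Y = X' is gradX += gradY', which MultiplyAndAdd(I, false, gradY, true, gradX) would produce if I were an identity matrix. With a ones matrix, every row of ones * gradY' receives the same column sums, so the computed gradient is wrong, hence the InvalidArgument() guard. A standalone sketch of the correct update, assuming column-major storage and illustrative names (not the CNTK code):

#include <cstddef>
#include <vector>

// Correct backward pass for Y = X': gradX(j,i) += gradY(i,j).
// gradY is [outRows x outCols], gradX is [outCols x outRows], both column-major.
void TransposeBackward(const std::vector<double>& gradY, size_t outRows, size_t outCols,
                       std::vector<double>& gradX)
{
    for (size_t j = 0; j < outCols; j++)
        for (size_t i = 0; i < outRows; i++)
            gradX[i * outCols + j] += gradY[j * outRows + i];
}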
virtual void Validate(bool isFinalValidationPass) override { Base::Validate(isFinalValidationPass); @@ -885,19 +793,20 @@ class DiagonalNode : public ComputationNodeNonLooping, public NumInput if (isFinalValidationPass && Input(0)->HasMBLayout()) InvalidArgument("%ls %ls operation cannot operate on minibatch data (which have a layout)", NodeName().c_str(), OperationName().c_str()); - size_t dim = Input(0)->GetNumCols(); - if (isFinalValidationPass && dim != Input(0)->GetNumRows()) + size_t dim = Input(0)->GetAsMatrixNumCols(); + if (isFinalValidationPass && dim != Input(0)->GetAsMatrixNumRows()) InvalidArgument("%ls %ls operation requires a square matrix as its input.", NodeName().c_str(), OperationName().c_str()); if (Input(0)->HasSampleLayout()) fprintf(stderr, "WARNING: Diagonal operation cannot inherit image size information from its child. Image size info is lost.\n"); - SetDims(TensorShape(1), dim); + SetDims(TensorShape(1, dim), false); } virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override { - Input(0)->Value().AssignDiagonalValuesTo(Value()); + auto value = ValueAsMatrix(); + Input(0)->ValueAsMatrix().AssignDiagonalValuesTo(value); // TODO: use tensor lib; this is a stride operation #if NANCHECK Value().HasNan("Diagonal"); #endif @@ -905,10 +814,11 @@ class DiagonalNode : public ComputationNodeNonLooping, public NumInput virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t /*inputIndex*/) override { - Matrix& inputGradientValues = Input(0)->Gradient(); - const Matrix& gradientValues = Gradient(); + auto inputGradientValues = Input(0)->GradientAsMatrix(); + auto gradientValues = GradientAsMatrix(); - // BUGBUG: This should use the memshare mechanism + // BUGBUG: This should use the memshare mechanism. + // TODO: use tensor lib, then this will be easy, no memsharing needed Matrix diag(gradientValues.GetNumRows(), gradientValues.GetNumCols(), gradientValues.GetDeviceId()); diag = gradientValues; diag.Resize(gradientValues.GetNumCols(), 1); @@ -943,7 +853,6 @@ template class DiagonalNode; // TODO: Would it be useful to allow one of the two to be a single column? // ----------------------------------------------------------------------- -//The first matrix should be a vector regpresting the diagonal of a square matrix in the DiagTimes operation template class CosDistanceNode : public ComputationNode, public NumInputs<2> { @@ -965,9 +874,9 @@ class CosDistanceNode : public ComputationNode, public NumInputs<2> { // functionValues, invNorm0, invNorm1 - output from the EvaluateNode() method // temp, rightTerm, leftTerm - temporary matrices - if (inputIndex == 0) //left derivative + if (inputIndex == 0) // left derivative m_temp->AssignElementProductOf(*m_invNorm0, *m_invNorm0); - else //right derivative + else // right derivative m_temp->AssignElementProductOf(*m_invNorm1, *m_invNorm1); m_temp->ElementMultiplyWith(ValueFor(fr)); @@ -1008,14 +917,9 @@ class CosDistanceNode : public ComputationNode, public NumInputs<2> ValidateInferBinaryInputDims(); -#if 0 - if (isFinalValidationPass && (Input(1)->GetNumRows() != Input(0)->GetNumRows() || (HasMBLayout() && (Input(1)->GetNumCols() != Input(0)->GetNumCols())))) - LogicError("%ls %ls operation: The input dimensions do not match.", NodeName().c_str(), OperationName().c_str()); -#endif - // TODO: We could do something more interesting with tensors. // E.g. apply a cos distance of a whole set of data with a single reference. 
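For reference, the quantity whose derivative is assembled above: per minibatch column t, CosDistance outputs cos(a_t, b_t) = <a_t, b_t> / (|a_t| |b_t|), and m_invNorm0 and m_invNorm1 cache the reciprocal norms 1/|a_t| and 1/|b_t| that both gradient branches reuse. A minimal per-column sketch with illustrative names (not the node's actual code):

#include <cmath>
#include <cstddef>
#include <vector>

// cos(a,b) = <a,b> / (|a| |b|) for one pair of columns;
// invNorm0 = 1/|a| and invNorm1 = 1/|b| are the cached factors.
double CosDistanceColumn(const std::vector<double>& a, const std::vector<double>& b)
{
    double ab = 0, aa = 0, bb = 0;
    for (size_t i = 0; i < a.size(); i++)
    {
        ab += a[i] * b[i];
        aa += a[i] * a[i];
        bb += b[i] * b[i];
    }
    double invNorm0 = 1.0 / std::sqrt(aa), invNorm1 = 1.0 / std::sqrt(bb);
    return ab * invNorm0 * invNorm1;
}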
- SetDims(TensorShape(1), Input(1)->GetNumCols()); + SetDims(TensorShape(1), Input(1)->HasMBLayout()); } virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override @@ -1074,6 +978,7 @@ template class CosDistanceNode; // ----------------------------------------------------------------------- // KhatriRaoProductNode (left, right) +// compute an outer product of column vectors (for each sample) // ----------------------------------------------------------------------- template @@ -1130,22 +1035,12 @@ class KhatriRaoProductNode : public ComputationNode, public NumInputs< Base::Validate(isFinalValidationPass); InferMBLayoutFromInputsForStandardCase(); - //support automatic dimension inference for learnable parameters - size_t rows0 = Input(0)->GetNumRows(), cols0 = Input(0)->GetNumCols(); - size_t rows1 = Input(1)->GetNumRows(), cols1 = Input(1)->GetNumCols(); - - if (cols0 == 0 && cols1 != 0) - ValidateInferInputDims(0, rows0, cols1); - - if (cols0 != 0 && cols1 == 0) - ValidateInferInputDims(1, rows1, cols0); - - if (isFinalValidationPass && !HasMBLayout() && Input(1)->GetNumCols() != Input(0)->GetNumCols()) - LogicError("The Matrices should have same number of columns."); + size_t rows0 = Input(0)->GetSampleMatrixNumRows(); + size_t rows1 = Input(1)->GetSampleMatrixNumRows(); // after KhatriRaoProduct the structure is lost - // TODO: ^^ Is that correctWhat is the correct sample layout? - SetDims(TensorShape(rows0 * rows1), Input(0)->GetNumCols()); + // TODO: ^^ Is that correct? Should we use a tensor here, TensorShape(rows0, rows1)? + SetDims(TensorShape(rows0 * rows1), HasMBLayout()); } }; @@ -1179,21 +1074,8 @@ class CosDistanceWithNegativeSamplesNode : public ComputationNode, pub { } - void BackpropToMap(const size_t inputIndex) - { - if (inputIndex > 1) - InvalidArgument("CosDistanceWithNegativeSamples operation only takes grdients on the first two inputs."); - - BackpropToS(inputIndex, *m_invNorm0, *m_invNorm1, Value(), *m_temp, *m_rightTerm, *m_leftTerm, *m_invNormSquare, Input(0)->Value(), Input(1)->Value(), Input(2)->Value(), Input(3)->Value(), Input(inputIndex)->Gradient(), Gradient()); - } - virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override { - if (fr.IsAllFrames()) - { - BackpropToMap(inputIndex); - return; - } // TODO: remove these one by one Matrix sliceInput0Value = Input(0)->ValueFor(fr); Matrix sliceInput1Value = Input(1)->ValueFor(fr); Matrix sliceOutputValue = ValueFor(fr); @@ -1304,14 +1186,8 @@ class CosDistanceWithNegativeSamplesNode : public ComputationNode, pub } } - void ForwardPropMap() // TODO: This is a stop-gap; in most cases, we should just be able to delete this (but need to review one by one) - { - ForwardPropS(*m_invNorm0, *m_invNorm1, Value(), Input(0)->Value(), Input(1)->Value(), Input(2)->Value(), Input(3)->Value(), *m_leftTerm, *m_rightTerm); - } - virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { - //if (fr.IsAllFrames()) { ForwardPropMap(); return; } Matrix sliceInput0Value = Input(0)->ValueFor(fr); Matrix sliceInput1Value = Input(1)->ValueFor(fr); Matrix sliceOutputValue = ValueFor(fr); @@ -1348,34 +1224,20 @@ class CosDistanceWithNegativeSamplesNode : public ComputationNode, pub Base::Validate(isFinalValidationPass); InferMBLayoutFromInputsForStandardCase(); - //if dimension is missing make the two operatants to have same size - // TODO: use a for loop?? Or don't we have a function for this? 
- size_t index = 0; - { - size_t rows = Input(index)->GetNumRows() == 0 ? Input(1 - index)->GetNumRows() : Input(index)->GetNumRows(); - size_t cols = Input(index)->GetNumCols() == 0 ? Input(1 - index)->GetNumCols() : Input(index)->GetNumCols(); - ValidateInferInputDims(index, rows, cols); - } - - index = 1; - { - size_t rows = Input(index)->GetNumRows() == 0 ? Input(1 - index)->GetNumRows() : Input(index)->GetNumRows(); - size_t cols = Input(index)->GetNumCols() == 0 ? Input(1 - index)->GetNumCols() : Input(index)->GetNumCols(); - ValidateInferInputDims(index, rows, cols); - } + ValidateInferBinaryInputDims(); if (isFinalValidationPass && - (Input(1)->GetNumRows() != Input(0)->GetNumRows() || - (!Input(1)->GetMBLayout() && Input(1)->GetNumCols() != Input(0)->GetNumCols()))) + (Input(0)->GetSampleMatrixNumRows() != Input(1)->GetSampleMatrixNumRows() + || Input(0)->GetMBLayout() != Input(1)->GetMBLayout())) { - LogicError("The Matrix dimension in the %ls %ls operation does not match.", NodeName().c_str(), OperationName().c_str()); + LogicError("The tensor dimension in the %ls %ls operation does not match.", NodeName().c_str(), OperationName().c_str()); } // input(2) is shift, input(3) is the #neg size_t negNumber = (size_t) Input(3)->Get00Element(); // TODO: This calls for a tensor representation! - SetDims(TensorShape(negNumber + 1), Input(1)->GetNumCols()); + SetDims(TensorShape(negNumber + 1), HasMBLayout()); } virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override diff --git a/Source/ComputationNetworkLib/NonlinearityNodes.h b/Source/ComputationNetworkLib/NonlinearityNodes.h index 97102b584f10..e464b613a17a 100644 --- a/Source/ComputationNetworkLib/NonlinearityNodes.h +++ b/Source/ComputationNetworkLib/NonlinearityNodes.h @@ -365,9 +365,14 @@ template class LogSoftmaxNode; // ----------------------------------------------------------------------- // GMMLogLikelihoodNode (unnormedPrior, means, logStdDevs, features) -- GMM log LL over input vector(s) +// calculates the log likelihood of a feature given parameters of a Gaussian mixture model (GMM) with shared diagonal variance +// - unnormedPrior: mix weights, #rows = #mixture components +// - means: means, all mix means concatenated (i.e. dim = feature dim x prior dim) +// - logStdDevs: std deviations, pooled across mix (i.e. same dim as features) +// UnnormedPrior, means, and logStdDevs can be either a single column or one per sample, e.g. +// when parameters are computed by other nodes. 
// ----------------------------------------------------------------------- -//calculates: the log likelihood of a feature given GMM parameters template class GMMLogLikelihoodNode : public ComputationNode, public NumInputs<4> { @@ -388,7 +393,7 @@ class GMMLogLikelihoodNode : public ComputationNode, public NumInputs< virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override { // get the right slice - const size_t colsPrior = Input(0)->GetNumCols(); + const size_t colsPrior = Input(0)->GetSampleMatrixNumCols(); Matrix sliceGradientValue = DataFor(*m_gradient, fr); Matrix slicePosterior = DataFor(*m_posterior, fr); @@ -402,7 +407,7 @@ class GMMLogLikelihoodNode : public ComputationNode, public NumInputs< else { Matrix sliceUnnormedPriorGradient = Input(0)->GradientFor(fr); - Matrix slicePrior = DataFor(*m_prior, fr); + Matrix slicePrior = DataFor(*m_prior, fr); // TODO: use the right MBLayout, then we won't need the special case BackpropToUnnormedPrior(sliceUnnormedPriorGradient, sliceGradientValue, slicePrior, slicePosterior, *m_temp); } } @@ -458,8 +463,8 @@ class GMMLogLikelihoodNode : public ComputationNode, public NumInputs< return false; } - /*TODO: merge with call site*/ void BackpropToUnnormedPrior(Matrix& unnormedPriorGradientValues, const Matrix& gradientValues, - const Matrix& prior, const Matrix& posterior, Matrix& temp) + void BackpropToUnnormedPrior(Matrix& unnormedPriorGradientValues, const Matrix& gradientValues, + const Matrix& prior, const Matrix& posterior, Matrix& temp) { temp.AssignDifferenceOf(posterior, prior); temp.RowElementMultiplyWith(gradientValues); @@ -471,8 +476,8 @@ class GMMLogLikelihoodNode : public ComputationNode, public NumInputs< RuntimeError("GMMLogLikelihoodNode: UnnormedPrior should either have same number of columns as the features or have only one column."); } - /*TODO: merge with call site*/ void BackpropToMean(Matrix& meanGradientValues, const Matrix& gradientValues, const Matrix& normedDeviationVectors, - Matrix& posterior, Matrix& temp) + void BackpropToMean(Matrix& meanGradientValues, const Matrix& gradientValues, const Matrix& normedDeviationVectors, + Matrix& posterior, Matrix& temp) { size_t numComponent = posterior.GetNumRows(); size_t numSamples = posterior.GetNumCols(); @@ -497,8 +502,8 @@ class GMMLogLikelihoodNode : public ComputationNode, public NumInputs< RuntimeError("GMMLogLikelihoodNode: stddev should either have same number of columns as the features or have only one column."); } - /*TODO: merge with call site*/ void BackpropToLogStddev(Matrix& logStddevGradientValues, const Matrix& gradientValues, const Matrix& normedDeviation, - const Matrix& posterior, Matrix& temp) + void BackpropToLogStddev(Matrix& logStddevGradientValues, const Matrix& gradientValues, const Matrix& normedDeviation, + const Matrix& posterior, Matrix& temp) { size_t numComponent = posterior.GetNumRows(); size_t numSamples = posterior.GetNumCols(); @@ -514,8 +519,8 @@ class GMMLogLikelihoodNode : public ComputationNode, public NumInputs< RuntimeError("GMMLogLikelihoodNode: stddev should either have same number of columns as the features or have only one column."); } - /*TODO: merge with call site*/ void BackpropToFeature(Matrix& featureGradientValues, const Matrix& gradientValues, const Matrix& normedDeviationVectors, - Matrix& posterior, Matrix& temp) + void BackpropToFeature(Matrix& featureGradientValues, const Matrix& gradientValues, const Matrix& normedDeviationVectors, + Matrix& posterior, Matrix& temp) { size_t 
numComponent = posterior.GetNumRows(); size_t numSamples = posterior.GetNumCols(); @@ -539,10 +544,10 @@ class GMMLogLikelihoodNode : public ComputationNode, public NumInputs< { Base::UpdateFunctionMBSize(); - size_t numCols = Input(3)->GetNumCols(); - size_t numComponents = Input(0)->GetNumRows(); - size_t colsPrior = Input(0)->GetNumCols(); - size_t featureSize = Input(3)->GetNumRows(); + size_t numCols = Input(3)->GetSampleMatrixNumCols(); + size_t numComponents = Input(0)->GetSampleMatrixNumRows(); + size_t colsPrior = Input(0)->GetSampleMatrixNumCols(); // may be 1 + size_t featureSize = Input(3)->GetSampleMatrixNumRows(); m_prior->Resize(numComponents, colsPrior); m_stddev->Resize(numComponents, colsPrior); @@ -551,22 +556,13 @@ class GMMLogLikelihoodNode : public ComputationNode, public NumInputs< m_posterior->Resize(numComponents, numCols); } - //input0=unnormedPrior, input1=mean, input2=logstddev, input3=feature - void ForwardPropMap() // TODO: This is a stop-gap; in most cases, we should just be able to delete this (but need to review one by one) - { - // all internal matrices will be automatically resized since all of them are assigned to a value so no resize is needed here. - ForwardPropS(Value(), Input(0)->Value(), Input(1)->Value(), Input(2)->Value(), Input(3)->Value(), - *m_prior, *m_stddev, *m_normedDeviationVectors, *m_normedDeviation, *m_posterior, *m_temp); - } - - //input0=unnormedPrior, input1=mean, input2=logstddev, input3=feature + // input0=unnormedPrior, input1=mean, input2=logstddev, input3=feature virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { - //if (fr.IsAllFrames()) { ForwardPropMap(); return; } - size_t colsPrior = Input(0)->GetNumCols(); - size_t numSamples = Input(3)->GetNumCols(); + size_t colsPrior = Input(0)->GetSampleMatrixNumCols(); + size_t numSamples = Input(3)->GetSampleMatrixNumCols(); - //get the right slice + // get the right slice Matrix sliceOutputValue = ValueFor(fr); Matrix sliceFeature = Input(3)->ValueFor(fr); Matrix sliceNormedDeviation = DataFor(*m_normedDeviation, fr); @@ -675,20 +671,16 @@ class GMMLogLikelihoodNode : public ComputationNode, public NumInputs< Base::Validate(isFinalValidationPass); InferMBLayoutFromInputsForStandardCase(); - size_t rows[4], cols[4]; + size_t rows[4]; for (int i = 0; i < 4; i++) - { - rows[i] = Input(i)->GetNumRows(); - cols[i] = Input(i)->GetNumCols(); - } + rows[i] = Input(i)->GetSampleMatrixNumRows(); if (isFinalValidationPass) { - if (cols[0] != cols[1] || cols[0] != cols[2]) - LogicError("GMMLogLikelihoodNode: UnnormedPrior (first input), mean (second input), and logStddev (third input) should have same number of columns."); - - if (cols[0] != 1 && cols[0] != cols[3]) - LogicError("GMMLogLikelihoodNode: UnnormedPrior (first input) should either have same number of columns as the features (fourth input) or have only one column."); + if (!Input(3)->HasMBLayout()) + InvalidArgument("GMMLogLikelihoodNode: Features must be a minibatch."); + if (Input(0)->GetMBLayout() != Input(1)->GetMBLayout() || Input(0)->GetMBLayout() != Input(2)->GetMBLayout()) + InvalidArgument("GMMLogLikelihoodNode: First three arguments must have the same MBLayout (which may be none)."); if (rows[0] != rows[2]) LogicError("GMMLogLikelihoodNode: UnnormedPrior (first input) should have same dimension as logStddev (third input), i.e., all dimensions in each Gaussian component share the same stddev."); @@ -697,7 +689,7 @@ class GMMLogLikelihoodNode : public ComputationNode, public NumInputs< 
LogicError("GMMLogLikelihoodNode: the number of rows in mean (second input) should equal rows(unnormedPrior(first input) * rows(feature(fourth input))."); } - SetDims(TensorShape(1), cols[3]); + SetDims(TensorShape(1), true); } virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override diff --git a/Source/ComputationNetworkLib/RecurrentNodes.h b/Source/ComputationNetworkLib/RecurrentNodes.h index f404f51d4267..da8fa27eecbe 100644 --- a/Source/ComputationNetworkLib/RecurrentNodes.h +++ b/Source/ComputationNetworkLib/RecurrentNodes.h @@ -547,7 +547,7 @@ class ShiftNode : public ComputationNode, public IRecurrentNode, publi m_pMBLayout = Input(0)->GetMBLayout(); if (isFinalValidationPass && !m_pMBLayout) InvalidArgument("%ls %ls operation must operate on data (must have an MB Layout).", NodeName().c_str(), OperationName().c_str()); - if (isFinalValidationPass && !Input(1)->GetMBLayout() && Input(1)->GetNumCols() != 1) + if (isFinalValidationPass && !Input(1)->GetMBLayout() && Input(1)->GetSampleMatrixNumCols() != 1) InvalidArgument("%ls %ls operation requires the boundary node to have one column.", NodeName().c_str(), OperationName().c_str()); // as is the sample layout @@ -778,8 +778,8 @@ class DelayedValueNodeBase : public ComputationNode, public IRecurrent Base::Save(fstream); fstream << m_timeStep; - size_t colsDummy = 0; - fstream << GetNumRows() << colsDummy; + size_t colsDummy = 0; + fstream << GetSampleMatrixNumRows() << colsDummy; // #rows saved for legacy file format fstream << m_initialActivationValue; } @@ -912,8 +912,6 @@ class DelayedValueNodeBase : public ComputationNode, public IRecurrent // we forward prop from the previous frame to this frame FrameRange frDelayed = fr.WithTimeOffset(direction * m_timeStep); - VerifyDims(Input(0)); - size_t T = GetNumTimeSteps(); size_t T_delayedActivation = m_delayedActivationMBLayout ? m_delayedActivationMBLayout->GetNumTimeSteps() : 0; // (note: should never happen in full-sequence mode) diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h index 9235bf09da52..a9814664ee2d 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.h +++ b/Source/ComputationNetworkLib/ReshapingNodes.h @@ -224,7 +224,7 @@ class DeprecatedReshapeNode : public ReinterpretNodeBase if (!child) fprintf(stderr, "NULL"); else - fprintf(stderr, "%ls[%lu, %lu]", child->NodeName().c_str(), child->GetNumRows(), child->GetNumCols()); + fprintf(stderr, "%ls[%s%s]", child->NodeName().c_str(), string(child->GetSampleLayout()).c_str(), child->HasMBLayout() ? " x *" : ""); } fprintf(stderr, ", NumOfRows=%lu, imageWidth=%lu, imageHeight=%lu, imageChannels=%lu)", m_numTargetRows, m_targetImageLayout[1], m_targetImageLayout[2], m_targetImageLayout[0]); // BUGBUG: This interpretaion as image dims is only correct for the 'legacy format, not for cudnn. @@ -243,38 +243,44 @@ class DeprecatedReshapeNode : public ReinterpretNodeBase else assert(!m_pMBLayout); // reshaping non-mini-batch data - size_t rows = Input(0)->GetNumRows(), cols = Input(0)->GetNumCols(); - // Note: During initial validation, cols may not be a multiple. E.g. cols may be 1 or 3. So we cannot check here whether the integer-multiple conditions are fulfilled. 
- size_t newCols = cols * rows / m_numTargetRows; - if (isFinalValidationPass) + size_t newCols = 1; // dummy + if (!m_pMBLayout) { - if ((m_numTargetRows > rows && m_numTargetRows % rows != 0) || // grouping columns - (m_numTargetRows < rows && rows % m_numTargetRows != 0)) // splitting columns - InvalidArgument("%ls %ls operation: output row dimension %d is not an integer multiple or divisor of input dimension %d", NodeName().c_str(), OperationName().c_str(), (int) m_numTargetRows, (int) rows); - if (!m_pMBLayout && rows * cols != m_numTargetRows * newCols) // sadly, cannot verify here if we have a layout, since current #cols may be bogus - LogicError("%ls %ls operation: unexpected dimension mismatch", NodeName().c_str(), OperationName().c_str()); + size_t rows = Input(0)->GetAsMatrixNumRows(), cols = Input(0)->GetAsMatrixNumCols(); + newCols = cols * rows / m_numTargetRows; + if (isFinalValidationPass) + { + if ((m_numTargetRows > rows && m_numTargetRows % rows != 0) || // grouping columns + (m_numTargetRows < rows && rows % m_numTargetRows != 0)) // splitting columns + InvalidArgument("%ls %ls operation: output row dimension %d is not an integer multiple or divisor of input dimension %d", NodeName().c_str(), OperationName().c_str(), (int)m_numTargetRows, (int)rows); + if (rows * cols != m_numTargetRows * newCols) + LogicError("%ls %ls operation: unexpected dimension mismatch", NodeName().c_str(), OperationName().c_str()); + } } // patch up m_targetImageLayout, which was originally a construction parameter InferTargetSampleLayout(); // setting any dimension to 0 means lose the tensor, flatten to vector - // TODO: We can use 0 to indicate "infer". One value can be 0. It will be filled in to match row dim. - if (m_targetImageLayout[1] == 0 || m_targetImageLayout[2] == 0 || m_targetImageLayout[0] == 0) + if (m_targetImageLayout.GetNumElements() == 0) { if (Input(0)->HasSampleLayout()) fprintf(stderr, "WARNING: Reshape operation cannot inherit image size information from its child. Image size info is lost.\n"); // TODO: We need to decide what reshaping means in presence of a tensor. 
- SetDims(TensorShape(m_numTargetRows), newCols); + if (HasMBLayout()) + SetDims(TensorShape(m_numTargetRows), true); + else + SetDims(TensorShape(m_numTargetRows, newCols), false); } else { if (m_numTargetRows != m_targetImageLayout.GetNumElements()) LogicError("DeprecatedReshapeNode: InferTargetSampleLayout() computed a sample layout [%s] that mismatches m_numTargetRows %d.", string(m_targetImageLayout).c_str(), (int) m_numTargetRows); - SetDims(m_targetImageLayout, newCols); + SetDims(m_targetImageLayout, HasMBLayout()); } } +#if 0 virtual void UpdateFunctionMBSize() override { size_t rows = Input(0)->GetNumRows(), cols = Input(0)->GetNumCols(); @@ -288,6 +294,7 @@ class DeprecatedReshapeNode : public ReinterpretNodeBase else SetNumCols(newCols); } +#endif // TODO: there seems to be semantic overlap between BeginForwardProp() and UpdateFunctionMBSize() virtual void /*IComputationNode::*/ BeginForwardProp() override @@ -301,7 +308,7 @@ class DeprecatedReshapeNode : public ReinterpretNodeBase if (weStack()) { // going from many samples to one: layout entry will get no flags - if (Input(0)->GetNumTimeSteps() * Input(0)->GetNumRows() / m_numTargetRows != 1) + if (Input(0)->GetMBLayout()->GetNumTimeSteps() * Input(0)->GetSampleMatrixNumRows() / m_numTargetRows != 1) LogicError("DeprecatedReshapeNode::BeginForwardProp() faking to remove a nested time dimension only works when going back to a single frame per sequence."); // we are in frame mode now m_pMBLayout->InitAsFrameMode(Input(0)->GetNumParallelSequences()); @@ -311,9 +318,9 @@ class DeprecatedReshapeNode : public ReinterpretNodeBase // going from one sample to many: layout will get SentenceStart/SentenceEnd flags for the sequence we expand into if (Input(0)->GetMBLayout()->GetNumTimeSteps() != 1) LogicError("DeprecatedReshapeNode::BeginForwardProp() faking to add a nested time dimension only works when coming from a single frame per sequence."); - m_pMBLayout->Init(Input(0)->GetNumParallelSequences(), Input(0)->GetNumTimeSteps() * Input(0)->GetNumRows() / m_numTargetRows); + m_pMBLayout->Init(Input(0)->GetNumParallelSequences(), Input(0)->GetMBLayout()->GetNumTimeSteps() * Input(0)->GetSampleMatrixNumRows() / m_numTargetRows); for (size_t s = 0; s < m_pMBLayout->GetNumParallelSequences(); s++) - m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, s, 0, m_pMBLayout->GetNumTimeSteps()); + m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, s, 0, GetMBLayout()->GetNumTimeSteps()); // BUGBUG: In the future, NEW_SEQUENCE_ID will be incorrect here; need an iterator over sequences in there. } } @@ -324,10 +331,10 @@ class DeprecatedReshapeNode : public ReinterpretNodeBase // - fr refers to *functionValues*, not the inputs virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { - size_t rows = Input(0)->GetNumRows(), cols = Input(0)->GetNumCols(); + size_t rows = Input(0)->Value().GetNumRows(), cols = Input(0)->Value().GetNumCols(); size_t newCols = cols * rows / m_numTargetRows; assert(newCols * m_numTargetRows == cols * rows); // follows from above check - VerifyDims(m_numTargetRows, newCols); + Value().VerifySize(m_numTargetRows, newCols); // no layout case: this is indeed just a reshape. Same for canonical case // (We still need to copy the values since there is currently no way to point to an input function value while reshaping at the same time.) 
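The dimension arithmetic used throughout this node follows from element-count preservation: reinterpreting a [rows x cols] matrix as [m_numTargetRows x newCols] requires newCols = rows * cols / m_numTargetRows, with the divisibility checks ruling out fractional groupings. A standalone sketch mirroring the checks in Validate() above (illustrative names only):

#include <cstddef>
#include <stdexcept>

// Reinterpret a [rows x cols] matrix as [targetRows x newCols] without moving data.
size_t ReshapeNumCols(size_t rows, size_t cols, size_t targetRows)
{
    if ((targetRows > rows && targetRows % rows != 0) || // grouping columns
        (targetRows < rows && rows % targetRows != 0))   // splitting columns
        throw std::invalid_argument("target rows must be an integer multiple or divisor of input rows");
    size_t newCols = rows * cols / targetRows;
    if (rows * cols != targetRows * newCols) // element count must be preserved
        throw std::logic_error("unexpected dimension mismatch");
    return newCols;
}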
@@ -351,7 +358,7 @@ class DeprecatedReshapeNode : public ReinterpretNodeBase virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange& fr) override { - size_t rows = Input(0)->GetNumRows(), cols = Input(0)->GetNumCols(); + size_t rows = Input(0)->Value().GetNumRows(), cols = Input(0)->Value().GetNumCols(); size_t newCols = cols * rows / m_numTargetRows; // no layout case: this is indeed just a reshape. Same for canonical case @@ -388,11 +395,11 @@ class DeprecatedReshapeNode : public ReinterpretNodeBase size_t m_numTargetRows; bool weStack() const { - return m_numTargetRows > Input(0)->GetNumRows(); + return m_numTargetRows > Input(0)->GetSampleMatrixNumRows(); } // do we stack (multiple frames into one) size_t factor() const { - return m_numTargetRows > Input(0)->GetNumRows() ? m_numTargetRows / Input(0)->GetNumRows() : Input(0)->GetNumRows() / m_numTargetRows; + return m_numTargetRows > Input(0)->GetSampleMatrixNumRows() ? m_numTargetRows / Input(0)->GetSampleMatrixNumRows() : Input(0)->GetSampleMatrixNumRows() / m_numTargetRows; } // factor by which we stack or unstack TensorShape m_targetImageLayout; @@ -400,7 +407,6 @@ class DeprecatedReshapeNode : public ReinterpretNodeBase // Users are allowed to provide 2 (out of 3) image dimensions. // One missing dimension can be inferred. If two dimensions are // unspecified it throws a runtime error. - // TODO: Generalize this to any number of dimensions. void InferTargetSampleLayout() { // BUGBUG: Below is the result of refactoring and only works for rank-3 tensors. Generalize. @@ -772,15 +778,16 @@ class RowSliceNode : public ComputationNode, public NumInputs<1> Base::Validate(isFinalValidationPass); InferMBLayoutFromInputsForStandardCase(); - if (isFinalValidationPass && Input(0)->GetNumRows() < m_startIndex + m_sliceHeight) + if (isFinalValidationPass && Input(0)->GetSampleMatrixNumRows() < m_startIndex + m_sliceHeight) RuntimeError("%ls %ls operation: m_startIndex + m_sliceHeight exceeds number of rows in the input.", NodeName().c_str(), OperationName().c_str()); // RowSlice cannot slice tensors. // TODO: Create a TensorSlice operation, or just Slice. 
- if (isFinalValidationPass && Input(0)->HasSampleLayout() && !Input(0)->GetSampleLayout().IsVectorStoredAsImage() // legacy + if (isFinalValidationPass && Input(0)->HasSampleLayout() + && !Input(0)->GetSampleLayout().IsVectorStoredAsImage() // legacy ) RuntimeError("%ls %ls operation: Input must be a vector, tensor shape [%s] not allowed.", NodeName().c_str(), OperationName().c_str(), string(Input(0)->GetSampleLayout()).c_str()); - SetDims(TensorShape(m_sliceHeight), Input(0)->GetNumCols()); + SetDims(TensorShape(m_sliceHeight), HasMBLayout()); } private: @@ -824,7 +831,7 @@ class RowStackNode : public ComputationNode // note: not deriving from virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override { - Input(inputIndex)->GradientFor(fr).AddWithRowSliceValuesOf(GradientFor(fr), m_startRowIndices[inputIndex], Input(inputIndex)->GetNumRows()); + Input(inputIndex)->GradientFor(fr).AddWithRowSliceValuesOf(GradientFor(fr), m_startRowIndices[inputIndex], Input(inputIndex)->GetSampleMatrixNumRows()); } virtual bool OutputUsedInComputingInputNodesGradients() const override @@ -845,7 +852,7 @@ class RowStackNode : public ComputationNode // note: not deriving from virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { for (size_t inputIndex = 0; inputIndex < GetNumInputs(); inputIndex++) - ValueFor(fr).AssignToRowSliceValuesOf(Input(inputIndex)->ValueFor(fr), m_startRowIndices[inputIndex], Input(inputIndex)->GetNumRows()); + ValueFor(fr).AssignToRowSliceValuesOf(Input(inputIndex)->ValueFor(fr), m_startRowIndices[inputIndex], Input(inputIndex)->GetSampleMatrixNumRows()); } virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override @@ -853,13 +860,11 @@ class RowStackNode : public ComputationNode // note: not deriving from Base::Validate(isFinalValidationPass); InferMBLayoutFromInputsForStandardCase(); - size_t numCols = Input(0)->GetNumCols(); - // we must fuse all tensor shapes - // All dimensions but the last must be the same. - // Note that trailing ones may be stripped, so we must first pad. + // All dimensions but the last must be the same. (In a future version, we should be able to stack along any given dimension.) + // Note that trailing ones may be stripped/broadcasting, so we must first pad. 
SmallVector dims = Input(0)->GetSampleLayout().GetDims(); - size_t maxRank = 0; + size_t maxRank = 0; // TODO: very similar to DetermineElementwiseTensorRank() except that that one also includes the output for (int i = 0; i < GetNumInputs(); i++) if (maxRank < GetInputSampleLayout(i).GetRank()) maxRank = GetInputSampleLayout(i).GetRank(); @@ -871,11 +876,8 @@ class RowStackNode : public ComputationNode // note: not deriving from size_t totalTrailingDim = 0; // last tensor dimension is what gets stacked up for (int i = 0; i < GetNumInputs(); i++) { - if (isFinalValidationPass && !HasMBLayout() && Input(i)->GetNumCols() != numCols) - LogicError("RowStack operation: the input node %ls has different number of columns.", Input(i)->NodeName().c_str()); - m_startRowIndices[i] = totalRows; - totalRows += Input(i)->GetNumRows(); + totalRows += Input(i)->GetSampleMatrixNumRows(); SmallVector thisDims = Input(i)->GetSampleLayout().GetDims(); thisDims.resize(maxRank, 1); // pad and/or strip trailing dimension totalTrailingDim += thisDims.back(); // count total trailing dimensions (that's what we have after stacking) @@ -891,9 +893,9 @@ class RowStackNode : public ComputationNode // note: not deriving from fprintf(stderr, "WARNING: RowStack operation cannot inherit image size information from its child. Image size info is lost.\n"); dims.push_back(totalTrailingDim); - SetDims(TensorShape(dims), numCols); + SetDims(TensorShape(dims), HasMBLayout()); - if (totalRows != GetNumRows()) + if (totalRows != GetSampleMatrixNumRows()) LogicError("%ls RowStack operation: Tensor shapes of inputs were not compatible after all?", NodeName().c_str()); } @@ -952,34 +954,10 @@ class RowRepeatNode : public ComputationNode, public NumInputs<1> fstream >> m_numRepeat; } - virtual void PrintSelfBeforeValidation(bool allowNulls = false) const + virtual void PrintSelfBeforeValidation() const override { - fprintf(stderr, "\nValidating --> %ls = %ls", NodeName().c_str(), OperationName().c_str()); - - if (!IsLeaf()) - { - fprintf(stderr, "("); - for (size_t i = 0; i < GetNumInputs(); i++) - { - ComputationNodePtr child = Input(i); - if (i > 0) - fprintf(stderr, ", "); - - if (child == nullptr) - { - if (allowNulls) - { - fprintf(stderr, "NULL"); - continue; - } - RuntimeError("One of the children is missing."); - } - - fprintf(stderr, "%ls[%lu, %lu]", child->NodeName().c_str(), child->GetNumRows(), child->GetNumCols()); - } - - fprintf(stderr, ", numRepeats=%lu)", m_numRepeat); - } + Base::PrintSelfBeforeValidation(); + fprintf(stderr, ", numRepeats=%lu", m_numRepeat); } virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override @@ -992,7 +970,7 @@ class RowRepeatNode : public ComputationNode, public NumInputs<1> SmallVector dims = GetInputSampleLayout(0).GetDims(); dims.back() *= m_numRepeat; - SetDims(TensorShape(dims), Input(0)->GetNumCols()); + SetDims(TensorShape(dims), HasMBLayout()); } virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override diff --git a/Source/ComputationNetworkLib/TrainingCriterionNodes.h b/Source/ComputationNetworkLib/TrainingCriterionNodes.h index 798d7ba22969..4548c73d7d81 100644 --- a/Source/ComputationNetworkLib/TrainingCriterionNodes.h +++ b/Source/ComputationNetworkLib/TrainingCriterionNodes.h @@ -57,7 +57,7 @@ class SquareErrorNode : public ComputationNodeNonLooping /*ComputationNode*/Resize(Input(0)->GetNumRows(), Input(0)->GetNumCols()); + m_leftMinusRight->Resize(Input(0)->Value()); } virtual void /*ComputationNodeNonLooping::*/ 
ForwardPropNonLooping() override @@ -66,7 +66,7 @@ class SquareErrorNode : public ComputationNodeNonLooping /*ComputationNode*/AssignDifferenceOf(Input(0)->ValueFor(fr), Input(1)->ValueFor(fr)); MaskMissingColumnsToZero(*m_leftMinusRight, Input(0)->GetMBLayout(), fr); // we are fine since it will only be called with full minibatch. ElemType v = m_leftMinusRight->FrobeniusNorm(); - VerifyDims(1, 1); + Value().VerifySize(1, 1); Value().SetValue(v * v / 2); #if NANCHECK Value().HasNan("SquareError"); @@ -177,8 +177,8 @@ class CrossEntropyWithSoftmaxNode : public ComputationNodeNonLooping /*Computati virtual void UpdateFunctionMBSize() override { - m_logSoftmaxOfRight->Resize(Input(1)->GetNumRows(), Input(1)->GetNumCols()); - m_softmaxOfRight->Resize(m_logSoftmaxOfRight->GetNumRows(), m_logSoftmaxOfRight->GetNumCols()); + m_logSoftmaxOfRight->Resize(Input(1)->Value()); + m_softmaxOfRight->Resize(*m_logSoftmaxOfRight); } virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override //-sum(left_i * log(softmax_i(right))) @@ -295,8 +295,8 @@ class CrossEntropyNode : public ComputationNodeNonLooping /*ComputationNode*/Resize(Input(1)->GetNumRows(), Input(1)->GetNumCols()); - m_leftDivRight->Resize(Input(1)->GetNumRows(), Input(1)->GetNumCols()); + m_logOfRight->Resize(Input(1)->Value()); + m_leftDivRight->Resize(Input(1)->Value()); } //-sum(left_i * log(right_i)) @@ -407,13 +407,13 @@ class MatrixL1RegNode : public ComputationNodeNonLooping /*ComputationNode*/Resize(Input(0)->GetNumRows(), Input(0)->GetNumCols()); + m_gradientOfL1Norm->Resize(Input(0)->Value()); } virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override { FrameRange fr(Input(0)->GetMBLayout()); - VerifyDims(1, 1); + Value().VerifySize(1, 1); Value().SetValue(Input(0)->MaskedValueFor(fr).MatrixNorm1()); #if NANCHECK Value().HasNan("MatrixL1Reg"); @@ -495,7 +495,7 @@ class MatrixL2RegNode : public ComputationNodeNonLooping /*ComputationNode*/GetMBLayout()); - VerifyDims(1, 1); + Value().VerifySize(1, 1); Value().SetValue(Input(0)->MaskedValueFor(fr).FrobeniusNorm()); #if NANCHECK Value().HasNan("MatrixL2Reg"); @@ -512,7 +512,14 @@ template class MatrixL2RegNode; template class MatrixL2RegNode; // ----------------------------------------------------------------------- -/// NoiseContrastiveEstimationNode (labels, input, inputWeights, biasWeights) +// NoiseContrastiveEstimationNode (labels, input, inputWeights, biasWeights) +// - labels: label in dense matrix in [4 x T] +// the first row is the word index, the second row is the class index, the third row is the first word index of the class +// the last row is the first word index of the next class +// - input: hidden layer activity to the node in [hdsize x T]. for a simple rnn, this is the hidden layer activity +// - inputWeights: weight matrix in [hdsize x vocab_size], for speed-up, as per word matrix can be simply obtained as column slice +// - biasWeights: clsprob in dense matrix in [nbr_cls x T]. this is the output from logsoftmax node for the log-posterior probability of class given observations // BUGBUG: This node has not been converted to memshare conventions.
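To make the [4 x T] label layout described above concrete: column t packs the word id, its class id, and the half-open word-id range [classBegin, classEnd) of that word's class, which is what lets the node take a per-class column slice of inputWeights instead of computing a full-vocabulary softmax. A hypothetical packing helper, illustrative only (the actual format is produced by the CNTK readers):

#include <cstddef>
#include <vector>

// One column of the [4 x T] label matrix described above.
struct LabelColumn
{
    size_t wordId;     // row 0: word index
    size_t classId;    // row 1: class index
    size_t classBegin; // row 2: first word index of this class
    size_t classEnd;   // row 3: first word index of the next class
};

// Build the column-major [4 x T] dense matrix the node expects (illustrative).
std::vector<double> PackLabels(const std::vector<LabelColumn>& labels)
{
    std::vector<double> m(4 * labels.size());
    for (size_t t = 0; t < labels.size(); t++)
    {
        m[4 * t + 0] = (double)labels[t].wordId;
        m[4 * t + 1] = (double)labels[t].classId;
        m[4 * t + 2] = (double)labels[t].classBegin;
        m[4 * t + 3] = (double)labels[t].classEnd;
    }
    return m;
}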
// ----------------------------------------------------------------------- @@ -596,9 +603,9 @@ class NoiseContrastiveEstimationNode : public ComputationNodeNonLooping /*Comput // samples+probs hidden embedding //Input(inputIndex)->GradientFor(fr).AssignNCEDerivative(m_ncePrediction, Input(0)->ValueFor(fr), Input(1)->ValueFor(fr), Input(2)->Value(), inputIndex); if (inputIndex >= 2) - Input(inputIndex)->Gradient().AssignNCEDerivative(m_ncePrediction, Input(0)->ValueFor(fr), Input(1)->ValueFor(fr), Input(2)->Value(), inputIndex); + Input(inputIndex)->Gradient().AssignNCEDerivative(m_ncePrediction, Input(0)->ValueFor(fr), Input(1)->ValueFor(fr), Input(2)->ValueAsMatrix(), inputIndex); else - Input(inputIndex)->GradientFor(fr).AssignNCEDerivative(m_ncePrediction, Input(0)->ValueFor(fr), Input(1)->ValueFor(fr), Input(2)->Value(), inputIndex); + Input(inputIndex)->GradientFor(fr).AssignNCEDerivative(m_ncePrediction, Input(0)->ValueFor(fr), Input(1)->ValueFor(fr), Input(2)->ValueAsMatrix(), inputIndex); } virtual bool OutputUsedInComputingInputNodesGradients() const override @@ -634,11 +641,11 @@ class NoiseContrastiveEstimationNode : public ComputationNodeNonLooping /*Comput FrameRange fr(Input(0)->GetMBLayout()); if (Input(0)->HasMBLayout() && Input(0)->GetMBLayout()->HasGaps()) LogicError("%ls %ls operation does not handle multiple parallel sequences with gaps correctly. Contact fseide@microsoft.com if you have a need and a test case.", NodeName().c_str(), OperationName().c_str()); - //Input(0)->MaskMissingValueColumnsToZero(fr); + int positive = 0, negative = 0; - if (Input(0)->GetNumRows() == 1) + if (Input(0)->GetSampleLayout().GetNumElements() == 1) { - for (int i = 0; i < Input(0)->GetNumCols(); i++) // BUGBUG: Loops must be over frames, not columns. Columns may contain gaps. + for (int i = 0; i < Input(0)->Value().GetNumCols(); i++) // BUGBUG: Loops must be over frames, not columns. Columns may contain gaps. { if (Input(0)->Value()(0, i) > 0) positive++; @@ -647,38 +654,30 @@ class NoiseContrastiveEstimationNode : public ComputationNodeNonLooping /*Comput } assert(positive * negative == 0); } - if (m_evalMode == NCEEvalMode::Softmax || (Input(0)->GetNumRows() == 1 && positive > 0)) + if (m_evalMode == NCEEvalMode::Softmax || (Input(0)->GetSampleLayout().GetNumElements() == 1 && positive > 0)) { // evaluation uses softmax - m_logSoftmax.AssignProductOf(Input(1)->Value(), true, Input(2)->Value(), false); + m_logSoftmax.AssignProductOf(Input(1)->Value(), true, Input(2)->ValueAsMatrix(), false); m_logSoftmax += Input(3)->Value(); m_logSoftmax.InplaceLogSoftmax(false); MaskMissingColumnsToZero(m_logSoftmax, Input(1)->GetMBLayout(), fr); // TODO: is this the right way to neutralize gaps? Value().AssignSoftmaxSum(Input(0)->Value(), m_logSoftmax); } - else if (m_evalMode == NCEEvalMode::Unnormalized || (Input(0)->GetNumRows() == 1 && negative > 0)) + else if (m_evalMode == NCEEvalMode::Unnormalized || (Input(0)->GetSampleLayout().GetNumElements() == 1 && negative > 0)) { // TODO: are we treating gaps correctly here? - Value().AssignNceUnnormalizedEval(Input(0)->Value(), Input(1)->Value(), Input(2)->Value(), Input(3)->Value()); + Value().AssignNceUnnormalizedEval(Input(0)->Value(), Input(1)->Value(), Input(2)->ValueAsMatrix(), Input(3)->Value()); } else { // TODO: are we treating gaps correctly here? 
// training criterion uses NCE //likelihood samples+probs hidden embedding bias - Value().AssignNoiseContrastiveEstimation(Input(0)->Value(), Input(1)->Value(), Input(2)->Value(), Input(3)->Value(), m_ncePrediction); + Value().AssignNoiseContrastiveEstimation(Input(0)->Value(), Input(1)->Value(), Input(2)->ValueAsMatrix(), Input(3)->Value(), m_ncePrediction); } m_needRecomputeGradientToSoftmaxInput = true; } - /** - Inputs: [0] label in dense matrix in [4 x T] - the first row is the word index, the second row is the class index, the third row is the first word index of the class - the last row is the first word index of the next class - [1] hidden layer activity to the node in [hdsize x T]. for a simple rnn, this is the hidden layer activty - [2] weight matrix in [hdsize x vocab_size], for speed-up, as per word matrix can be simply obtained as column slice - [3] clsprob in dense matrix in [nbr_cls x T]. this is the output from logsoftmax node for the log-posterior probabilty of class given observations - */ virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override { Base::Validate(isFinalValidationPass); @@ -688,14 +687,13 @@ class NoiseContrastiveEstimationNode : public ComputationNodeNonLooping /*Comput LogicError("NoiseContrastiveEstimationNode criterion requires the first input to be the label."); if (isFinalValidationPass) { - if (!(Input(1)->GetNumRows() == Input(2)->GetNumRows())) // input and matrix can be timed + if (Input(1)->GetSampleMatrixNumRows() != Input(2)->GetAsMatrixNumRows()) LogicError("The Matrix dimension for observation and weight in the NoiseContrastiveEstimationNode operation does not match."); - if (!(Input(0)->GetNumCols() == Input(1)->GetNumCols())) // label and input same obs numbers - LogicError("The Matrix dimension for label and observation in the NoiseContrastiveEstimationNode operation does not match."); + if (!Input(0)->HasMBLayout() || !Input(1)->HasMBLayout() || Input(2)->HasMBLayout() || !Input(3)->HasMBLayout()) + LogicError("%ls %ls operation requires inputs 0, 1, and 3 to be a minibatch, and input 2 to be a matrix.", NodeName().c_str(), OperationName().c_str()); } - //cerr << Input(3)->GetNumCols() << "\t" << Input(0)->GetNumCols() << endl; - SetDims(TensorShape(1), 1); + SetDims(TensorShape(1), false); } protected: @@ -719,16 +717,15 @@ template class NoiseContrastiveEstimationNode; template class NoiseContrastiveEstimationNode; // ----------------------------------------------------------------------- -/// ClassBasedCrossEntropyWithSoftmaxNode (labels(.,t), input(.,t), inputweights, clsProbBeforeSoftmax(.,t)) -// Inputs: -// Input(0) [4 x T] label in dense matrix in -// (0,t) the first row is the word index -// (1,t) the second row is the class index -// (2,t) the third row is the first word index of the class -// (3,t) the last row is the first word index of the next class -// Input(1) [hdsize x T] hidden layer activation to the node in. for a simple rnn, this is the hidden layer activty -// Input(2) [hdsize x vocab_size] weight matrix in, for speed-up, as per word matrix can be simply obtained as column slice -// Input(3) [nbr_cls x T] clsprob in dense matrix in. 
this input, if applied softmax on, is the posterior probabilty of class given observations +// ClassBasedCrossEntropyWithSoftmaxNode (labels(.,t), input(.,t), inputweights, clsProbBeforeSoftmax(.,t)) +// - Input(0) [4 x T] label in dense matrix in +// (0,t) the first row is the word index +// (1,t) the second row is the class index +// (2,t) the third row is the first word index of the class +// (3,t) the last row is the first word index of the next class +// - Input(1) [hdsize x T] hidden layer activation to the node in. for a simple rnn, this is the hidden layer activity +// - Input(2) [hdsize x vocab_size] weight matrix in, for speed-up, as per word matrix can be simply obtained as column slice +// - Input(3) [nbr_cls x T] clsprob in dense matrix in. this input, if applied softmax on, is the posterior probability of class given observations // ----------------------------------------------------------------------- // calculates: -sum(left_i * log(softmax_i(right))) for class given history and for word given history @@ -788,7 +785,7 @@ class ClassBasedCrossEntropyWithSoftmaxNode : public ComputationNodeNonLooping / size_t nbr_wrd = (rgt_bnd - lft_bnd); // number of words in the class // compute prb - 1 and prb - Matrix weightForClass = Input(2)->Value().ColumnSlice(lft_bnd, nbr_wrd); + Matrix weightForClass = Input(2)->ValueAsMatrix().ColumnSlice(lft_bnd, nbr_wrd); Matrix obs = Input(1)->ValueFor(fr); // hidden activation vector for current word token Matrix grd_to_soft_max_input = m_grdToSoftMaxInput.ColumnSlice(sz, nbr_wrd); Matrix grd_to_cls_prob = DataWithMBLayoutFor(m_clsLogSoftmax, fr, Input(3)->GetMBLayout()); @@ -802,7 +799,7 @@ class ClassBasedCrossEntropyWithSoftmaxNode : public ComputationNodeNonLooping / break; case 2: // gradient to input weight - grd_to_wgt_t = Input(2)->Gradient().ColumnSlice(lft_bnd, nbr_wrd); + grd_to_wgt_t = Input(2)->GradientAsMatrix().ColumnSlice(lft_bnd, nbr_wrd); Matrix::MultiplyAndAdd(obs, false, grd_to_soft_max_input, false, grd_to_wgt_t); break; case 3: @@ -880,10 +877,10 @@ class ClassBasedCrossEntropyWithSoftmaxNode : public ComputationNodeNonLooping / LogicError("ClassBasedCrossEntropyWithSoftmax (ForwardPropNonLooping()): The label matrix is not using CPU device. This will make computation slow, even though the label data is probably saved on GPU. Because of the external loop over time with explicit class id retrieved from the label matrix, the computation will be very slow if the label matrix is saved on GPU. However, this is only a constraint for label matrix and other matrices such as data are suggested to reside on GPU. 
"); // (the below is left-over from refactoring) - Matrix& functionValues = Value(); + auto functionValues = Value(); - const size_t hdSize = Input(1)->GetNumRows(); // hdSize - assert(m_nbrCls == Input(3)->GetNumRows()); + const size_t hdSize = Input(1)->GetSampleMatrixNumRows(); // hdSize + assert(m_nbrCls == Input(3)->GetSampleMatrixNumRows()); // compute the class posteriors m_clsLogSoftmax = Input(3)->Value(); @@ -940,7 +937,7 @@ class ClassBasedCrossEntropyWithSoftmaxNode : public ComputationNodeNonLooping / // now get views of various arrays that correspond to the index range of words belonging to this class // get hidden vectors for the words in this class - Matrix weightForClass = Input(2)->Value().ColumnSlice(lft_bnd, nbr_wrd); // [hdSize x nbr_wrd] + Matrix weightForClass = Input(2)->ValueAsMatrix().ColumnSlice(lft_bnd, nbr_wrd); // [hdSize x nbr_wrd] // buffer to hold the class-conditional distribution Matrix softMax_t = m_softMax.ColumnSlice(sz, nbr_wrd); @@ -990,17 +987,17 @@ class ClassBasedCrossEntropyWithSoftmaxNode : public ComputationNodeNonLooping / LogicError("ClassBasedCrossEntropyWithSoftmaxNode criterion requires the first input to be the label."); if (isFinalValidationPass) { - if (Input(0)->GetNumRows() != 4) // label needs to be 4 rows + if (Input(0)->GetSampleMatrixNumRows() != 4) // label needs to be 4 rows LogicError("The label in the ClassBasedCrossEntropyWithSoftmaxNode operation needs to be 4 rows."); - if (Input(1)->GetNumRows() != Input(2)->GetNumRows()) // input and matrix can be timed + if (Input(1)->GetSampleMatrixNumRows() != Input(2)->GetAsMatrixNumRows()) // input and matrix can be timed LogicError("The Matrix dimension for observation and weight in the ClassBasedCrossEntropyWithSoftmaxNode operation does not match."); if (Input(0)->GetMBLayout() != Input(1)->GetMBLayout() || Input(0)->GetMBLayout() != Input(3)->GetMBLayout()) InvalidArgument("%ls %ls operation requires that the layouts of inputs 0 (label), 1 (hidden activation), and 3 (log softmax) match.", NodeName().c_str(), OperationName().c_str()); } - SetDims(TensorShape(1), 1); + SetDims(TensorShape(1), false); - m_nbrCls = Input(3)->GetNumRows(); + m_nbrCls = Input(3)->GetSampleMatrixNumRows(); } protected: @@ -1025,10 +1022,10 @@ template class ClassBasedCrossEntropyWithSoftmaxNode; // ----------------------------------------------------------------------- // CRFNode (labels, position_dependent_scores, transition_scores) -// - labels : output label vector of [0:T-1] -// - position_dependent_scores [?] : score from position dependent node, +// - labels: output label vector of [0:T-1] +// - position_dependent_scores [0:T-1]: score from position dependent node, // in the R-CRF case, it is the RNN output score before softmax -// - transition scores [?] : score from the transition node, +// - transition scores: square transition matrix, --TODO: log? // in the R-CRF case, it is the transition probability between labels // BUGBUG: This node cannot operate with truncated BPTT, but does not detect it. It also does not handle gaps or test boundary flags. 
// ----------------------------------------------------------------------- @@ -1072,8 +1069,8 @@ class CRFNode : public ComputationNodeNonLooping /*ComputationNode*/, virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override { FrameRange fr(Input(0)->GetMBLayout()); - size_t nrow = Input(0)->GetNumRows(); - size_t ncol = Input(0)->GetNumCols(); + size_t nrow = Input(0)->Value().GetNumRows(); + size_t ncol = Input(0)->Value().GetNumCols(); mAlpha.Resize(nrow, ncol); mBeta.Resize(nrow, ncol); @@ -1088,7 +1085,7 @@ class CRFNode : public ComputationNodeNonLooping /*ComputationNode*/, for (size_t i = 0; i < nS; i++) // process parallel sequences one by one --BUGBUG: We should loop over individual sequences. { FrameRange sequenceRange = fr.Sequence(i); // FrameRange to select one sequence - // BUGBUG: This ^^ is neither supported nor correct, since this code does not handle gaps or start/end flags + // BUGBUG: This ^^ is neither supported nor correct, since this code does not handle gaps or start/end flags. ForwardPropS( DataWithMBLayoutFor(mPostProb, sequenceRange, Input(0)->GetMBLayout()), DataWithMBLayoutFor(mAlpha, sequenceRange, Input(0)->GetMBLayout()), @@ -1096,7 +1093,7 @@ class CRFNode : public ComputationNodeNonLooping /*ComputationNode*/, funcVal, Input(0)->ValueFor(sequenceRange), Input(1)->ValueFor(sequenceRange), - Input(2)->Value(), mStartLbl, + Input(2)->ValueAsMatrix(), mStartLbl, mEndLbl); Value() += funcVal; // aggregate over sequences @@ -1122,11 +1119,11 @@ class CRFNode : public ComputationNodeNonLooping /*ComputationNode*/, for (size_t i = 0; i < nS; i++) // process all sequences one by one { FrameRange sequenceRange = fr.Sequence(i); // FrameRange to select one sequence - auto gradient = Input(2)->GradientFor(fr); + auto gradient = Input(2)->GradientAsMatrix(); TransGrdCompute(Input(0)->ValueFor(sequenceRange), DataWithMBLayoutFor(mAlpha, sequenceRange, Input(0)->GetMBLayout()), DataWithMBLayoutFor(mBeta, sequenceRange, Input(0)->GetMBLayout()), - Input(2)->ValueFor(fr), + Input(2)->ValueAsMatrix(), gradient, mStartLbl, 1); } @@ -1287,15 +1284,16 @@ class CRFNode : public ComputationNodeNonLooping /*ComputationNode*/, m_pMBLayout = nullptr; // this node does not hold mini-batch data if (isFinalValidationPass) - if (!(Input(1)->GetNumRows() == Input(2)->GetNumRows() && // position dependent and pair scores have same number of labels - Input(0)->GetNumRows() == Input(1)->GetNumRows() && - Input(0)->GetNumCols() == Input(1)->GetNumCols() && // position dependent and pair scores have the same observation numbers - Input(2)->GetNumCols() == Input(2)->GetNumRows())) + if (!(Input(1)->GetSampleMatrixNumRows() == Input(2)->GetAsMatrixNumRows() && // position dependent and pair scores have same number of labels + Input(0)->GetSampleMatrixNumRows() == Input(1)->GetSampleMatrixNumRows() && + Input(0)->HasMBLayout() && Input(0)->GetMBLayout() == Input(1)->GetMBLayout() && + //Input(0)->GetNumCols() == Input(1)->GetNumCols() && // position dependent and pair scores have the same observation numbers + Input(2)->GetAsMatrixNumCols() == Input(2)->GetAsMatrixNumRows())) { LogicError("The Matrix dimension in the CRFNode operation does not match."); } - SetDims(TensorShape(1), 1); + SetDims(TensorShape(1), false); } virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override @@ -1323,10 +1321,10 @@ class CRFNode : public ComputationNodeNonLooping /*ComputationNode*/, // 
----------------------------------------------------------------------- /// SequenceWithSoftmaxNode (label, prediction, loglikelihood) -// word-lattice based sequence training criterion -// BUGBUG: Likely not very useful since it uses an MS-proprietary lattice-archive format -// that requires Frank's DBN.exe tool to create. The inner C++ code for conversion -// is in this repo (latticearchive.h), but not the outer main program. +// word-lattice based sequence training criterion +// This node is likely of limited use externally, since it relies on a Microsoft-proprietary lattice-archive format +// that requires Frank's DBN.exe tool to create. The inner C++ code for converting HTK lattices +// into this format is in this repo (latticearchive.h), but not the outer main program. // ----------------------------------------------------------------------- template @@ -1436,7 +1434,7 @@ class SequenceWithSoftmaxNode : public ComputationNodeNonLooping, publ m_softmaxOfRight->InplaceExp(); m_gammaFromLattice->SwitchToMatrixType(m_softmaxOfRight->GetMatrixType(), m_softmaxOfRight->GetFormat(), false); - m_gammaFromLattice->Resize(m_softmaxOfRight->GetNumRows(), m_softmaxOfRight->GetNumCols()); + m_gammaFromLattice->Resize(*m_softmaxOfRight); m_gammaCalculator.calgammaformb(Value(), m_lattices, Input(2)->Value() /*log LLs*/, Input(0)->Value() /*labels*/, *m_gammaFromLattice, m_uids, m_boundaries, Input(1)->GetNumParallelSequences(), @@ -1459,15 +1457,16 @@ class SequenceWithSoftmaxNode : public ComputationNodeNonLooping, publ LogicError("SequenceWithSoftmaxNode criterion requires the first input to be the label."); if (isFinalValidationPass) - if (!(Input(0)->GetNumRows() == Input(1)->GetNumRows() && //match size - Input(1)->GetNumRows() == Input(2)->GetNumRows() && - Input(0)->GetNumCols() == Input(1)->GetNumCols() && - Input(1)->GetNumCols() == Input(2)->GetNumCols())) + if (!(Input(0)->GetSampleMatrixNumRows() == Input(1)->GetSampleMatrixNumRows() && // match size + Input(1)->GetSampleMatrixNumRows() == Input(2)->GetSampleMatrixNumRows() && + Input(0)->HasMBLayout() && + Input(0)->GetMBLayout() == Input(1)->GetMBLayout() && + Input(0)->GetMBLayout() == Input(2)->GetMBLayout())) { LogicError("The Matrix dimension in the SequenceWithSoftmaxNode operation does not match."); } - SetDims(TensorShape(1), 1); + SetDims(TensorShape(1), false); m_gammatime = 0; m_partialtime = 0; @@ -1623,7 +1622,7 @@ class LogisticNode : public ComputationNodeNonLooping /*ComputationNode*/AssignDifferenceOf(Input(0)->ValueFor(fr), *m_classZeroLabels); // TODO: need a slice for m_classZeroLabels? // Multiply the vector by the Input(2)->Value() - if (m_inputs.size() == 3) // without weight + if (m_inputs.size() == 3) // with weight m_temp->AssignElementProductOf(*m_temp, Input(2)->ValueFor(fr)); // TODO: is Input(2) minibatch data? Confirm
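The criterion being differentiated in this backward pass is the (optionally weighted) binary cross-entropy stated in the comment further below: -sum(left * log(right) + (1-left)*log(1-right)). As a reference point, here is a minimal stand-alone sketch of the forward value on raw arrays; the function name and signature are illustrative, not CNTK API:

    #include <cmath>
    #include <cstddef>

    // Weighted binary cross-entropy: -sum_i w_i * (y_i*log(p_i) + (1-y_i)*log(1-p_i)).
    // Passing weight == nullptr corresponds to the two-input form of LogisticNode,
    // i.e. all weights equal to 1.
    double LogisticLoss(const double* y, const double* p, const double* weight, size_t n)
    {
        double sum = 0;
        for (size_t i = 0; i < n; i++)
        {
            double term = y[i] * std::log(p[i]) + (1 - y[i]) * std::log(1 - p[i]);
            sum += (weight ? weight[i] : 1.0) * term;
        }
        return -sum;
    }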
// divide class by p (class 1) or (1-p) (class 0) @@ -1640,12 +1639,12 @@ class LogisticNode : public ComputationNodeNonLooping /*ComputationNode*/Resize(Input(0)->GetNumRows(), Input(0)->GetNumCols()); - m_result->Resize(Input(0)->GetNumRows(), Input(0)->GetNumCols()); - m_temp->Resize(Input(0)->GetNumRows(), Input(0)->GetNumCols()); + m_classZeroLabels->Resize(Input(0)->Value()); + m_result->Resize(Input(0)->Value()); + m_temp->Resize(Input(0)->Value()); } - //-sum(left * log(right) + (1-left)*log(1-right)) (optionally * weight) + // -sum(left * log(right) + (1-left)*log(1-right)) (optionally * weight) virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override { FrameRange fr(Input(0)->GetMBLayout()); @@ -1695,19 +1694,16 @@ class LogisticNode : public ComputationNodeNonLooping /*ComputationNode*/GetNumRows() == 0 ? other->GetNumRows() /*borrow from peer*/ : in->GetNumRows() /*keep as is*/; - size_t cols = (!in->HasMBLayout() && in->GetNumCols() == 0) ? other->GetNumCols() /*borrow from peer*/ : in->GetNumCols() /*keep as is*/; - - ValidateInferInputDims(2, rows, cols); + weights->ValidateInferInputDimsFrom(other->GetSampleLayout()); if (isFinalValidationPass && - !(Input(0)->GetNumRows() == Input(2)->GetNumRows() && - (Input(0)->HasMBLayout() || (Input(0)->GetNumCols() == Input(2)->GetNumCols())))) + !(Input(0)->GetSampleMatrixNumRows() == Input(2)->GetSampleMatrixNumRows() && + (Input(0)->GetMBLayout() == Input(2)->GetMBLayout() || !Input(2)->HasMBLayout()))) { - LogicError("The Matrix dimensions of the second argument in the %ls %ls operation do not match.", NodeName().c_str(), OperationName().c_str()); + LogicError("The Matrix dimensions of the second argument (weights) in the %ls %ls operation do not match.", NodeName().c_str(), OperationName().c_str()); } } } diff --git a/Source/EvalDll/CNTKEval.cpp b/Source/EvalDll/CNTKEval.cpp index da248c3d67a8..6336a31673ee 100644 --- a/Source/EvalDll/CNTKEval.cpp +++ b/Source/EvalDll/CNTKEval.cpp @@ -103,7 +103,7 @@ void CNTKEval::GetNodeDimensions(std::map& dimen for (auto& node : nodes) { std::wstring name = node->NodeName(); - size_t size = node->GetNumRows(); + size_t size = node->GetSampleMatrixNumRows(); dimensions[name] = size; } break; @@ -114,7 +114,7 @@ void CNTKEval::GetNodeDimensions(std::map& dimen for (auto& node : nodes) { std::wstring name = node->NodeName(); - size_t size = node->GetNumRows(); + size_t size = node->GetSampleMatrixNumRows(); dimensions[name] = size; } break; @@ -123,7 +123,7 @@ void CNTKEval::GetNodeDimensions(std::map& dimen for (auto iter = dimensions.begin(); iter != dimensions.end(); iter++) { auto node = m_net->GetNodeFromName(iter->first); - iter->second = node->GetNumRows(); + iter->second = node->GetSampleMatrixNumRows(); } break; } diff --git a/Source/SGDLib/DataReaderHelpers.h b/Source/SGDLib/DataReaderHelpers.h index 42fef1607f10..610578400b8b 100644 --- a/Source/SGDLib/DataReaderHelpers.h +++ b/Source/SGDLib/DataReaderHelpers.h @@ -24,7 +24,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Note: This will go away with the redesigned reader interface. // TODO: callers of this often do ComputationNetwork::BumpEvalTimeStamp(featureNodes) and also for labels; we should eliminate the need for this. 
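Two related changes follow: GetMinibatchIntoNetwork is restructured around an early exit, and minibatches are decimated for data-parallel training by giving each MPI worker a contiguous range of parallel sequences. The range arithmetic, extracted into a stand-alone sketch (the helper name is illustrative; it mirrors the index computation inside DecimateMinibatch further below):

    #include <cstddef>
    #include <utility>

    // Half-open range [st, en) of parallel sequences that worker 'rank' out of
    // 'numWorker' keeps after decimation.
    std::pair<size_t, size_t> DecimationRange(size_t numParallelSequences, size_t rank, size_t numWorker)
    {
        size_t st = numParallelSequences * rank / numWorker;
        size_t en = numParallelSequences * (rank + 1) / numWorker;
        if (en > numParallelSequences || rank == numWorker - 1)
            en = numParallelSequences; // clamp; the last worker picks up any remainder
        return std::make_pair(st, en);
    }

For example, 7 parallel sequences over 2 workers yields [0,3) for rank 0 and [3,7) for rank 1; a worker whose range comes up empty simply carries a 0-column minibatch, which the surrounding comments treat as valid.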
- template + template static bool GetMinibatchIntoNetwork(IDataReader& trainSetDataReader, ComputationNetworkPtr net, ComputationNodeBasePtr criterionNode, @@ -41,26 +41,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { // - VerifyActualNumParallelSequences() --(refactoring left-over) verify that MBLayout is consistent with #parallel sequences // with the special twist that in presence of parallelization, there is some decimation involved. - bool wasDataRead = trainSetDataReader.GetMinibatch(inputMatrices); // fill in the minibatch data into the Input nodes' buffers directly + bool wasDataRead = trainSetDataReader.GetMinibatch(inputMatrices); // fill in the minibatch data into the Input nodes' buffers directly // If this returns false, the matrices may contain garbage or not sized to 0 columns. // On the other hand, if it returns a 0-column matrix, that would be a perfectly cromulent minibatch (in case of data parallelism with distributed reading). - if (wasDataRead) - { - trainSetDataReader.CopyMBLayoutTo(pMBLayout); // get layout meta-data - - // reader will have resized input node's m_value directly. Nodes must be notified to do necessary internal state updates from that. - // TODO: This is a stopgap. SGD will at some point change from sets of matrices to sets of nodes. Then this will become much simpler. - std::set*> matrices; - for (const auto & iter : inputMatrices) - matrices.insert(iter.second); - for (auto & node : net->FeatureNodes()) - if (matrices.find(&node->As>()->Value()) != matrices.end()) - node->NotifyFunctionValuesMBSizeModified(); - for (auto & node : net->LabelNodes()) - if (matrices.find(&node->As>()->Value()) != matrices.end()) - node->NotifyFunctionValuesMBSizeModified(); - } + // if no data read then we are done + if (!wasDataRead) + return false; // get some additional information when doing sequence training // TODO: This should not need to be called in case of wasDataRead == false, since in that case, returned values are invalid. @@ -75,19 +62,28 @@ namespace Microsoft { namespace MSR { namespace CNTK { trainSetDataReader.GetMinibatch4SE(*latticeinput, *uids, *boundaries, *extrauttmap); } - // if no data read then we are done - if (!wasDataRead) - return false; + // get layout meta-data + trainSetDataReader.CopyMBLayoutTo(pMBLayout); // decimate if needed. Decimation happens in-place. if (!useDistributedMBReading && useParallelTrain) - { DecimateMinibatch(inputMatrices, g_mpi->NumNodesInUse(), g_mpi->CurrentNodeRank(), net->GetMBLayoutPtr()); - net->NotifyInputNodesFunctionValuesMBSizeModified(); // #matrix columns changes - } + + // reader will have resized input node's m_value directly. Nodes must be notified to do necessary internal state updates from that. + // TODO: This is a stopgap. SGD will at some point change from sets of matrices to sets of nodes. Then this will become much simpler. + std::set*> matrices; + for (const auto & iter : inputMatrices) + matrices.insert(iter.second); + for (auto & node : net->FeatureNodes()) + if (matrices.find(&node->As>()->Value()) != matrices.end()) + node->NotifyFunctionValuesMBSizeModified(); + for (auto & node : net->LabelNodes()) + if (matrices.find(&node->As>()->Value()) != matrices.end()) + node->NotifyFunctionValuesMBSizeModified(); // get MB size and tell Network to update its nodes' buffers based on what's in the input matrices // Note: Decimation may have reduced this to 0 frames. We still must return 'true'. 
+ // BUGBUG: This has a definitional problem once we support multiple feature streams with different lengths. actualMBSize = net->DetermineActualMBSizeFromFeatures(); return true; @@ -97,19 +93,19 @@ namespace Microsoft { namespace MSR { namespace CNTK { // DecimateMinibatch - decimate minibatch for parallelization // ------------------------------------------------------------------- // non-inplace decimation , to be used in subminibatch implementation - // return [st, en) parallel sequence which has been selected after decimation - template - static pair DecimateMinibatch(const std::map*> MB, // input matrices - std::map*>& decimatedMB, // output decimated matrices. - MBLayoutPtr pMBLayout, // input MBLayout - MBLayoutPtr& pDecimateMBLayout, // output decimated MBLayout (note: cannot work in-place) - int numWorker, int rank) + // returns a subset of parallel sequences + template + static pair DecimateMinibatch(const std::map*> MB, // input matrices + std::map*>& decimatedMB, // output decimated matrices. + MBLayoutPtr pMBLayout, // input MBLayout + MBLayoutPtr& pDecimateMBLayout, // output decimated MBLayout (note: cannot work in-place) + int numWorker, int rank) { size_t numParallelSequences = pMBLayout->GetNumParallelSequences(); size_t nT = pMBLayout->GetNumTimeSteps(); // decide start column and end column - size_t st = numParallelSequences * (size_t) rank / numWorker; + size_t st = numParallelSequences * (size_t)rank / numWorker; size_t en = numParallelSequences * (size_t)(rank + 1) / numWorker; en = en > numParallelSequences ? numParallelSequences : en; // TODO: why are these two tests necessary? en = (rank == numWorker - 1) ? numParallelSequences : en; @@ -143,8 +139,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { pDecimateMBLayout = make_shared(numNewParallelSequence, nT); #if 1 // now copy over all sequence info records that are inside the range, with adjusted 's' - const auto& sequences = pMBLayout->GetAllSequences(); - for (const auto& seq : sequences) + const auto& sequences = pMBLayout->GetAllSequences(); + for (const auto& seq : sequences) { if (seq.s >= st && seq.s < en) { @@ -163,11 +159,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { } // in-place decimation, for use with data-parallel processing - // return [st, en) parallell sequence which has been selected after decimation - template - static pair DecimateMinibatch(std::map*>& mb, // matrix to be decimated - int numprocs, int rank, // rank info - MBLayoutPtr pMBLayout) // get decimated as well + // returns a subset of parallel sequences + template + static pair DecimateMinibatch(std::map*>& mb, // matrix to be decimated + int numprocs, int rank, // rank info + MBLayoutPtr pMBLayout) // get decimated as well { if (numprocs == 1) return pair(0, pMBLayout->GetNumParallelSequences()); @@ -177,7 +173,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { MBLayoutPtr pDecimatedMB = make_shared(); std::map*> decimatedMB; // call in-place decimation - pair selected = DecimateMinibatch(mb, decimatedMB, pMBLayout, pDecimatedMB, numprocs, rank); + pair selected = DecimateMinibatch(mb, decimatedMB, pMBLayout, pDecimatedMB, numprocs, rank); // move the data for (auto k : mb) { @@ -211,8 +207,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // } template - class SubminibatchDispatcher - { + class SubminibatchDispatcher + { private: typedef std::vector> Lattice; typedef std::vector Uid; diff --git a/Source/SGDLib/MultiNetworksEvaluator.h b/Source/SGDLib/MultiNetworksEvaluator.h index 
16dd6f77b6e2..55554da8777a 100644 --- a/Source/SGDLib/MultiNetworksEvaluator.h +++ b/Source/SGDLib/MultiNetworksEvaluator.h @@ -311,7 +311,7 @@ class MultiNetworksEvaluator : public SimpleEvaluator for (auto ptr = decoderEvaluationNodes.begin(); ptr != decoderEvaluationNodes.end(); ptr++, i++) { decoderNet->ForwardProp(*ptr); - if ((*ptr)->GetNumRows() != 1 || (*ptr)->GetNumCols() != 1) + if ((*ptr)->GetSampleLayout().GetNumElements() != 1) LogicError("EvaluateEncoderDecoderWithHiddenStates: decoder evaluation should return a scalar value"); evalResults += (double) (*ptr)->Get00Element(); @@ -597,7 +597,7 @@ class MultiNetworksEvaluator : public SimpleEvaluator { ComputationNodeBasePtr node = *nodeIter; node->ForwardProp(FrameRange(node->GetMBLayout(), atTime)); - if (node->GetNumCols() != node->GetNumParallelSequences()) + if (node->GetSampleMatrixNumCols() != node->GetNumParallelSequences()) RuntimeError("preComputeActivityAtTime: the function values has to be a single column matrix "); } } @@ -650,7 +650,9 @@ class MultiNetworksEvaluator : public SimpleEvaluator size_t bSize = best_path.size(); for (int i = 0; i < outputNodes.size(); i++) { +#if 0 // This call no longer exists. This must be updated to make it functional again. outputNodes[i]->SetNumCols(bSize); +#endif dynamic_pointer_cast>(outputNodes[i])->UpdateFunctionValuesSize(); dynamic_pointer_cast>(outputNodes[i])->Value().SetValue(0); for (int k = 0; k < bSize; k++) @@ -781,8 +783,10 @@ class MultiNetworksEvaluator : public SimpleEvaluator /// need to set the minibatch size to 1, and initialize evalnet's sentence start information to let it know that this /// is the begining of sentence +#if 0 // This call no longer exists. This must be updated to make it functional again. for (auto ptr = featureNodes.begin(); ptr != featureNodes.end(); ptr++) (*ptr)->SetNumCols(1); +#endif // TODO: ^^ this is the same as ResizeAllFeatureNodes() if featureNodes == evalnet.FeatureNodes(). Is it? //evalnet->SetActualMiniBatchSizeFromFeatures(); diff --git a/Source/SGDLib/MultiNetworksSGD.h b/Source/SGDLib/MultiNetworksSGD.h index beaf3d84ebd1..e0bf8a2e4612 100644 --- a/Source/SGDLib/MultiNetworksSGD.h +++ b/Source/SGDLib/MultiNetworksSGD.h @@ -256,11 +256,13 @@ class MultiNetworksSGD : SGD learnableNodes.push_back(*nodeIter); std::list> smoothedGradients; +#if 0 // No longer functional due to lack of GetNumCols(). for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++) { ComputationNodePtr node = dynamic_pointer_cast>(*nodeIter); smoothedGradients.push_back(Matrix(node->GetNumRows(), node->GetNumCols(), node->Value().GetDeviceId())); } +#endif vector epochCriterion; double avgCriterion, prevCriterion; @@ -554,11 +556,13 @@ class MultiNetworksSGD : SGD } std::list> smoothedGradients; +#if 0 // No longer functional due to lack of GetNumCols(). 
for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++) { ComputationNodePtr node = dynamic_pointer_cast>(*nodeIter); smoothedGradients.push_back(Matrix(node->GetNumRows(), node->GetNumCols(), node->Value().GetDeviceId())); } +#endif double epochCriterion, avgCriterion, prevCriterion; epochCriterion = std::numeric_limits::infinity(); @@ -1029,8 +1033,8 @@ class MultiNetworksSGD : SGD for (size_t itry = 0; itry < min((size_t) 10, node->Value().GetNumElements()); itry++) { - int irow = (int) fmod(rand(), node->GetNumRows() - 1); - int icol = (int) fmod(rand(), node->GetNumCols() - 1); + int irow = (int) fmod(rand(), node->Value().GetNumRows() - 1); + int icol = (int) fmod(rand(), node->Value().GetNumCols() - 1); irow = max(0, irow); icol = max(0, icol); diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index 718ad50334da..98c26832d705 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -230,8 +230,8 @@ template for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++) { ComputationNodePtr node = dynamic_pointer_cast>(*nodeIter); - smoothedGradients.push_back(Matrix(node->GetNumRows(), - node->GetNumCols(), + smoothedGradients.push_back(Matrix(node->Value().GetNumRows(), + node->Value().GetNumCols(), net->GetDeviceId())); } @@ -1363,7 +1363,7 @@ template fprintf(stderr, "\nPrecomputing --> %lu PreCompute nodes found.\n\n", nodes.size()); for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++) { - auto node = static_pointer_cast>(*nodeIter); + auto node = static_pointer_cast>(*nodeIter); fprintf(stderr, "\tNodeName: %ls\n", (node->NodeName()).c_str()); } @@ -1381,7 +1381,7 @@ template // initialize for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++) { - auto node = static_pointer_cast>(*nodeIter); + auto node = static_pointer_cast>(*nodeIter); node->MarkComputed(false /*begin accumulating*/); } size_t actualMBSizeDummy; @@ -1396,7 +1396,7 @@ template // finalize for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++) { - auto node = static_pointer_cast>(*nodeIter); + auto node = static_pointer_cast>(*nodeIter); node->MarkComputed(true /*done accumulating*/); } fprintf(stderr, "\nPrecomputing --> Completed.\n\n"); @@ -2336,8 +2336,8 @@ template for (size_t itry = 0; itry < min((size_t) 50, node->Value().GetNumElements()); itry++) { /// no support to sparse matrix yet - int irow = (int) fmod(rand(), node->GetNumRows() - 1); - int icol = (int) fmod(rand(), node->GetNumCols() - 1); + int irow = (int) fmod(rand(), node->Gradient().GetNumRows() - 1); + int icol = (int) fmod(rand(), node->Gradient().GetNumCols() - 1); irow = max(0, irow); icol = max(0, icol); diff --git a/Source/SGDLib/SimpleEvaluator.h b/Source/SGDLib/SimpleEvaluator.h index 7a496e698179..439a04caa465 100644 --- a/Source/SGDLib/SimpleEvaluator.h +++ b/Source/SGDLib/SimpleEvaluator.h @@ -56,8 +56,7 @@ class SimpleEvaluator const auto& node = m_net->GetNodeFromName(evalNodeNames[i]); if (!criteriaLogged.insert(node).second) continue; - //m_net->BuildAndValidateSubNetwork(node); - if (node->GetNumRows() != 1 || node->GetNumCols() != 1) + if (node->GetSampleLayout().GetNumElements() != 1) InvalidArgument("Criterion nodes to evaluate must have dimension 1x1."); evalNodes.push_back(node); } diff --git a/Tests/EndToEndTests/Speech/README_Windows_Debug_commands.txt b/Tests/EndToEndTests/Speech/README_Windows_Debug_commands.txt index d932946725bc..b118c43b6f8c 100644 --- 
a/Tests/EndToEndTests/Speech/README_Windows_Debug_commands.txt +++ b/Tests/EndToEndTests/Speech/README_Windows_Debug_commands.txt @@ -11,7 +11,7 @@ Linux: bin/cntk currentDirectory=Tests/EndToEndTests/Speech/Data configFi --- Speech\Simple: -COMMAND: currentDirectory=$(SolutionDir)Tests\EndToEndTests\Speech\Data configFile=..\Simple\cntk.config RunDir=../RunDir/Simple stderr=../RunDir/Simple/models/cntkSpeech.dnn.log DataDir=$(SolutionDir)Tests\EndToEndTests\Speech\Data ConfigDir=$(SolutionDir)Tests\EndToEndTests\Speech\Simple DeviceId=auto makeMode=false +COMMAND: currentDirectory=$(SolutionDir)Tests\EndToEndTests\Speech\Data configFile=..\Simple\cntk.config RunDir=$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\Simple stderr=../RunDir/Simple/models/cntkSpeech.dnn.log DataDir=$(SolutionDir)Tests\EndToEndTests\Speech\Data ConfigDir=$(SolutionDir)Tests\EndToEndTests\Speech\Simple DeviceId=auto makeMode=false --- Speech\LSTM\Truncated: @@ -37,9 +37,9 @@ Using full BrainScript configuration COMMAND: --cd $(SolutionDir)Tests\EndToEndTests\Speech\Data -f $(SolutionDir)Tests\EndToEndTests\Speech\LSTM\lstm.bs -D stderr='$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance\models\cntkSpeech.dnn.log' -D RunDir='$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\LSTM\FullUtterance' -D NdlDir='$(SolutionDir)Tests\EndToEndTests\Speech\LSTM' -D DataDir='.' -D DeviceId='Auto' -D Truncated=false -D speechTrain=[reader=[nbruttsineachrecurrentiter=1];SGD=[epochSize=2560;maxEpochs=2;numMBsToShowResult=1]] -D makeMode=false ---- Speech\DiscriminativePreTraining: +--- Speech\DiscriminativePreTraining: --currently fails with MEL error "Parameter name could not be resolved 'HL2.y'" -COMMAND: currentDirectory=$(SolutionDir)Tests\EndToEndTests\Speech\Data configFile=..\DNN\DiscriminativePreTraining\cntk_dpt.config stderr=..\RunDir\DNN\DiscriminativePreTraining\models\cntkSpeech.dnn.log ConfigDir=$(SolutionDir)Tests\EndToEndTests\Speech\DNN\DiscriminativePreTraining RunDir=..\RunDir\DNN\DiscriminativePreTraining DataDir=. DeviceId=auto makeMode=false +COMMAND: currentDirectory=$(SolutionDir)Tests\EndToEndTests\Speech\Data configFile=..\DNN\DiscriminativePreTraining\cntk_dpt.config stderr=$(SolutionDir)Tests\EndToEndTests\Speech\RunDir\DNN\DiscriminativePreTraining\models\cntkSpeech.dnn.log ConfigDir=$(SolutionDir)Tests\EndToEndTests\Speech\DNN\DiscriminativePreTraining RunDir=..\RunDir\DNN\DiscriminativePreTraining DataDir=. DeviceId=auto makeMode=false --- Speech\SequenceTraining: