From 06e38a9fbdade93413f69f026ff16484c55d1682 Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Wed, 9 Mar 2016 09:47:52 -0800
Subject: [PATCH 01/26] refactored node formatting from SimpleOutputWriter into
 ComputationNode, for use as a debugging aid. No code change other than
 refactoring-related

---
 .../ComputationNetworkLib/ComputationNode.cpp | 103 +++++++++++++++++
 .../ComputationNetworkLib/ComputationNode.h   |   5 +
 Source/SGDLib/SimpleOutputWriter.h            | 106 ++----------------
 3 files changed, 118 insertions(+), 96 deletions(-)

diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp
index 4c57c58380e1..d983399903bc 100644
--- a/Source/ComputationNetworkLib/ComputationNode.cpp
+++ b/Source/ComputationNetworkLib/ComputationNode.cpp
@@ -229,6 +229,109 @@ template <class ElemType>
     }
 }
 
+// write out the content of a node in formatted/readable form
+template <class ElemType>
+void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, bool transpose, bool isCategoryLabel, const std::vector<std::string>& labelMapping,
+                                                             const string& sequenceSeparator, const string& sequencePrologue, const string& sequenceEpilogue, const string& elementSeparator, const string& sampleSeparator,
+                                                             const string& valueFormatString) const
+{
+    // get it (into a flat CPU-side vector)
+    const Matrix<ElemType>& outputValues = Value();
+    size_t tempArraySize = 0;
+    ElemType* tempArray = nullptr;
+    outputValues.CopyToArray(tempArray, tempArraySize);
+
+    // process all sequences one by one
+    auto pMBLayout = GetMBLayout();
+    if (!pMBLayout) // no MBLayout: We are printing aggregates (or LearnableParameters?)
+    {
+        pMBLayout = make_shared<MBLayout>();
+        pMBLayout->InitAsFrameMode(1); // treat this as if we have one single sample
+        // TODO: This can be done more efficiently, if ever needed.
+    }
+    const auto& sequences = pMBLayout->GetAllSequences();
+    size_t colStride = pMBLayout->GetNumParallelSequences() * outputValues.GetNumRows(); // how to get from one column to the next
+    size_t width = pMBLayout->GetNumTimeSteps();
+    for (size_t s = 0; s < sequences.size(); s++)
+    {
+        const auto& seqInfo = sequences[s];
+        size_t tBegin = seqInfo.tBegin >= 0 ? seqInfo.tBegin : 0;
+        size_t tEnd = seqInfo.tEnd <= width ? seqInfo.tEnd : width;
+
+        // current sequence is a matrix with 'colStride' beginning at the following pointer
+        ElemType* pCurValue = tempArray + s * outputValues.GetNumRows() + seqInfo.tBegin;
+
+        if (s > 0)
+            fprintfOrDie(f, "%s", sequenceSeparator.c_str());
+        fprintfOrDie(f, "%s", sequencePrologue.c_str());
+
+        // output it according to our format specification
+        let formatChar = valueFormatString.back();
+        size_t dim = outputValues.GetNumRows();
+        size_t T = tEnd - tBegin;
+        if (isCategoryLabel)
+        {
+            if (formatChar == 's') // verify label dimension
+            {
+                if (outputValues.GetNumRows() != labelMapping.size())
+                    InvalidArgument("write: Row dimension %d does not match number of entries %d in labelMappingFile", (int)dim, (int)labelMapping.size());
+            }
+            // update the matrix in-place from one-hot (or max) to index
+            // find the max in each column
+            for (size_t j = 0; j < T; j++)
+            {
+                double maxPos = -1;
+                double maxVal = 0;
+                for (size_t i = 0; i < dim; i++)
+                {
+                    double val = pCurValue[i + j * dim * colStride];
+                    if (maxPos < 0 || val >= maxVal)
+                    {
+                        maxPos = (double)i;
+                        maxVal = val;
+                    }
+                }
+                pCurValue[0 + j * colStride] = (ElemType)maxPos; // overwrite first element in-place
+            }
+            dim = 1; // ignore remaining dimensions
+        }
+        size_t iend = transpose ? dim : T;
+        size_t jend = transpose ? T : dim;
+        size_t istride = transpose ? 1 : colStride;
+        size_t jstride = transpose ? colStride : 1;
+        for (size_t j = 0; j < jend; j++)
+        {
+            if (j > 0)
+                fprintfOrDie(f, "%s", sampleSeparator.c_str());
+            for (size_t i = 0; i < iend; i++)
+            {
+                if (i > 0)
+                    fprintfOrDie(f, "%s", elementSeparator.c_str());
+                if (formatChar == 'f') // print as real number
+                {
+                    double dval = pCurValue[i * istride + j * jstride];
+                    fprintfOrDie(f, valueFormatString.c_str(), dval);
+                }
+                else if (formatChar == 'u') // print category as integer index
+                {
+                    unsigned int uval = (unsigned int)pCurValue[i * istride + j * jstride];
+                    fprintfOrDie(f, valueFormatString.c_str(), uval);
+                }
+                else if (formatChar == 's') // print category as a label string
+                {
+                    size_t uval = (size_t)pCurValue[i * istride + j * jstride];
+                    assert(uval < labelMapping.size());
+                    const char * sval = labelMapping[uval].c_str();
+                    fprintfOrDie(f, valueFormatString.c_str(), sval);
+                }
+            }
+        }
+        fprintfOrDie(f, "%s", sequenceEpilogue.c_str());
+    } // end loop over sequences
+
+    delete[] tempArray;
+}
+
 // -----------------------------------------------------------------------
 // instantiate the core class templates
 // -----------------------------------------------------------------------

diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h
index 30dca5b48d57..d788775c77bb 100644
--- a/Source/ComputationNetworkLib/ComputationNode.h
+++ b/Source/ComputationNetworkLib/ComputationNode.h
@@ -1495,10 +1495,15 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot
     // -----------------------------------------------------------------------
 
     virtual void DumpNodeInfo(const bool /*printValues*/, const bool /*printMetadata*/, File& fstream) const;
+    // helper for SimpleOutWriter, living in here to be able to use in debugging
+    void WriteMinibatchWithFormatting(FILE* f, bool transpose, bool isCategoryLabel, const std::vector<std::string>& labelMapping,
+                                      const std::string& sequenceSeparator, const std::string& sequencePrologue, const std::string& sequenceEpilogue, const std::string& elementSeparator, const std::string& sampleSeparator,
+                                      const std::string& valueFormatString) const;
 
 protected:
     // print node values
+    // This is used for dumping model parameters, not minibatch data.
     void PrintNodeValuesToFile(const bool printValues, const bool printMetadata, File& fstream) const
     {
         if (printValues)

diff --git a/Source/SGDLib/SimpleOutputWriter.h b/Source/SGDLib/SimpleOutputWriter.h
index f4a0d824043d..c98e10119adf 100644
--- a/Source/SGDLib/SimpleOutputWriter.h
+++ b/Source/SGDLib/SimpleOutputWriter.h
@@ -222,8 +222,6 @@ class SimpleOutputWriter
 
         size_t totalEpochSamples = 0;
         size_t numMBsRun = 0;
-        size_t tempArraySize = 0;
-        ElemType* tempArray = nullptr;
 
         for (auto & onode : outputNodes)
         {
             fprintfOrDie(f, "%s", formattingOptions.prologue.c_str());
         }
 
-        char formatChar = !formattingOptions.isCategoryLabel ? 'f' : !formattingOptions.labelMappingFile.empty() ? 's' : 'u';
-        std::string valueFormatString = "%" + formattingOptions.precisionFormat + formatChar; // format string used in fprintf() for formatting the values
-
         size_t actualMBSize;
         const size_t numIterationsBeforePrintingProgress = 100;
         size_t numItersSinceLastPrintOfProgress = 0;
@@ -247,10 +242,6 @@ class SimpleOutputWriter
             // Note: Intermediate values are memoized, so in case of multiple output nodes, we only compute what has not been computed already.
             m_net->ForwardProp(onode);
 
-            // get it (into a flat CPU-side vector)
-            Matrix<ElemType>& outputValues = dynamic_pointer_cast<ComputationNode<ElemType>>(onode)->Value();
-            outputValues.CopyToArray(tempArray, tempArraySize);
-
             // sequence separator
             FILE * f = *outputStreams[onode];
             const auto sequenceSeparator = formattingOptions.Processed(onode->NodeName(), formattingOptions.sequenceSeparator);
@@ -259,91 +250,16 @@ class SimpleOutputWriter
             const auto elementSeparator = formattingOptions.Processed(onode->NodeName(), formattingOptions.elementSeparator);
             const auto sampleSeparator = formattingOptions.Processed(onode->NodeName(), formattingOptions.sampleSeparator);
 
-            // process all sequences one by one
-            auto pMBLayout = onode->GetMBLayout();
-            if (!pMBLayout) // no MBLayout: We are printing aggregates (or LearnableParameters?)
-            {
-                pMBLayout = make_shared<MBLayout>();
-                pMBLayout->InitAsFrameMode(1); // treat this as if we have one single sample
-            }
-            const auto& sequences = pMBLayout->GetAllSequences();
-            size_t colStride = pMBLayout->GetNumParallelSequences() * outputValues.GetNumRows(); // how to get from one column to the next
-            size_t width = pMBLayout->GetNumTimeSteps();
-            for (size_t s = 0; s < sequences.size(); s++)
-            {
-                const auto& seqInfo = sequences[s];
-                size_t tBegin = seqInfo.tBegin >= 0 ? seqInfo.tBegin : 0;
-                size_t tEnd = seqInfo.tEnd <= width ? seqInfo.tEnd : width;
-
-                // current sequence is a matrix with 'colStride' beginning at the following pointer
-                ElemType* pCurValue = tempArray + s * outputValues.GetNumRows() + seqInfo.tBegin;
-
-                if ((numMBsRun > 0 || s > 0) && !sequenceSeparator.empty())
-                    fprintfOrDie(f, "%s", sequenceSeparator.c_str());
-                fprintfOrDie(f, "%s", sequencePrologue.c_str());
-
-                // output it according to our format specification
-                size_t dim = outputValues.GetNumRows();
-                size_t T = tEnd - tBegin;
-                if (formattingOptions.isCategoryLabel)
-                {
-                    if (formatChar == 's') // verify label dimension
-                    {
-                        if (outputValues.GetNumRows() != labelMapping.size())
-                            InvalidArgument("write: Row dimension %d does not match number of entries %d in labelMappingFile '%ls'", (int)dim, (int)labelMapping.size(), formattingOptions.labelMappingFile.c_str());
-                    }
-                    // update the matrix in-place from one-hot (or max) to index
-                    // find the max in each column
-                    for (size_t j = 0; j < T; j++)
-                    {
-                        double maxPos = -1;
-                        double maxVal = 0;
-                        for (size_t i = 0; i < dim; i++)
-                        {
-                            double val = pCurValue[i + j * dim * colStride];
-                            if (maxPos < 0 || val >= maxVal)
-                            {
-                                maxPos = (double)i;
-                                maxVal = val;
-                            }
-                        }
-                        pCurValue[0 + j * colStride] = (ElemType)maxPos; // overwrite first element in-place
-                    }
-                    dim = 1; // ignore remaining dimensions
-                }
-                size_t iend = formattingOptions.transpose ? dim : T;
-                size_t jend = formattingOptions.transpose ? T : dim;
-                size_t istride = formattingOptions.transpose ? 1 : colStride;
-                size_t jstride = formattingOptions.transpose ? colStride : 1;
-                for (size_t j = 0; j < jend; j++)
-                {
-                    if (j > 0)
-                        fprintfOrDie(f, "%s", sampleSeparator.c_str());
-                    for (size_t i = 0; i < iend; i++)
-                    {
-                        if (i > 0)
-                            fprintfOrDie(f, "%s", elementSeparator.c_str());
-                        if (formatChar == 'f') // print as real number
-                        {
-                            double dval = pCurValue[i * istride + j * jstride];
-                            fprintfOrDie(f, valueFormatString.c_str(), dval);
-                        }
-                        else if (formatChar == 'u') // print category as integer index
-                        {
-                            unsigned int uval = (unsigned int) pCurValue[i * istride + j * jstride];
-                            fprintfOrDie(f, valueFormatString.c_str(), uval);
-                        }
-                        else if (formatChar == 's') // print category as a label string
-                        {
-                            size_t uval = (size_t) pCurValue[i * istride + j * jstride];
-                            assert(uval < labelMapping.size());
-                            const char * sval = labelMapping[uval].c_str();
-                            fprintfOrDie(f, valueFormatString.c_str(), sval);
-                        }
-                    }
-                }
-                fprintfOrDie(f, "%s", sequenceEpilogue.c_str());
-            } // end loop over sequences
+            char formatChar = !formattingOptions.isCategoryLabel ? 'f' : !formattingOptions.labelMappingFile.empty() ? 's' : 'u';
+            std::string valueFormatString = "%" + formattingOptions.precisionFormat + formatChar; // format string used in fprintf() for formatting the values
+
+            if (numMBsRun > 0) // WriteMinibatchWithFormatting() will not include this before first sequence
+                fprintfOrDie(f, "%s", sequenceSeparator.c_str());
+
+            auto pnode = dynamic_pointer_cast<ComputationNode<ElemType>>(onode);
+            pnode->WriteMinibatchWithFormatting(f, formattingOptions.transpose, formattingOptions.isCategoryLabel, labelMapping,
+                                                sequenceSeparator, sequencePrologue, sequenceEpilogue, elementSeparator, sampleSeparator,
+                                                valueFormatString);
         } // end loop over nodes
 
         totalEpochSamples += actualMBSize;
@@ -372,8 +288,6 @@ class SimpleOutputWriter
             fprintfOrDie(f, "%s", formattingOptions.epilogue.c_str());
         }
 
-        delete[] tempArray;
-
         fprintf(stderr, "Written to %ls*\nTotal Samples Evaluated = %lu\n", outputPath.c_str(), totalEpochSamples);
 
         // flush all files (where we can catch errors) so that we can then destruct the handle cleanly without error

From ba2238d2159e1cedc118e932255d565246668d70 Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Wed, 9 Mar 2016 12:55:37 -0800
Subject: [PATCH 02/26] added an SGD option for basic tracing that dumps
 minibatches computed by nodes, using the same code as SimpleOutputWriter

---
 .../ComputationNetworkLib/ComputationNode.cpp | 24 ++++++++++++++-----
 .../ComputationNetworkLib/ComputationNode.h   | 22 ++++++++++++++++-
 Source/SGDLib/SGD.cpp                         |  9 ++++++-
 Source/SGDLib/SGD.h                           |  8 ++++++-
 Source/SGDLib/SimpleOutputWriter.h            |  2 +-
 5 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp
index d983399903bc..d67e2be995b9 100644
--- a/Source/ComputationNetworkLib/ComputationNode.cpp
+++ b/Source/ComputationNetworkLib/ComputationNode.cpp
@@ -231,7 +231,7 @@ template <class ElemType>
 
 // write out the content of a node in formatted/readable form
 template <class ElemType>
-void 
ComputationNode::WriteMinibatchWithFormatting(FILE* f, bool trans } dim = 1; // ignore remaining dimensions } - size_t iend = transpose ? dim : T; - size_t jend = transpose ? T : dim; - size_t istride = transpose ? 1 : colStride; - size_t jstride = transpose ? colStride : 1; + let iend = transpose ? dim : T; // true dimension of the data to print + let jend = transpose ? T : dim; + let istop = transpose ? onlyUpToRow : onlyUpToT; // we stop at these dimensions (for debugging, one often needs only the first few values of those huge matrices) + let jstop = transpose ? onlyUpToT : onlyUpToRow; + let istride = transpose ? 1 : colStride; + let jstride = transpose ? colStride : 1; for (size_t j = 0; j < jend; j++) { if (j > 0) fprintfOrDie(f, "%s", sampleSeparator.c_str()); + if (j == jstop) + { + fprintf(f, "..."); // 'nuff said + break; + } for (size_t i = 0; i < iend; i++) { if (i > 0) fprintfOrDie(f, "%s", elementSeparator.c_str()); - if (formatChar == 'f') // print as real number + if (i == istop) + { + fprintf(f, "..."); + break; + } + else if (formatChar == 'f') // print as real number { double dval = pCurValue[i * istride + j * jstride]; fprintfOrDie(f, valueFormatString.c_str(), dval); diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index d788775c77bb..86d310c0379e 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -162,6 +162,15 @@ struct ComputationNetworkOwnedNodeState virtual void MarkValueSharable() { m_valueSharable = true; } bool IsValueSharable() const { return m_valueSharable; } + // tracing flags + // Enable to print the value of the function-value matrix in somewhat readable format. + // These are public since you are meant to set these flags manually in the debugger or temporarily poke into them from code as needed. 
+ bool m_traceNodeValue = false; + bool m_traceNodeValueAsCategoryLabel = false; + size_t m_traceNodeValueUpToDim = 5; + size_t m_traceNodeValueUpToT = 5; + void EnableNodeTracing(bool isCategoryLabel) { m_traceNodeValue = true; m_traceNodeValueAsCategoryLabel = isCategoryLabel; } + protected: // TODO: should be fully encapsulated here bool m_needsGradient; // true if this node or any children need a gradient to be computed (for own consumption or propagation to somewhere in the child tree) @@ -1306,6 +1315,7 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot Value().Print(msra::strfun::utf8(NodeName()), 0, min(Value().GetNumRows()-1, 4), 0, min(Value().GetNumCols()-1, 4)); #endif InvalidateMissingValueColumns(FrameRange(m_pMBLayout)); // blast NaNs into columns that are gaps in a packed layout + Trace(); } #endif @@ -1496,9 +1506,19 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot virtual void DumpNodeInfo(const bool /*printValues*/, const bool /*printMetadata*/, File& fstream) const; // helper for SimpleOutWriter, living in here to be able to use in debugging - void WriteMinibatchWithFormatting(FILE* f, bool transpose, bool isCategoryLabel, const std::vector& labelMapping, + void WriteMinibatchWithFormatting(FILE* f, size_t onlyUpToRow, size_t onlyUpToT, bool transpose, bool isCategoryLabel, const std::vector& labelMapping, const std::string& sequenceSeparator, const std::string& sequencePrologue, const std::string& sequenceEpilogue, const std::string& elementSeparator, const std::string& sampleSeparator, const std::string& valueFormatString) const; + void Trace() + { + if (m_traceNodeValue) + { + fprintf(stderr, "Trace --> %ls = %ls -> [%s%s]\n", NodeName().c_str(), OperationName().c_str(), string(GetSampleLayout()).c_str(), HasMBLayout() ? 
" x *" : ""); + WriteMinibatchWithFormatting(stderr, m_traceNodeValueUpToDim, m_traceNodeValueUpToT, true/*transpose*/, m_traceNodeValueAsCategoryLabel, std::vector(), + ""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/, + "%13.10f"/*valueFormatString*/); + } + } protected: diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index 0c0ae20f1eb8..7c725a1e20ca 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -71,6 +71,12 @@ void SGD::Train(function createN startEpoch = max(startEpoch, 0); m_needAdaptRegularization = false; + // set tracing flags + for (const auto& traceNodeName : m_traceNodeNamesReal) + net->GetNodeFromName(traceNodeName)->EnableNodeTracing(false); + for (const auto& traceNodeName : m_traceNodeNamesCategory) + net->GetNodeFromName(traceNodeName)->EnableNodeTracing(true); + TrainOrAdaptModel(startEpoch, net, loadNetworkFromCheckpoint, net, nullptr, trainSetDataReader, validationSetDataReader); } @@ -2585,4 +2591,5 @@ SGDParams::SGDParams(const ScriptableObjects::IConfigRecordPtr configp) // register SGD<> with the ScriptableObject system ScriptableObjects::ConfigurableRuntimeTypeRegister::AddFloatDouble, SGD> registerSGDOptimizer(L"SGDOptimizer"); -} } } + +}}} diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h index 29035afe43d5..70f0889a0114 100644 --- a/Source/SGDLib/SGD.h +++ b/Source/SGDLib/SGD.h @@ -288,6 +288,8 @@ class SGD : public SGDParams // m_validateAfterModelReloading(configSGD(L"validateAfterModelReloading", true)), m_trainCriterionNodeName((const wstring&) configSGD(L"trainCriterionNodeName", L"")), m_evalCriterionNodeName((const wstring&) configSGD(L"evalCriterionNodeName", L"")), + m_traceNodeNamesReal(configSGD(L"traceNodeNamesReal", ConfigRecordType::Array(stringargvector()))), + m_traceNodeNamesCategory(configSGD(L"traceNodeNamesCategory", ConfigRecordType::Array(stringargvector()))), m_prevChosenMinibatchSize(0), m_lastFinishedEpochTrainLoss(0.0), m_distGradAgg(nullptr), @@ -504,6 +506,9 @@ class SGD : public SGDParams wstring m_trainCriterionNodeName; wstring m_evalCriterionNodeName; + // enable tracing. 
Nodes listed here get their m_traceNodeValue and m_traceNodeValueAsCategoryLabel flags set + vector m_traceNodeNamesReal, m_traceNodeNamesCategory; + size_t m_prevChosenMinibatchSize; double m_lastFinishedEpochTrainLoss; @@ -515,4 +520,5 @@ class SGD : public SGDParams private: int SGDTrace(FILE* __restrict __stream, const char* __restrict __format, ...); }; -} } } + +}}} diff --git a/Source/SGDLib/SimpleOutputWriter.h b/Source/SGDLib/SimpleOutputWriter.h index c98e10119adf..e6a8b057287b 100644 --- a/Source/SGDLib/SimpleOutputWriter.h +++ b/Source/SGDLib/SimpleOutputWriter.h @@ -257,7 +257,7 @@ class SimpleOutputWriter fprintfOrDie(f, "%s", sequenceSeparator.c_str()); auto pnode = dynamic_pointer_cast>(onode); - pnode->WriteMinibatchWithFormatting(f, formattingOptions.transpose, formattingOptions.isCategoryLabel, labelMapping, + pnode->WriteMinibatchWithFormatting(f, SIZE_MAX, SIZE_MAX, formattingOptions.transpose, formattingOptions.isCategoryLabel, labelMapping, sequenceSeparator, sequencePrologue, sequenceEpilogue, elementSeparator, sampleSeparator, valueFormatString); } // end loop over nodes From 4209d9df1008e4030abd128e78a7bed3b705a332 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Wed, 9 Mar 2016 22:22:24 -0800 Subject: [PATCH 03/26] bug fix: LMSequenceReader randomization must be deterministic (seed = epoch) --- .../CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs | 4 +- Source/Common/Include/Config.h | 18 +++--- .../ComputationNetwork.cpp | 12 +++- .../ComputationNetwork.h | 1 + .../ComputationNetworkEvaluation.cpp | 62 +++++++++++-------- .../ComputationNetworkLib/ComputationNode.cpp | 2 +- .../ComputationNetworkLib/ComputationNode.h | 9 +-- Source/ComputationNetworkLib/RecurrentNodes.h | 3 +- .../LMSequenceReader/SequenceReader.cpp | 11 ++-- .../Readers/LMSequenceReader/SequenceReader.h | 6 +- 10 files changed, 75 insertions(+), 53 deletions(-) diff --git a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs index e3dbfe8d5759..c0757963f158 100644 --- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs +++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs @@ -268,8 +268,8 @@ Parameters = Stabilize (x, enabled=true) = if enabled then [ - beta = Exp(ScalarParam()) - result = Scale(beta, x) + beta = Exp (ScalarParam()) + result = Scale (beta, x) ].result else x ] diff --git a/Source/Common/Include/Config.h b/Source/Common/Include/Config.h index 981fc9c5695c..8d43cbc88e70 100644 --- a/Source/Common/Include/Config.h +++ b/Source/Common/Include/Config.h @@ -18,14 +18,14 @@ using namespace std; namespace Microsoft { namespace MSR { namespace CNTK { #define FUNCTIONOPEN "(" -#define OPENBRACES "[{(\"" -#define CLOSINGBRACES "]})\"" +#define OPENBRACES "[{(\"" // all opening braces +#define CLOSINGBRACES "]})\"" // and matching closing ones static const std::string::size_type npos = (std::string::size_type) -1; // These are the constants associated with the "ResolveVariables" method. 
-static const char* openBraceVar = "$"; -static const char* closingBraceVar = "$"; +static const char* openBraceVar = "$"; // beginning of a var +static const char* closingBraceVar = "$"; // end of a var static const char* forbiddenCharactersInVarName = ",/<>?;':\"[]{}\\|!@#%^&*()+=~` \t\n"; static const char* forbiddenCharactersInVarNameEscapeWhitespace = ",/<>?;':\"[]{}\\|!@#%^&*()+=~` \\t\\n"; static const std::size_t openBraceVarSize = strlen(openBraceVar); @@ -357,23 +357,19 @@ class ConfigParser // str - string to search // tokenStart - start location in the string to search // returns: character position of matching closing brace, string::npos if no brace present at start position - // BUGBUG: This seems to only work for one kind of braces at a time. Nested other braces are not - // understood. Also, braces in strings are not protected. [fseide] - static std::string::size_type FindBraces(const std::string& str, std::string::size_type tokenStart) + static size_t FindBraces(const std::string& str, const size_t tokenStart) { const auto len = str.length(); // start is outside (or rather, at end of string): no brace here if (tokenStart >= len) - { return npos; - } // open braces and quote - static const std::string openBraces = OPENBRACES; + static const std::string openBraces = OPENBRACES; // currently "[{(\"" // close braces and quote static const std::string closingBraces = CLOSINGBRACES; - const auto charsToLookFor = closingBraces + openBraces; // all chars we match for + static const auto charsToLookFor = closingBraces + openBraces; // all chars we match for // get brace index for first character of input string const auto braceFound = openBraces.find(str[tokenStart]); diff --git a/Source/ComputationNetworkLib/ComputationNetwork.cpp b/Source/ComputationNetworkLib/ComputationNetwork.cpp index 7021f6f66275..c0f8d5e3464c 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.cpp +++ b/Source/ComputationNetworkLib/ComputationNetwork.cpp @@ -218,7 +218,17 @@ void ComputationNetwork::ReadPersistableParameters(File& fstream, bool create) if (create) // loaded from scratch AddNodeToNet(node); else // reloaded existing - node->Validate(true); // nothing that propagates should have changed --TODO: have a more rigid mechanism to prevent resizing; this should only reload the model parameters + { + let old = node->GetSampleLayout(); + let changed = ValidateNode(node, /*isFinalValidationPass=*/true); + if (changed) + { + let upd = node->GetSampleLayout(); + fprintf(stderr, "ValidateSubNetwork: %ls %ls operation changed, from [%s] to [%s].", node->NodeName().c_str(), node->OperationName().c_str(), + string(old).c_str(), string(upd).c_str()); + //LogicError("ValidateSubNetwork: %ls %ls operation changed during reload or re-validation.", node->NodeName().c_str(), node->OperationName().c_str()); + } + } } fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ENodeList"); diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 117b0ffc789a..3e44b9c8a93e 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -165,6 +165,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb private: void ValidateNetwork(); void ValidateNodes(list nodes, bool isFinalValidationPass, size_t& todo); + bool ValidateNode(ComputationNodeBasePtr node, bool isFinalValidationPass) const; void MarkValueNonSharableNodes(); private: diff --git 
a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index dc2b224ba356..b813d0ea0371 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -579,6 +579,36 @@ static pair GetDims(const ComputationNodeBasePtr& node) return make_pair(node->GetSampleLayout(), node->HasMBLayout()); } +bool ComputationNetwork::ValidateNode(ComputationNodeBasePtr node, bool isFinalValidationPass) const +{ + const auto& children = node->GetInputs(); + + // keep state + MBLayoutPtr oldMBLayoutPtr = node->GetMBLayout(); + auto dim = GetDims(node); + vector> childDims; + for (auto& child : children) + childDims.push_back(GetDims(child)); + auto sampleLayout = node->GetSampleLayout(); + // We do call validate(final) as many times as needed, since stuff may have changed underneath. + node->Validate(isFinalValidationPass /*final*/); // all nodes have been visited: do verification instead of just inference + // also take the opportunity to propagate m_needsGradient + auto needsGradient = node->m_needsGradient; + for (auto& child : children) // TODO: do we need a check that this is stable if isFinalValidationPass? + node->m_needsGradient |= child->m_needsGradient; + // check state --node will be valid if all nodes have been visited and node has not been updated + bool unchanged = true; + unchanged &= (oldMBLayoutPtr == node->GetMBLayout()); + unchanged &= (dim == GetDims(node)); + vector> newChildDims; + for (auto& child : children) + newChildDims.push_back(GetDims(child)); + unchanged &= (childDims == newChildDims); + unchanged &= (sampleLayout == node->GetSampleLayout()); + unchanged &= (needsGradient == node->m_needsGradient); + return !unchanged; +} + void ComputationNetwork::ValidateNodes(list nodes, bool isFinalValidationPass, size_t& todo) { todo = 0; // returns how many nodes are to be redone @@ -596,35 +626,15 @@ void ComputationNetwork::ValidateNodes(list nodes, bool } // if there is not at least one visited child bool valid = false; - if (hasVisitedChild || isLeaf) + if (hasVisitedChild || isLeaf) // got at least one child: it makes sense to call Validate() { - // got at least one child: it makes sense to call Validate() - // keep state - MBLayoutPtr oldMBLayoutPtr = node->GetMBLayout(); - auto dim = GetDims(node); - vector> childDims; - for (auto& child : children) - childDims.push_back(GetDims(child)); - auto sampleLayout = node->GetSampleLayout(); - // We do call validate(final) as many times as needed, since stuff may have changed underneath. + // TODO: PrintSelfBeforeValidation() into a function returning a string, and print all in a single line (also when it throws; print & rethrow). node->PrintSelfBeforeValidation(); - node->Validate(isFinalValidationPass /*final*/); // all nodes have been visited: do verification instead of just inference - fprintf(stderr, " -> [%s%s]", string(node->GetSampleLayout()).c_str(), node->HasMBLayout() ? " x *" : ""); + bool unchanged = !ValidateNode(node, isFinalValidationPass); node->m_visited = true; - // also take the opportunity to propagate m_needsGradient - auto needsGradient = node->m_needsGradient; - for (auto& child : children) // TODO: do we need a check that this is stable if isFinalValidationPass? 
- node->m_needsGradient |= child->m_needsGradient; - // check state --node will be valid if all nodes have been visited and node has not been updated - bool unchanged = true; - unchanged &= (oldMBLayoutPtr == node->GetMBLayout()); - unchanged &= (dim == GetDims(node)); - vector> newChildDims; - for (auto& child : children) - newChildDims.push_back(GetDims(child)); - unchanged &= (childDims == newChildDims); - unchanged &= (sampleLayout == node->GetSampleLayout()); - unchanged &= (needsGradient == node->m_needsGradient); + fprintf(stderr, "[%s%s]", string(node->GetSampleLayout()).c_str(), node->HasMBLayout() ? " x *" : ""); + // print the new type + // sanity checks if (isFinalValidationPass && !unchanged) LogicError("ValidateSubNetwork: %ls %ls operation changed during final validation.", node->NodeName().c_str(), node->OperationName().c_str()); if (isFinalValidationPass && !allChildrenVisited) diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp index d67e2be995b9..6bd6c30888cd 100644 --- a/Source/ComputationNetworkLib/ComputationNode.cpp +++ b/Source/ComputationNetworkLib/ComputationNode.cpp @@ -307,7 +307,7 @@ void ComputationNode::WriteMinibatchWithFormatting(FILE* f, size_t onl fprintfOrDie(f, "%s", sampleSeparator.c_str()); if (j == jstop) { - fprintf(f, "..."); // 'nuff said + fprintf(f, "... (%d more)", (int)(jend - jstop)); // 'nuff said break; } for (size_t i = 0; i < iend; i++) diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 86d310c0379e..474fc7e4be3c 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -167,8 +167,8 @@ struct ComputationNetworkOwnedNodeState // These are public since you are meant to set these flags manually in the debugger or temporarily poke into them from code as needed. bool m_traceNodeValue = false; bool m_traceNodeValueAsCategoryLabel = false; - size_t m_traceNodeValueUpToDim = 5; - size_t m_traceNodeValueUpToT = 5; + size_t m_traceNodeValueUpToDim = 3; // 3 should be enough to see simple patterns such as all values are identical or out of range + size_t m_traceNodeValueUpToT = 8; // 8 time steps fit comfortably into a normal-sized console void EnableNodeTracing(bool isCategoryLabel) { m_traceNodeValue = true; m_traceNodeValueAsCategoryLabel = isCategoryLabel; } protected: // TODO: should be fully encapsulated here @@ -1513,8 +1513,9 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot { if (m_traceNodeValue) { - fprintf(stderr, "Trace --> %ls = %ls -> [%s%s]\n", NodeName().c_str(), OperationName().c_str(), string(GetSampleLayout()).c_str(), HasMBLayout() ? 
" x *" : ""); - WriteMinibatchWithFormatting(stderr, m_traceNodeValueUpToDim, m_traceNodeValueUpToT, true/*transpose*/, m_traceNodeValueAsCategoryLabel, std::vector(), + const auto shape = GetTensorShape(DetermineElementwiseTensorRank()); + fprintf(stderr, "Trace --> %ls = %ls -> [%s]\n", NodeName().c_str(), OperationName().c_str(), string(shape).c_str()); + WriteMinibatchWithFormatting(stderr, m_traceNodeValueUpToDim, m_traceNodeValueUpToT, false/*transpose*/, m_traceNodeValueAsCategoryLabel, std::vector(), ""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/, "%13.10f"/*valueFormatString*/); } diff --git a/Source/ComputationNetworkLib/RecurrentNodes.h b/Source/ComputationNetworkLib/RecurrentNodes.h index d6cebbdcc95e..d9203c1ebeeb 100644 --- a/Source/ComputationNetworkLib/RecurrentNodes.h +++ b/Source/ComputationNetworkLib/RecurrentNodes.h @@ -167,7 +167,8 @@ class DelayedValueNodeBase : public ComputationNode, public IRecurrent // BUGBUG: I got an error in when reloading persistent parameterse for a model that had dimension specified as 0, which did not get re-inferred correctly. // We should either simply not write this parameter out at all (since it can always be inferred), or write the tensor shape. - SetDims(TensorShape(rows), HasMBLayout() /*may be true on reload (roll-back)*/); // tensor shape will be overwritten in Validate() --TODO: We should serialize it here. + if (GetSampleLayout().GetNumElements() != rows) // legacy format: if #rows matches then assume current tensor shape is up to date + SetDims(TensorShape(rows), HasMBLayout() /*may be true on reload (roll-back)*/); // tensor shape will be overwritten in Validate() --TODO: We should serialize it here. m_delayedValue.Resize(rows, 0); // Note: If we try to access history in first minibatch, we shall crash. It would be a consequence of a missing sentence-begin flag if (modelVersion >= CNTK_MODEL_VERSION_2) diff --git a/Source/Readers/LMSequenceReader/SequenceReader.cpp b/Source/Readers/LMSequenceReader/SequenceReader.cpp index d42831b8ecbb..838f56402fc8 100644 --- a/Source/Readers/LMSequenceReader/SequenceReader.cpp +++ b/Source/Readers/LMSequenceReader/SequenceReader.cpp @@ -1578,7 +1578,7 @@ void BatchSequenceReader::Reset() { mProcessed.clear(); mToProcess.clear(); - mLastProcssedSentenceId = 0; + mLastProcessedSentenceId = 0; mPosInSentence = 0; mLastPosInSentence = 0; mNumRead = 0; @@ -1651,6 +1651,7 @@ void BatchSequenceReader::StartMinibatchLoop(size_t mbSize, size_t epo // we use epochSize, which might not be set yet, so use a default value for allocations if not yet set size_t epochSize = m_epochSize == requestDataSize ? 1000 : m_epochSize; m_epoch = epoch; + m_randomSeed = (unsigned int)m_epoch; m_mbStartSample = epoch * m_epochSize; m_epochSamplesReturned = 0; // counter to know when we returned one epoch @@ -1700,7 +1701,7 @@ size_t BatchSequenceReader::DetermineSequencesToProcess() int mp = (int) mToProcess[s]; if (mProcessed[mp]) { - mLastProcssedSentenceId = mp; + mLastProcessedSentenceId = mp; mLastPosInSentence = 0; allDone = true; break; @@ -1722,7 +1723,7 @@ size_t BatchSequenceReader::DetermineSequencesToProcess() size_t maxToProcess = mRequestedNumParallelSequences > 0 ? mRequestedNumParallelSequences : SIZE_MAX; // if mRequestedNumParallelSequences is 0 then we go by MB size size_t maxTokens = mRequestedNumParallelSequences > 0 ? 
SIZE_MAX : m_mbSize; size_t numTokens = 0; // token counter - for (size_t seq = mLastProcssedSentenceId; + for (size_t seq = mLastProcessedSentenceId; seq < mNumRead && // hit end of buffer mToProcess.size() < maxToProcess; // hit parallel-sequence limit seq++) @@ -1791,14 +1792,14 @@ bool BatchSequenceReader::GetMinibatchData(size_t& /*out*/ firstPosInS #ifdef _MSC_VER // make some old configurations reproducable (m_cacheBlockSize used to be a constant) --TODO: remove in a few months if (m_cacheBlockSize == 50000) { + srand(++m_randomSeed); // TODO: older code did not have that; so no idea what random seed was used std::random_shuffle(m_parser.mSentenceIndex2SentenceInfo.begin(), m_parser.mSentenceIndex2SentenceInfo.end()); // Note: random_shuffle is deprecated since C++14. } else // new configs use a wider randomization #endif { - std::random_device rd; - std::mt19937 g(rd()); + std::mt19937 g(++m_randomSeed); // random seed is initialized to epoch, but gets incremented for intermediate reshuffles std::shuffle(m_parser.mSentenceIndex2SentenceInfo.begin(), m_parser.mSentenceIndex2SentenceInfo.end(), g); } diff --git a/Source/Readers/LMSequenceReader/SequenceReader.h b/Source/Readers/LMSequenceReader/SequenceReader.h index cac0a9bd957d..4940edcd0ddb 100644 --- a/Source/Readers/LMSequenceReader/SequenceReader.h +++ b/Source/Readers/LMSequenceReader/SequenceReader.h @@ -354,7 +354,9 @@ class BatchSequenceReader : public SequenceReader using Base::mRequestedNumParallelSequences; // IDataReader private: - size_t mLastProcssedSentenceId; + unsigned int m_randomSeed = 0; // deterministic random seed + + size_t mLastProcessedSentenceId; size_t mNumRead; // number of sentences in current cache block vector mProcessed; // [mNumRead] true if sequence has already been returned in this cache block @@ -379,7 +381,7 @@ class BatchSequenceReader : public SequenceReader BatchSequenceReader() : m_pMBLayout(make_shared()) { - mLastProcssedSentenceId = 0; + mLastProcessedSentenceId = 0; mRequestedNumParallelSequences = 1; mLastPosInSentence = 0; mNumRead = 0; From 5783c61a81d448736a509ce953ec662ce7707f32 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 10 Mar 2016 08:13:49 -0800 Subject: [PATCH 04/26] added missing optional imageLayout parameter to NDL of Reshape() --- Source/CNTK/SynchronousExecutionEngine.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Source/CNTK/SynchronousExecutionEngine.cpp b/Source/CNTK/SynchronousExecutionEngine.cpp index 01d78cb6835e..9e2431e895df 100644 --- a/Source/CNTK/SynchronousExecutionEngine.cpp +++ b/Source/CNTK/SynchronousExecutionEngine.cpp @@ -252,8 +252,9 @@ void SynchronousNodeEvaluator::Evaluate(NDLNode* node, const size_t img_width = node->GetOptionalParameter("imageWidth", "0"); size_t img_height = node->GetOptionalParameter("imageHeight", "0"); size_t img_channels = node->GetOptionalParameter("imageChannels", "0"); + ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC")); - nodePtr = builder.LegacyReshape(NULL, num_rows, ImageDimensions::AsTensorShape(img_width, img_height, img_channels, ImageLayoutKind::HWC /*legacy*/), name); // BUGBUG: use a tensor descriptor instead + nodePtr = builder.LegacyReshape(NULL, num_rows, ImageDimensions::AsTensorShape(img_width, img_height, img_channels, imageLayoutKind), name); } } else if (cnNodeType == OperationNameOf(PastValueNode) || From 7b9558a18ad37179dbc7268e75cb1dd852bc8e56 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 10 Mar 2016 08:18:25 
-0800 Subject: [PATCH 05/26] (bug fix: NDL Reshape should only allow 2 positional parameters) --- Source/CNTK/SynchronousExecutionEngine.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Source/CNTK/SynchronousExecutionEngine.cpp b/Source/CNTK/SynchronousExecutionEngine.cpp index 9e2431e895df..18fe4a6c9a21 100644 --- a/Source/CNTK/SynchronousExecutionEngine.cpp +++ b/Source/CNTK/SynchronousExecutionEngine.cpp @@ -206,7 +206,7 @@ void SynchronousNodeEvaluator::Evaluate(NDLNode* node, const else if (cnNodeType == OperationNameOf(RowRepeatNode)) { if (parameter.size() != 2) - RuntimeError("RowRepeat should have two parameters. Usage: RowRepeat(origNodeName, numRepeats."); + RuntimeError("RowRepeat should have two parameters. Usage: RowRepeat(origNodeName, numRepeats)."); nodeParamCount = 1; nodeParamStart = 0; @@ -238,8 +238,8 @@ void SynchronousNodeEvaluator::Evaluate(NDLNode* node, const } else if (cnNodeType == L"Reshape" /*OperationNameOf(ReshapeNode)*/) { - if (parameter.size() < 2 || parameter.size() > 5) - RuntimeError("Reshape should have two to five parameters. Usage: Reshape(origNodeName, numRows, [imageWidth=], [imageHeight=], [imageChannels=]."); + if (parameter.size() != 2) + RuntimeError("Reshape should have two parameters. Usage: Reshape(origNodeName, numRows, [imageWidth=], [imageHeight=], [imageChannels=]."); nodeParamCount = 1; nodeParamStart = 0; From e64aec5ce27e9bda6d2a9cf1310841ffe48ed05d Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 10 Mar 2016 11:10:00 -0800 Subject: [PATCH 06/26] refined validation output --- .../lyx/CNTKBook_CNTK_Programmer_Chapter.lyx | 5 -- .../ComputationNetwork.h | 2 +- .../ComputationNetworkEvaluation.cpp | 33 +++++++--- .../ComputationNetworkLib/ComputationNode.cpp | 63 +++++++++++++++++++ .../ComputationNetworkLib/ComputationNode.h | 49 +++------------ Source/ComputationNetworkLib/ReshapingNodes.h | 22 ++----- 6 files changed, 101 insertions(+), 73 deletions(-) diff --git a/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Programmer_Chapter.lyx b/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Programmer_Chapter.lyx index be90521b3176..0d3994eacfe2 100644 --- a/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Programmer_Chapter.lyx +++ b/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Programmer_Chapter.lyx @@ -3134,11 +3134,6 @@ virtual void Validate() { \end_layout -\begin_layout Plain Layout - - PrintSelfBeforeValidation(); -\end_layout - \begin_layout Plain Layout diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 3e44b9c8a93e..074061ad688c 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -164,7 +164,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb private: void ValidateNetwork(); - void ValidateNodes(list nodes, bool isFinalValidationPass, size_t& todo); + size_t ValidateNodes(list nodes, bool isFirstPass, bool isFinalValidationPass); bool ValidateNode(ComputationNodeBasePtr node, bool isFinalValidationPass) const; void MarkValueNonSharableNodes(); diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index b813d0ea0371..8615dce8edd7 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -527,16 +527,16 @@ void ComputationNetwork::ValidateNetwork() // Keep 
going through the list until all nodes have been validated and all inputs have been validated as well. // - validate (final) // final means consistency checks // Fail if any change during this stage. - size_t pass = 0; + size_t pass = 1; size_t toValidate = nodes.size(); while (toValidate > 0) { - pass++; fprintf(stderr, "\n\nValidating network. %d nodes to process in pass %d.\n", (int) toValidate, (int) pass); - ValidateNodes(nodes, false /*isFinalValidationPass*/, toValidate); + toValidate = ValidateNodes(nodes, /*isFirstPass=*/pass == 1, false /*isFinalValidationPass*/); + pass++; } fprintf(stderr, "\n\nValidating network, final pass.\n"); - ValidateNodes(nodes, true /*isFinalValidationPass*/, toValidate); + toValidate = ValidateNodes(nodes, /*isFirstPass=*/pass == 1, true /*isFinalValidationPass*/); if (toValidate != 0) LogicError("ValidateSubNetwork: ValidateNodes(true) unexpectedly returned with work left to do."); @@ -609,9 +609,11 @@ bool ComputationNetwork::ValidateNode(ComputationNodeBasePtr node, bool isFinalV return !unchanged; } -void ComputationNetwork::ValidateNodes(list nodes, bool isFinalValidationPass, size_t& todo) +// perform one pass of validation over the topologically-sorted node set +// returns how many nodes either could not yet be validated yet or have changed and thus must be redone +size_t ComputationNetwork::ValidateNodes(list nodes, bool isFirstPass, bool isFinalValidationPass) { - todo = 0; // returns how many nodes are to be redone + size_t todo = 0; for (auto& node : nodes) { const auto& children = node->GetInputs(); @@ -628,11 +630,21 @@ void ComputationNetwork::ValidateNodes(list nodes, bool bool valid = false; if (hasVisitedChild || isLeaf) // got at least one child: it makes sense to call Validate() { - // TODO: PrintSelfBeforeValidation() into a function returning a string, and print all in a single line (also when it throws; print & rethrow). - node->PrintSelfBeforeValidation(); - bool unchanged = !ValidateNode(node, isFinalValidationPass); + string prevPrototype = node->FormatOperationPrototype(""); + bool unchanged; + try + { + unchanged = !ValidateNode(node, isFinalValidationPass); + string updatedPrototype = node->FormatOperationPrototype(""); + if (isFirstPass || !unchanged || prevPrototype != updatedPrototype) + fprintf(stderr, "Validating --> %s\n", updatedPrototype.c_str()); + } + catch (...) // if validation failed then print the prototype anyway so one can see the input args + { + fprintf(stderr, "Validating --> %s FAILED\n", prevPrototype.c_str()); + throw; + } node->m_visited = true; - fprintf(stderr, "[%s%s]", string(node->GetSampleLayout()).c_str(), node->HasMBLayout() ? 
" x *" : ""); // print the new type // sanity checks if (isFinalValidationPass && !unchanged) @@ -646,6 +658,7 @@ void ComputationNetwork::ValidateNodes(list nodes, bool if (!valid) todo++; } + return todo; } // ----------------------------------------------------------------------- diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp index 6bd6c30888cd..387ee95969fa 100644 --- a/Source/ComputationNetworkLib/ComputationNode.cpp +++ b/Source/ComputationNetworkLib/ComputationNode.cpp @@ -208,6 +208,69 @@ TensorShape ComputationNodeBase::GetTensorSliceFor(size_t rank, const FrameRange // others // ----------------------------------------------------------------------- +/*virtual*/ string ComputationNodeBase::FormatOperationPrototype(const string& extraArgs) const +{ + string prototype; + prototype += msra::strfun::strprintf("%ls = %ls", NodeName().c_str(), OperationName().c_str()); + + // arguments of operation + if (IsLeaf()) + prototype += "()"; + else + { + prototype += " ("; + for (size_t i = 0; i < GetNumInputs(); i++) + { + const auto& child = m_inputs[i]; + if (i > 0) + prototype += ", "; + + if (child) + prototype += msra::strfun::strprintf("%ls", child->NodeName().c_str()); + else + prototype += "NULL"; + } + prototype += extraArgs; + prototype += ")"; + } + + // type (tensor dimensions) of operation + prototype += " : "; + + if (!IsLeaf()) + { + //prototype += "("; + for (size_t i = 0; i < GetNumInputs(); i++) + { + const auto& child = m_inputs[i]; + if (i > 0) + prototype += ", "; + + if (child == nullptr) + { + prototype += "NULL"; + continue; + } + + const char* mbSizeMark = child->m_pMBLayout ? " x *" : ""; +#if 0 + if (child->m_sampleLayout.GetRank() == 3 && (child->m_sampleLayout[1] != 1 || child->m_sampleLayout[0] != 1)) // looks like an image: use WHC notation + prototype += msra::strfun::strprintf("%ls[%s%s {W=%lu, H=%lu, C=%lu}]", child->NodeName().c_str(), string(child->m_sampleLayout).c_str(), mbSizeMark, + child->m_sampleLayout[1], child->m_sampleLayout[2], child->m_sampleLayout[0]); + // BUGBUG: This ^^ will print based on the old legacy layout, and we have no way of knowing here whether that is correct. + else +#endif + prototype += msra::strfun::strprintf("[%s%s]", string(child->m_sampleLayout).c_str(), mbSizeMark); + } + prototype += extraArgs; + //prototype += ")"; + } + + prototype += msra::strfun::strprintf(" -> [%s%s]", string(GetSampleLayout()).c_str(), HasMBLayout() ? 
" x *" : ""); + + return prototype; +} + template /*virtual*/ void ComputationNode::DumpNodeInfo(const bool /*printValues*/, const bool printMetadata, File& fstream) const { diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 474fc7e4be3c..07a1928633b6 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -108,7 +108,7 @@ struct /*interface*/ IComputationNode // --- optional overrides for more informative logging - virtual void PrintSelfBeforeValidation() const = 0; // called in validation loop right before Validate() + virtual std::string FormatOperationPrototype(const std::string& extraArgs) const = 0; // format the operation into a "prototype" (listing dimensions and parameters) virtual void DumpNodeInfo(const bool /*printValues*/, const bool /*printMetadata*/, File& fstream) const = 0; protected: @@ -592,7 +592,7 @@ protected: public: // ...the following should be protected, but nodes inquire ab /*HasName::*/ void SetName(const std::wstring& newName) override // also for use by ExperimentalNetworkBuilder { m_nodeName = newName; - fprintf(stderr, "Node --> %ls = %ls\n", NodeName().c_str(), OperationName().c_str()), fflush(stderr); + //fprintf(stderr, "Node --> %ls : %ls\n", NodeName().c_str(), OperationName().c_str()), fflush(stderr); } bool NeedsGradient() const { return m_needsGradient; } @@ -786,36 +786,7 @@ protected: public: // ...the following should be protected, but nodes inquire ab virtual void PrintSelf(bool printMatrices = false) const = 0; // called in validation loop right before Validate() - virtual void /*IComputationNode::*/ PrintSelfBeforeValidation() const - { - fprintf(stderr, "\nValidating --> %ls = %ls", NodeName().c_str(), OperationName().c_str()); - - if (!IsLeaf()) - { - fprintf(stderr, "("); - for (size_t i = 0; i < GetNumInputs(); i++) - { - const auto& child = m_inputs[i]; - if (i > 0) - fprintf(stderr, ", "); - - if (child == nullptr) - { - fprintf(stderr, "NULL"); - continue; - } - - const char* mbSizeMark = child->m_pMBLayout ? " x *" : ""; - if (child->m_sampleLayout.GetRank() == 3 && (child->m_sampleLayout[1] != 1 || child->m_sampleLayout[0] != 1)) // looks like an image: use WHC notation - fprintf(stderr, "%ls[%s%s {W=%lu, H=%lu, C=%lu}]", child->NodeName().c_str(), string(child->m_sampleLayout).c_str(), mbSizeMark, - child->m_sampleLayout[1], child->m_sampleLayout[2], child->m_sampleLayout[0]); - // BUGBUG: This ^^ will print based on the old legacy layout, and we have no way of knowing here whether that is correct. 
- else - fprintf(stderr, "%ls[%s%s]", child->NodeName().c_str(), string(child->m_sampleLayout).c_str(), mbSizeMark); - } - fprintf(stderr, ")"); - } - } + virtual std::string /*IComputationNode::*/ FormatOperationPrototype(const std::string& extraArgs) const; // helper for topology plot: enumerate arcs that can be reached starting from the current node's children typedef std::pair ComputationArc; @@ -1300,11 +1271,11 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot VerifyDataSize(Value()); } -#ifdef _DEBUG // NaN checks virtual void /*IComputationNode::*/ EndForwardProp() override { Base::EndForwardProp(); +#ifdef _DEBUG #ifdef TRACK_GAP_NANS MaskMissingValueColumnsToZero(FrameRange(m_pMBLayout)); // HasNaN() operates on a whole matrix, so first flatten all gaps to 0 if (Value().HasNan("EndForwardProp")) @@ -1315,9 +1286,10 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot Value().Print(msra::strfun::utf8(NodeName()), 0, min(Value().GetNumRows()-1, 4), 0, min(Value().GetNumCols()-1, 4)); #endif InvalidateMissingValueColumns(FrameRange(m_pMBLayout)); // blast NaNs into columns that are gaps in a packed layout +#endif + // tracing Trace(); } -#endif #if 0 // (keep it around in case we need to add stuff in the future) virtual void /*IComputationNode::*/BeginBackprop() override @@ -1511,10 +1483,9 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot const std::string& valueFormatString) const; void Trace() { - if (m_traceNodeValue) + if (m_traceNodeValue+1) { - const auto shape = GetTensorShape(DetermineElementwiseTensorRank()); - fprintf(stderr, "Trace --> %ls = %ls -> [%s]\n", NodeName().c_str(), OperationName().c_str(), string(shape).c_str()); + fprintf(stderr, "Trace --> %s\n", FormatOperationPrototype("").c_str()); WriteMinibatchWithFormatting(stderr, m_traceNodeValueUpToDim, m_traceNodeValueUpToT, false/*transpose*/, m_traceNodeValueAsCategoryLabel, std::vector(), ""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/, "%13.10f"/*valueFormatString*/); @@ -1720,7 +1691,7 @@ class FlowControlNode : public ComputationNodeBase virtual std::wstring ToString(void) const override { NOT_IMPLEMENTED; } // these are meant to be called during computation, so provide dummy implementations virtual bool RequiresPreCompute() const override { return false; } // return true if the node's value should be computed before the normal training. e.g., mean and invStd of input features. 
- virtual void PrintSelfBeforeValidation() const override { } + virtual std::string FormatOperationPrototype(const std::string& extraArgs) const override { return ""; } virtual void DumpNodeInfo(const bool /*printValues*/, const bool /*printMetadata*/, File& fstream) const override {} protected: public: // needed in ComputationNetwork::FindInRecurrentLoops(), which really should be part of SEQTraversalFlowControlNode @@ -1847,7 +1818,7 @@ protected: using Base::MarkValueNonSharable; \ using Base::OutputUsedInComputingInputNodesGradients; \ using Base::PrintNodeValuesToFile; \ - using Base::PrintSelfBeforeValidation; \ + using Base::FormatOperationPrototype; \ using Base::ReleaseMatricesAfterBackprop; \ using Base::ReleaseMatricesAfterForwardProp; \ using Base::ReleaseMatrixToPool; \ diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h index 3c5adb033377..f5f60309776c 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.h +++ b/Source/ComputationNetworkLib/ReshapingNodes.h @@ -533,10 +533,9 @@ class RowRepeatNode : public ComputationNode, public NumInputs<1> fstream >> m_numRepeat; } - virtual void PrintSelfBeforeValidation() const override + virtual std::string FormatOperationPrototype(const std::string& extraArgs) const override { - Base::PrintSelfBeforeValidation(); - fprintf(stderr, ", numRepeats=%lu", m_numRepeat); + return Base::FormatOperationPrototype(extraArgs + msra::strfun::strprintf(", numRepeats=%lu", m_numRepeat)); } virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override @@ -847,22 +846,9 @@ class LegacyReshapeNode : public ReinterpretNodeBase m_targetImageLayout.Save(fstream); } - virtual void /*IComputationNode::*/ PrintSelfBeforeValidation() const override + virtual std::string /*IComputationNode::*/ FormatOperationPrototype(const std::string& extraArgs) const override { - fprintf(stderr, "\nValidating --> %ls = %ls", NodeName().c_str(), OperationName().c_str()); - fprintf(stderr, "("); - for (size_t i = 0; i < GetNumInputs(); i++) - { - ComputationNodePtr child = Input(i); - if (i > 0) - fprintf(stderr, ", "); - if (!child) - fprintf(stderr, "NULL"); - else - fprintf(stderr, "%ls[%s%s]", child->NodeName().c_str(), string(child->GetSampleLayout()).c_str(), child->HasMBLayout() ? " x *" : ""); - } - fprintf(stderr, ", NumOfRows=%lu, imageWidth=%lu, imageHeight=%lu, imageChannels=%lu)", m_numTargetRows, m_targetImageLayout[1], m_targetImageLayout[2], m_targetImageLayout[0]); - // BUGBUG: This interpretaion as image dims is only correct for the 'legacy format, not for cudnn. + return Base::FormatOperationPrototype(extraArgs + msra::strfun::strprintf(", NumOfRows=%lu, imageWidth=%lu, imageHeight=%lu, imageChannels=%lu)", m_numTargetRows, m_targetImageLayout[1], m_targetImageLayout[2], m_targetImageLayout[0])); } // TODO: Clarify/resolve the semantic overlap between BeginForwardProp() and UpdateFunctionMBSize(). 
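The tracing introduced in PATCH 02 and refined in PATCH 06 above is meant to be driven either through the SGD options traceNodeNamesReal/traceNodeNamesCategory, or by hand, since the flags are deliberately public ("you are meant to set these flags manually in the debugger or temporarily poke into them from code as needed"). A minimal sketch of the by-hand route follows; the node name "z" and the point of invocation are placeholders for illustration, not part of these patches:

    // hedged sketch: enable value tracing on one node after the network is built
    auto node = net->GetNodeFromName(L"z");              // any node of interest ("z" is a placeholder)
    node->EnableNodeTracing(/*isCategoryLabel=*/false);  // sets m_traceNodeValue; values print as real numbers
    node->m_traceNodeValueUpToDim = 3;                   // print only the first 3 values per sample
    node->m_traceNodeValueUpToT = 8;                     // and only the first 8 time steps
    // from now on, each EndForwardProp() on this node calls Trace(), which prints a header
    // via FormatOperationPrototype() and the values via WriteMinibatchWithFormatting()

The equivalent config route, as wired up in SGD.cpp in PATCH 02, is traceNodeNamesReal = z inside the SGD block (see the commented-out traceNodeNamesReal line added to rnn.cntk in the next patch).
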
From 52929f9f265443ab51ced9f89885c7c862d73267 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 11 Mar 2016 06:56:38 -0800 Subject: [PATCH 07/26] cleaned up log output --- Examples/Text/PennTreebank/Config/rnn.cntk | 11 ++++++----- Source/CNTK/BrainScript/BrainScriptEvaluator.cpp | 2 +- Source/CNTK/BrainScript/BrainScriptParser.cpp | 6 +++--- Source/CNTK/CNTK.cpp | 16 +++++++++------- .../ComputationNetworkEvaluation.cpp | 6 +++--- .../ComputationNetworkScripting.cpp | 4 ---- Source/ComputationNetworkLib/ComputationNode.h | 2 +- Source/SGDLib/SGD.cpp | 16 ++++++++-------- 8 files changed, 31 insertions(+), 32 deletions(-) diff --git a/Examples/Text/PennTreebank/Config/rnn.cntk b/Examples/Text/PennTreebank/Config/rnn.cntk index d7bcf156b059..c158d39e2ac7 100644 --- a/Examples/Text/PennTreebank/Config/rnn.cntk +++ b/Examples/Text/PennTreebank/Config/rnn.cntk @@ -54,11 +54,8 @@ writeWordAndClassInfo = [ train = [ action = "train" - minibatchSize = 128:256:512 # TODO: Why is this here and not inside SGD? traceLevel = 1 epochSize = 0 # (for quick tests, this can be overridden with something small) - defaultHiddenActivity = 0.1 # default value for hidden states--is this used by SimpleNetworkBuilder? - useValidation = true SimpleNetworkBuilder = [ rnnType = "CLASSLSTM" # TODO: camelCase @@ -70,6 +67,7 @@ train = [ initValueScale = 6.0 uniformInit = true layerSizes = "$confVocabSize$:150:200:10000" + defaultHiddenActivity = 0.1 # default value for hidden states addPrior = false addDropoutNodes = false applyMeanVarNorm = false @@ -81,6 +79,7 @@ train = [ ] SGD = [ + minibatchSize = 128:256:512 learningRatesPerSample = 0.1 momentumPerMB = 0 gradientClippingWithTruncation = true @@ -90,6 +89,10 @@ train = [ gradUpdateType = "none" loadBestModel = true + dropoutRate = 0.0 + + #traceNodeNamesReal = AutoName37 # this allows to track a node's value + # settings for Auto Adjust Learning Rate AutoAdjust = [ autoAdjustLR = "adjustAfterEpoch" @@ -102,8 +105,6 @@ train = [ numPrevLearnRates = 5 numBestSearchEpoch = 1 ] - - dropoutRate = 0.0 ] reader = [ diff --git a/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp b/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp index a8b2904d30a7..d333c1c679eb 100644 --- a/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp +++ b/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp @@ -53,7 +53,7 @@ class EvaluationException : public ConfigException __declspec_noreturn static inline void EvaluationError(const wstring &msg, TextLocation where) { - Microsoft::MSR::CNTK::DebugUtil::PrintCallStack(); + //Microsoft::MSR::CNTK::DebugUtil::PrintCallStack(); throw EvaluationException(msg, where); } diff --git a/Source/CNTK/BrainScript/BrainScriptParser.cpp b/Source/CNTK/BrainScript/BrainScriptParser.cpp index 834fe04334a2..dab31e32b782 100644 --- a/Source/CNTK/BrainScript/BrainScriptParser.cpp +++ b/Source/CNTK/BrainScript/BrainScriptParser.cpp @@ -184,7 +184,7 @@ class CodeSource __declspec_noreturn static void Fail(wstring msg, TextLocation where) { - Microsoft::MSR::CNTK::DebugUtil::PrintCallStack(); + //Microsoft::MSR::CNTK::DebugUtil::PrintCallStack(); throw CodeSourceException(msg, where); } @@ -375,7 +375,7 @@ class Lexer : public CodeSource private: __declspec_noreturn static void Fail(wstring msg, Token where) { - Microsoft::MSR::CNTK::DebugUtil::PrintCallStack(); + //Microsoft::MSR::CNTK::DebugUtil::PrintCallStack(); throw LexerException(msg, where.beginLocation); } @@ -606,7 +606,7 @@ class Parser : public Lexer __declspec_noreturn static void Fail(const wstring& msg, 
Token where) { - Microsoft::MSR::CNTK::DebugUtil::PrintCallStack(); + //Microsoft::MSR::CNTK::DebugUtil::PrintCallStack(); throw ParseException(msg, where.beginLocation); } diff --git a/Source/CNTK/CNTK.cpp b/Source/CNTK/CNTK.cpp index 3db4a6cb5d2f..c35ee681ca53 100644 --- a/Source/CNTK/CNTK.cpp +++ b/Source/CNTK/CNTK.cpp @@ -579,11 +579,13 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is std::string timestamp = TimeDateStamp(); // dump config info - fprintf(stderr, "running on %s at %s\n", GetHostName().c_str(), timestamp.c_str()); - fprintf(stderr, "command line: \n"); + fprintf(stderr, "\nRunning on %s at %s\n", GetHostName().c_str(), timestamp.c_str()); + fprintf(stderr, "Command line: \n"); for (int i = 0; i < argc; i++) fprintf(stderr, "%*s%ls", i > 0 ? 2 : 0, "", argv[i]); // use 2 spaces for better visual separability + fprintf(stderr, "\n\n"); +#if 1 //def _DEBUG // This simply merges all the different config parameters specified (eg, via config files or via command line directly), // and prints it. fprintf(stderr, "\n\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>\n"); @@ -601,12 +603,12 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n"); config.dumpWithResolvedVariables(); fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n"); +#endif - fprintf(stderr, "Commands: "); + fprintf(stderr, "Commands:"); for (int i = 0; i < command.size(); i++) - { - fprintf(stderr, "%s ", command[i].c_str()); - } + fprintf(stderr, " %s", command[i].c_str()); + fprintf(stderr, "\n"); // run commands std::string type = config(L"precision", "float"); @@ -614,7 +616,7 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is if (config.Exists("type")) InvalidArgument("CNTK: Use of 'type' parameter is deprecated, it is called 'precision' now."); - fprintf(stderr, "\nPrecision = \"%s\"\n", type.c_str()); + fprintf(stderr, "Precision = \"%s\"\n", type.c_str()); if (type == "float") DoCommands(config); else if (type == "double") diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index 8615dce8edd7..957c257da5fa 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -430,7 +430,7 @@ void ComputationNetwork::CompileNetwork() // STEP: Some final details. ResetEvalTimeStamps(); // invalidate all m_value fields. Really belongs into StartEvaluateMinibatchLoop() - fprintf(stderr, "\nPost-processing network complete.\n"); + fprintf(stderr, "\nPost-processing network complete.\n\n"); m_isCompiled = true; } @@ -531,11 +531,11 @@ void ComputationNetwork::ValidateNetwork() size_t toValidate = nodes.size(); while (toValidate > 0) { - fprintf(stderr, "\n\nValidating network. %d nodes to process in pass %d.\n", (int) toValidate, (int) pass); + fprintf(stderr, "\nValidating network. 
%d nodes to process in pass %d.\n\n", (int) toValidate, (int) pass); toValidate = ValidateNodes(nodes, /*isFirstPass=*/pass == 1, false /*isFinalValidationPass*/); pass++; } - fprintf(stderr, "\n\nValidating network, final pass.\n"); + fprintf(stderr, "\nValidating network, final pass.\n\n"); toValidate = ValidateNodes(nodes, /*isFirstPass=*/pass == 1, true /*isFinalValidationPass*/); if (toValidate != 0) LogicError("ValidateSubNetwork: ValidateNodes(true) unexpectedly returned with work left to do."); diff --git a/Source/ComputationNetworkLib/ComputationNetworkScripting.cpp b/Source/ComputationNetworkLib/ComputationNetworkScripting.cpp index acc3d004ab2d..e7f1be1803a9 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkScripting.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkScripting.cpp @@ -107,10 +107,6 @@ ComputationNetwork::ComputationNetwork(const IConfigRecordPtr configp) // perform all necessary post-processing CompileNetwork(); -#if 1 - wstring args = ToString(); - fprintf(stderr, "%ls\n", args.c_str()); -#endif } // =================================================================== diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 07a1928633b6..2f859c1d6495 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -1483,7 +1483,7 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot const std::string& valueFormatString) const; void Trace() { - if (m_traceNodeValue+1) + if (m_traceNodeValue) { fprintf(stderr, "Trace --> %s\n", FormatOperationPrototype("").c_str()); WriteMinibatchWithFormatting(stderr, m_traceNodeValueUpToDim, m_traceNodeValueUpToT, false/*transpose*/, m_traceNodeValueAsCategoryLabel, std::vector(), diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index 7c725a1e20ca..82404e2f2b5b 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -49,21 +49,21 @@ void SGD::Train(function createN } wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1); - bool loadNetworkFromCheckpoint = false; - if (startEpoch >= 0) - { - loadNetworkFromCheckpoint = true; - fprintf(stderr, "Starting from checkpoint. Loading network from '%ls'.\n", modelFileName.c_str()); - } + bool loadNetworkFromCheckpoint = startEpoch >= 0; + if (loadNetworkFromCheckpoint) + fprintf(stderr, "\nStarting from checkpoint. Loading network from '%ls'.\n", modelFileName.c_str()); + else + fprintf(stderr, "\nCreating virgin network.\n"); // create or load from checkpoint shared_ptr net = !loadNetworkFromCheckpoint ? createNetworkFn(deviceId) : ComputationNetwork::CreateFromFile(deviceId, modelFileName); // log the device we are computing on + fprintf(stderr, "%s model with %d nodes", loadNetworkFromCheckpoint ? "Loaded" : "Created", (int)net->GetTotalNumberOfNodes()); if (net->GetDeviceId() < 0) - fprintf(stderr, "\nSGD using CPU.\n"); + fprintf(stderr, " on CPU.\n"); else - fprintf(stderr, "\nSGD using GPU %d.\n", (int) net->GetDeviceId()); + fprintf(stderr, " on GPU %d.\n", (int) net->GetDeviceId()); // TODO: BUGBUG: if not starting from checkpoint, need to synchronize initial model // strategy should be to run the initializer above on mpiRank==0, and then broadcast parameters. 
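One detail in the command-line echo of this patch is easy to misread: the %*s conversion takes its field width from the argument list, so i > 0 ? 2 : 0 left-pads every argument after the first with two spaces and avoids a trailing separator. A stand-alone demo of just that idiom (plain C stdio with narrow argv, nothing CNTK-specific):

    #include <cstdio>

    int main(int argc, char* argv[])
    {
        for (int i = 0; i < argc; i++)
            fprintf(stderr, "%*s%s", i > 0 ? 2 : 0, "", argv[i]); // '*' pulls the pad width from the arguments
        fprintf(stderr, "\n");
        return 0;
    }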
From 63c50f6364a72c2bd3ea9d0c35fc186ec6991641 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 11 Mar 2016 16:57:14 -0800 Subject: [PATCH 08/26] added new node Where (only BS) --- Makefile | 1 + .../CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs | 1 + Source/Common/Include/Sequences.h | 16 ++ .../ComputationNetworkBuilder.cpp | 1 + .../ComputationNetworkLib.vcxproj | 3 +- .../ComputationNetworkLib.vcxproj.filters | 3 + .../ComputationNetworkLib/ComputationNode.h | 1 + .../LinearAlgebraNodes.h | 37 +--- .../ComputationNetworkLib/NonlinearityNodes.h | 16 +- Source/ComputationNetworkLib/RecurrentNodes.h | 16 +- .../ComputationNetworkLib/ReshapingNodes.cpp | 175 ++++++++++++++++++ Source/ComputationNetworkLib/ReshapingNodes.h | 60 +++--- .../SpecialPurposeNodes.h | 16 +- Source/ComputationNetworkLib/TrainingNodes.h | 16 +- Source/Math/CPUMatrix.cpp | 13 +- Source/Math/Matrix.h | 45 ++--- 16 files changed, 269 insertions(+), 151 deletions(-) create mode 100644 Source/ComputationNetworkLib/ReshapingNodes.cpp diff --git a/Makefile b/Makefile index 69cdbf3654e2..4aeac0b21b35 100644 --- a/Makefile +++ b/Makefile @@ -529,6 +529,7 @@ CNTK_SRC =\ $(SOURCEDIR)/CNTK/SynchronousExecutionEngine.cpp \ $(SOURCEDIR)/CNTK/tests.cpp \ $(SOURCEDIR)/ComputationNetworkLib/ComputationNode.cpp \ + $(SOURCEDIR)/ComputationNetworkLib/ReshapingNodes.cpp \ $(SOURCEDIR)/ComputationNetworkLib/ComputationNetwork.cpp \ $(SOURCEDIR)/ComputationNetworkLib/ComputationNetworkEvaluation.cpp \ $(SOURCEDIR)/ComputationNetworkLib/ComputationNetworkAnalysis.cpp \ diff --git a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs index c0757963f158..97498816251d 100644 --- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs +++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs @@ -98,6 +98,7 @@ SumElements(matrix, tag='') = new ComputationNode [ operation = 'SumElements' ; Tanh(z, tag='') = new ComputationNode [ operation = 'Tanh' ; inputs = z /*plus the function args*/ ] TimeReverse(vectorSequence, tag='') = new ComputationNode [ operation = 'TimeReverse' ; inputs = vectorSequence /*plus the function args*/ ] TransposeTimes(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'TransposeTimes' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ] +Where(cond, tag='') = new ComputationNode [ operation = 'Where' ; inputs = cond /*plus the function args*/ ] ############################################################################## # common macros diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index 286c9ac4f3f6..98f98aa51866 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -364,6 +364,22 @@ struct MBLayout return false; } + // ------------------------------------------------------------------- + // indexing + // ------------------------------------------------------------------- + + // get the matrix-column index for a given time step in a given sequence + size_t GetColumnIndex(const SequenceInfo& seq, size_t t) const + { + if (t > seq.GetNumTimeSteps()) + LogicError("GetColumnIndex: t out of sequence bounds."); + ptrdiff_t tIn = (ptrdiff_t)t + seq.tBegin; + if (tIn < 0 || (size_t)tIn >= GetNumTimeSteps()) + LogicError("GetColumnIndex: Attempted to access a time step that is accessing a portion of a sequence that is not included in current minibatch."); // we may encounter this for truncated BPTT + size_t col = (size_t)tIn * GetNumParallelSequences() + seq.s; + return (size_t)col; + 
} + private: // we are trying to access content--this verifies that the structure is consistent // All frames must now be declared. diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp index 35f8ce459712..a623d8da72bd 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp @@ -95,6 +95,7 @@ static shared_ptr> CreateStandardNode(const std::wstri else if (nodeType == OperationNameOf(TimesNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(TransposeDimensionsNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(TransposeTimesNode)) return New>(forward<_Types>(_Args)...); + else if (nodeType == OperationNameOf(WhereNode)) return New>(forward<_Types>(_Args)...); // legacy names we also support for back compat of model-files else if (nodeType == L"ColumnElementTimes") return New>(forward<_Types>(_Args)...); else if (nodeType == L"Delay") return New>(forward<_Types>(_Args)...); diff --git a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj index 79200885a378..2c08f30899a9 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj +++ b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj @@ -170,8 +170,9 @@ + - + \ No newline at end of file diff --git a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters index b6cac5de7e22..ac39ae84bf79 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters +++ b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters @@ -37,6 +37,9 @@ Network + + Nodes + diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 2f859c1d6495..b42e1098ca17 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -276,6 +276,7 @@ class ComputationNodeBase : public IComputationNode, ComputationNodeBase(DEVICEID_TYPE deviceId, const wstring& name) : m_deviceId(deviceId), m_outputNeededDuringBackprop(true), m_learningRateMultiplier(0), m_gradientInitialized(false), m_nodeName(name == L"" ? CreateUniqNodeName() : name) { + // TODO: should m_learningRateMultiplier be set to 0? Or should every node have a way to add its own say on the learning rate for all its inputs? 
} virtual ~ComputationNodeBase() { diff --git a/Source/ComputationNetworkLib/LinearAlgebraNodes.h b/Source/ComputationNetworkLib/LinearAlgebraNodes.h index 064e1bf806a3..bf4b9f22e461 100644 --- a/Source/ComputationNetworkLib/LinearAlgebraNodes.h +++ b/Source/ComputationNetworkLib/LinearAlgebraNodes.h @@ -137,20 +137,8 @@ class NegateNode : public ComputationNode, public NumInputs<1> Input(0)->GradientFor(fr) -= GradientFor(fr); } - virtual bool OutputUsedInComputingInputNodesGradients() const override - { - // The NegateNode does not require its output value for computing - // the gradients of its input nodes - return false; - } - - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override - { - // The NegateNode does not require any of it's input's values for computing - // the gradients of its input nodes - UNREFERENCED_PARAMETER(childIndex); - return false; - } + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { @@ -460,10 +448,7 @@ class ElementTimesNode : public BinaryElementWiseNode inputGradient.AddElementwiseProductOf(gradient, otherInputValue); } - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override - { - return true; - } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return true; } virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { @@ -689,20 +674,8 @@ class SumColumnElementsNode : public ComputationNode, public NumInputs sliceInputGrad += sliceOutputGrad; // here the assumption is that sliceOutputGrad is a row vector } - virtual bool OutputUsedInComputingInputNodesGradients() const override - { - // The SumColumnElementsNode does not require its output value for computing - // the gradients of its input nodes - return false; - } - - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override - { - // The SumColumnElementsNode does not require any of it's input's values for computing - // the gradients of its input nodes - UNREFERENCED_PARAMETER(childIndex); - return false; - } + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { diff --git a/Source/ComputationNetworkLib/NonlinearityNodes.h b/Source/ComputationNetworkLib/NonlinearityNodes.h index 983ea594630b..a2cd5421d631 100644 --- a/Source/ComputationNetworkLib/NonlinearityNodes.h +++ b/Source/ComputationNetworkLib/NonlinearityNodes.h @@ -230,13 +230,7 @@ class SoftmaxNode : public SoftmaxNodeBase { } - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override - { - // The plus node does not require any of it's input's values for computing - // the gradients of its input nodes - UNREFERENCED_PARAMETER(childIndex); - return false; - } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } /*virtual*/ void BackpropToV(Matrix& gradient, const Matrix& inputFunctionValues, Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& functionValues) { @@ -304,13 +298,7 @@ class LogSoftmaxNode : public SoftmaxNodeBase { } - 
virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override - { - // The plus node does not require any of it's input's values for computing - // the gradients of its input nodes - UNREFERENCED_PARAMETER(childIndex); - return false; - } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } /*virtual*/ void BackpropToV(Matrix& gradient, const Matrix& inputFunctionValues, Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& functionValues) { diff --git a/Source/ComputationNetworkLib/RecurrentNodes.h b/Source/ComputationNetworkLib/RecurrentNodes.h index d9203c1ebeeb..feb7e485a20b 100644 --- a/Source/ComputationNetworkLib/RecurrentNodes.h +++ b/Source/ComputationNetworkLib/RecurrentNodes.h @@ -232,20 +232,8 @@ class DelayedValueNodeBase : public ComputationNode, public IRecurrent } } - virtual bool OutputUsedInComputingInputNodesGradients() const override - { - // The DelayedValueNode does not require its output value for computing - // the gradients of its input nodes - return false; - } - - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override - { - // The DelayedValueNode does not require any of it's input's values for computing - // the gradients of its input nodes - UNREFERENCED_PARAMETER(childIndex); - return false; - } + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } virtual void EndForwardProp() override // called after last iteration step of ForwardProp() { diff --git a/Source/ComputationNetworkLib/ReshapingNodes.cpp b/Source/ComputationNetworkLib/ReshapingNodes.cpp new file mode 100644 index 000000000000..dc1734a44987 --- /dev/null +++ b/Source/ComputationNetworkLib/ReshapingNodes.cpp @@ -0,0 +1,175 @@ +// +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 
+//
+// ReshapingNodes.cpp -- collection of nodes that reshape or sub-sample matrices leading to layout changes
+//
+
+#include "Basics.h"
+#include "ReshapingNodes.h"
+#include "Matrix.h"
+#include "ComputationNode.h"
+#include "Sequences.h"
+
+#include <unordered_set>
+#include <map>
+#include <string>
+#include <vector>
+#include <stdexcept>
+#include <list>
+#include <memory>
+#include <algorithm>
+#include <assert.h>
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+// -----------------------------------------------------------------------
+// Where(bitVector) -- extract indices of non-0 values in a sequence
+// -----------------------------------------------------------------------
+
+// TODO: move to MBLayout as a static method
+// packing algorithm
+//  - width: maximum width of structure; set to maximum over sequence lengths
+//  - inputSequences: vector of input SequenceInfo records (only seqId and GetNumTimeSteps() are used)
+//  - [out] *pMBLayout: MBLayout that describes the created packed sequence set
+//  - placement, rowAllocations: temp buffers (passed in to be able to optimize memory allocations)
+template <class SequenceInfoVector>
+static void PackSequences(const SequenceInfoVector& inputSequences,
+                          /*ref->out*/MBLayoutPtr pMBLayout,
+                          /*temp buffer*/std::vector<std::pair<size_t, size_t>>& placement,
+                          /*temp buffer*/std::vector<size_t> rowAllocations)
+{
+    placement.resize(inputSequences.size()); // [sequence index] result goes here (entries are invalid for gaps)
+    // determine width of MBLayout
+    size_t width = 0;
+    for (size_t i = 0; i < inputSequences.size(); i++)
+        if (inputSequences[i].seqId == GAP_SEQUENCE_ID)
+            continue;
+        else if (width < inputSequences[i].GetNumTimeSteps())
+            width = inputSequences[i].GetNumTimeSteps();
+    // allocate
+    rowAllocations.clear(); // [row] we build rows one by one
+    for (size_t i = 0; i < inputSequences.size(); i++)
+    {
+        if (inputSequences[i].seqId == GAP_SEQUENCE_ID)
+            continue;
+        let len = inputSequences[i].GetNumTimeSteps();
+        // first see if we find a row that has enough space
+        size_t s;
+        for (s = 0; s < rowAllocations.size(); s++)
+            if (rowAllocations[s] + len <= width)
+                break; // yep, it fits
+        // if we did not find an s that fits, create a new one
+        if (s == rowAllocations.size())
+            rowAllocations.push_back(0);
+        // sequence goes to (s, rowAllocations[s])
+        placement[i] = make_pair(s, rowAllocations[s]);
+        // and allocate it
+        rowAllocations[s] += len;
+    }
+    // create MBLayout
+    pMBLayout->Init(rowAllocations.size(), width);
+    for (size_t i = 0; i < inputSequences.size(); i++)
+    {
+        if (inputSequences[i].seqId == GAP_SEQUENCE_ID)
+            continue;
+        size_t s, tBegin; tie(s, tBegin) = placement[i];
+        pMBLayout->AddSequence(inputSequences[i].seqId, s, (ptrdiff_t)tBegin, tBegin + inputSequences[i].GetNumTimeSteps());
+    }
+    // need to fill the gaps as well
+    for (size_t s = 0; s < rowAllocations.size(); s++)
+        pMBLayout->AddGap(s, (size_t)rowAllocations[s], width);
+}
+
+// wrapper class to pass MBLayout sequence vector to PackSequences()
+struct SequenceLengthVector
+{
+    typedef vector<vector<size_t>> SequenceVector;
+    typedef MBLayout::SequenceInfo SequenceInfo;
+    const SequenceVector& sequenceVector; //
+    const vector<SequenceInfo>& sequenceInfo; // original sequence info (for seqId)
+    SequenceLengthVector(const vector<SequenceInfo>& sequenceInfo, const SequenceVector& sequenceVector) : sequenceInfo(sequenceInfo), sequenceVector(sequenceVector) { }
+    size_t size() const { return sequenceInfo.size(); }
+    MBLayout::SequenceInfo operator[](size_t i) const // return a descriptor of the new sequence
+    {
+        SequenceInfo seq;
+        seq.seqId = sequenceInfo[i].seqId;
+        seq.s = i;
+        seq.tBegin = 0;
+        seq.tEnd = sequenceVector[i].size();
+        return seq;
+    }
+    void operator=(const SequenceLengthVector&) = delete;
+};
+
+// TODO: Where should the MBLayout be created--in BeginForwardProp() or ForwardProp()?
+//       BeginForwardProp() should generally have no access to the actual values,
+//       while ForwardProp() might be too late. We may have to define the semantics here.
+// BUGBUG: This is the first node with value-dependent MBLayout. It resizes Value(), which we otherwise always do before.
+template <class ElemType>
+/*virtual*/ void WhereNode<ElemType>::ForwardPropNonLooping() /*override*/
+{
+    // gather all sequences
+    let& inMBLayout = Input(0)->GetMBLayout();
+    let& input = Input(0)->Value();
+    let& sequences = inMBLayout->GetAllSequences();
+    auto& indexSequences = m_indexSequenceBuffer;
+    if (indexSequences.size() < sequences.size())
+        indexSequences.resize(sequences.size());
+    for (size_t i = 0; i < sequences.size(); i++)
+    {
+        let& seq = sequences[i];
+        if (seq.seqId == GAP_SEQUENCE_ID)
+            continue;
+        auto& indexSequence = indexSequences[i];
+        indexSequence.clear();
+        for (size_t t = 0; t < seq.GetNumTimeSteps(); t++)
+            if (input(0, inMBLayout->GetColumnIndex(seq, t))) // this is the condition check that this node performs; the meat
+                indexSequence.push_back(t);
+        // Note: The above accesses m_value directly on the CPU, putting it into BOTH state, possibly for other consumers as well.
+    }
+    // create a new MBLayout
+    let& outMBLayout = GetMBLayout();
+    PackSequences(SequenceLengthVector(sequences, indexSequences), outMBLayout, /*temp*/m_placementBuffer, /*temp*/m_rowAllocationsBuffer);
+    // copy to output
+    vector<ElemType> buf(outMBLayout->GetNumCols(), numeric_limits<ElemType>::quiet_NaN()); // STL cannot easily avoid initializing, so we might as well init with NaN for gaps
+    for (size_t i = 0; i < sequences.size(); i++)
+    {
+        let& seq = outMBLayout->GetAllSequences()[i];
+        if (seq.seqId == GAP_SEQUENCE_ID) // gaps will keep the NaN
+            continue;
+        let& indexSequence = indexSequences[i];
+        for (size_t t = 0; t < seq.GetNumTimeSteps(); t++)
+            buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType)indexSequence[t];
+    }
+    Value().SetValue(outMBLayout->GetNumParallelSequences(), outMBLayout->GetNumTimeSteps(), Input(0)->Value().GetDeviceId(), buf.data(), MatrixFormat::matrixFormatColMajor);
+}
+
+template <class ElemType>
+/*virtual*/ void WhereNode<ElemType>::BackpropToNonLooping(size_t /*inputIndex*/) /*override*/
+{
+    // we cannot backprop through a condition
+    // Can we?
+    return;
+}
+
+template <class ElemType>
+/*virtual*/ void WhereNode<ElemType>::Validate(bool isFinalValidationPass) /*override*/
+{
+    ComputationNodeBase::Validate(isFinalValidationPass);
+    // we generate our own MBLayout
+    if (isFinalValidationPass && !Input(0)->HasMBLayout())
+        InvalidArgument("%ls %ls operation can only operate on minibatch data (which have a layout).", NodeName().c_str(), OperationName().c_str());
+    if (!m_pMBLayout)
+        m_pMBLayout = make_shared<MBLayout>(); // this generates a new layout
+    // we map scalars to scalars
+    if (isFinalValidationPass && Input(0)->GetSampleLayout().GetNumElements() != 1)
+        InvalidArgument("%ls %ls operation can only operate on scalar input.", NodeName().c_str(), OperationName().c_str());
+    SetDims(TensorShape(1), true);
+}
+
+template class WhereNode<float>;
+template class WhereNode<double>;
+
+}}}
diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h
index f5f60309776c..273d8260ea9f 100644
--- a/Source/ComputationNetworkLib/ReshapingNodes.h
+++ b/Source/ComputationNetworkLib/ReshapingNodes.h
@@ -561,20 +561,8 @@ class RowRepeatNode : public ComputationNode<ElemType>, public NumInputs<1>
         Input(0)->GradientFor(fr).AddToRowRepeatValuesOf(GradientFor(fr), m_numRepeat);
     }
 
-    virtual bool OutputUsedInComputingInputNodesGradients() const override
-    {
-        // The RowRepeatNode does not require its output value for computing
-        // the gradients of its input nodes
-        return false;
-    }
-
-    virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
-    {
-        // The RowRepeatNode does not require any of it's input's values for computing
-        // the gradients of its input nodes
-        UNREFERENCED_PARAMETER(childIndex);
-        return false;
-    }
+    virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
+    virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
 
 private:
     size_t m_numRepeat;
@@ -584,19 +572,49 @@ template class RowRepeatNode<float>;
 template class RowRepeatNode<double>;
 
 // -----------------------------------------------------------------------
-// DiagonalNode -- extract diagonal elements of a square matrix into a row vector
+// WhereNode -- extract indices of non-0 values in a sequence
+// As this implies a runtime-value dependent reduction in dimension, it can
+// only be applied to time sequences, and not other tensor dimensions.
+// The result will have a different MBLayout reflecting the shortened result sequences.
 // -----------------------------------------------------------------------
 
 template <class ElemType>
-class DiagonalNode : public ComputationNodeNonLooping<ElemType>, public NumInputs<1>
+class WhereNode : public ComputationNodeNonLooping<ElemType>, public NumInputs<1>
 {
-    typedef ComputationNodeNonLooping<ElemType> Base;
-    UsingComputationNodeMembersBoilerplate;
-    static const std::wstring TypeName()
+    typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
+    static const std::wstring TypeName() { return L"Where"; }
+
+public:
+    DeclareConstructorFromConfigWithNumInputs(WhereNode);
+    WhereNode(DEVICEID_TYPE deviceId, const wstring& name) :
+        Base(deviceId, name)
     {
-        return L"Diagonal";
+        m_learningRateMultiplier = 0.0f; // we cannot backprop; this will disable it
+        // TODO: This ^^ is a bit of a hack. Do we need a better mechanism for nodes to tell that they cannot backprop? We will have more of those.
+        // This might even not work, need to track down how this is inferred/propagated upwards. It is really only for LearnableParameters.
} + virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; + virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t /*inputIndex*/) override; + virtual void Validate(bool isFinalValidationPass) override; + +private: + // buffers for creating the result sequences (kept as object state to avoid memory allocations) + std::vector> m_indexSequenceBuffer; // [sequenceIndex][t] for creating the result sequences + std::vector m_rowAllocationsBuffer; // [row] for determining new MBLayout packing + std::vector> m_placementBuffer; // [sequenceIndex] assigned location for a sequence +}; + +// ----------------------------------------------------------------------- +// DiagonalNode -- extract diagonal elements of a square matrix into a row vector +// ----------------------------------------------------------------------- + +template +class DiagonalNode : public ComputationNodeNonLooping, public NumInputs<1> +{ + typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() { return L"Diagonal"; } + public: DeclareConstructorFromConfigWithNumInputs(DiagonalNode); DiagonalNode(DEVICEID_TYPE deviceId, const wstring& name) @@ -642,7 +660,7 @@ class DiagonalNode : public ComputationNodeNonLooping, public NumInput m_pMBLayout = nullptr; if (isFinalValidationPass && Input(0)->HasMBLayout()) - InvalidArgument("%ls %ls operation cannot operate on minibatch data (which have a layout)", NodeName().c_str(), OperationName().c_str()); + InvalidArgument("%ls %ls operation cannot operate on minibatch data (which have a layout).", NodeName().c_str(), OperationName().c_str()); size_t dim = Input(0)->GetAsMatrixNumCols(); if (isFinalValidationPass && dim != Input(0)->GetAsMatrixNumRows()) diff --git a/Source/ComputationNetworkLib/SpecialPurposeNodes.h b/Source/ComputationNetworkLib/SpecialPurposeNodes.h index 438cb48fe790..20f3847ceaea 100644 --- a/Source/ComputationNetworkLib/SpecialPurposeNodes.h +++ b/Source/ComputationNetworkLib/SpecialPurposeNodes.h @@ -106,20 +106,8 @@ class GMMLogLikelihoodNode : public ComputationNode, public NumInputs< } } - virtual bool OutputUsedInComputingInputNodesGradients() const override - { - // The GMMLogLikelihoodNode does not require its output value for computing - // the gradients of its input nodes - return false; - } - - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override - { - // The GMMLogLikelihoodNode does not require any of it's input's values for computing - // the gradients of its input nodes - UNREFERENCED_PARAMETER(childIndex); - return false; - } + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } void BackpropToUnnormedPrior(Matrix& unnormedPriorGradientValues, const Matrix& gradientValues, const Matrix& prior, const Matrix& posterior, Matrix& temp) diff --git a/Source/ComputationNetworkLib/TrainingNodes.h b/Source/ComputationNetworkLib/TrainingNodes.h index 8ffeb4e5197a..1450f1787dae 100644 --- a/Source/ComputationNetworkLib/TrainingNodes.h +++ b/Source/ComputationNetworkLib/TrainingNodes.h @@ -1454,20 +1454,8 @@ class DropoutNode : public ComputationNode, public NumInputs<1> sliceInput0Grad += sliceOutputGrad; } - virtual bool OutputUsedInComputingInputNodesGradients() const override - { - // The DropoutNode does not require its output value for computing - // the gradients of its input 
nodes - return false; - } - - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override - { - // The DropoutNode does not require any of it's input's values for computing - // the gradients of its input nodes - UNREFERENCED_PARAMETER(childIndex); - return false; - } + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } virtual void UpdateFunctionMBSize() override { diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index 4e2211c57f3e..e7b20d426e4d 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -852,16 +852,10 @@ void CPUMatrix::SetValue(const size_t numRows, const size_t numCols, E { Resize(numRows, numCols); - if (IsEmpty()) + if (!IsEmpty()) { - InvalidArgument("NumRows or NumCols is 0. Nothing to copy"); - } - else - { - if (!(matrixFlags & matrixFormatRowMajor)) // compatible to internal structure - { + if (!(matrixFlags & matrixFormatRowMajor)) // compatible with internal structure memcpy(m_pArray, pArray, GetNumElements() * sizeof(ElemType)); - } else // need to transpose { auto& us = *this; @@ -900,9 +894,6 @@ void CPUMatrix::SetValue(const size_t numRows, const size_t numCols, E template void CPUMatrix::SetDiagonalValue(const ElemType v) { - if (IsEmpty()) - LogicError("SetDiagonalValue: Matrix is empty."); - if (GetNumRows() != GetNumCols()) LogicError("SetDiagonalValue: NumRows and NumCols do not agree."); diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index aabf295f95e5..a900d0042dbb 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -124,27 +124,12 @@ class MATH_API Matrix : public MatrixBase void ShallowCopyFrom(const Matrix& other); public: - MatrixType GetMatrixType() const - { - return m_matrixType; - } - MatrixFormat GetFormat() const - { - return m_baseMatrix->GetFormat(); - } - bool OwnBuffer() const - { - return m_baseMatrix->OwnBuffer(); - } + MatrixType GetMatrixType() const { return m_matrixType; } + MatrixFormat GetFormat() const { return m_baseMatrix->GetFormat(); } + bool OwnBuffer() const { return m_baseMatrix->OwnBuffer(); } int GetDeviceId() const; // -1 if CPU, otherwise GPU CUDA device id - DEVICEID_TYPE GetPreferredDeviceId() const - { - return m_preferredDeviceId; - }; // -1 if CPU, otherwise GPU CUDA device id - void SetPreferredDeviceId(DEVICEID_TYPE preferredDeviceId) - { - m_preferredDeviceId = preferredDeviceId; - } + DEVICEID_TYPE GetPreferredDeviceId() const { return m_preferredDeviceId; }; // -1 if CPU, otherwise GPU CUDA device id + void SetPreferredDeviceId(DEVICEID_TYPE preferredDeviceId) { m_preferredDeviceId = preferredDeviceId; } // Moves matrix from device id_from to device with id_to. 
// If emptyTransfer=true, then no data is ever moved, just corresponding GPU/CPU matrices are deleted and then created using empty constructor void TransferFromDeviceToDevice(int id_from, int id_to, bool ismoved = false, /*if false then keep source and set location to BOTH*/ bool emptyTransfer = false, bool updatePreferredDevice = true) const; @@ -235,12 +220,12 @@ class MATH_API Matrix : public MatrixBase void SetValue(const Matrix& deepCopyFrom, const MatrixFormat format = matrixFormatSparseCSR); // BUGBUG: default for 'format' is unexpected void SetValue(const size_t numRows, const size_t numCols, int deviceId, ElemType* pArray, const size_t matrixFlags = matrixFlagNormal); void SetValue(const size_t rIdx, const size_t cIdx, ElemType val); // set matrix sparsely - void SetValue(const size_t numRows, const size_t numCols, std::initializer_list l) + void SetValue(const size_t numRows, const size_t numCols, std::initializer_list l) // SetValue(2,3, {1,2,3, 4,5,6}); { std::vector vals(l); assert(vals.size() == numRows * numCols); SetValue(numRows, numCols, GetDeviceId(), vals.data(), matrixFormatRowMajor); - } // SetValue(2,3, {1,2,3, 4,5,6}); + } static ElemType MakeNan(size_t payload); void Invalidate() { @@ -271,35 +256,35 @@ class MATH_API Matrix : public MatrixBase Matrix& AssignTransposeOf(const Matrix& a); Matrix& operator+=(const ElemType alpha); - Matrix operator+(const ElemType alpha) const; + Matrix operator+(const ElemType alpha) const; Matrix& AssignSumOf(const ElemType alpha, const Matrix& a); Matrix& operator+=(const Matrix& a); - Matrix operator+(const Matrix& a) const; + Matrix operator+(const Matrix& a) const; Matrix& AssignSumOf(const Matrix& a, const Matrix& b); Matrix& operator-=(const ElemType alpha); - Matrix operator-(const ElemType alpha) const; + Matrix operator-(const ElemType alpha) const; Matrix& AssignDifferenceOf(const ElemType alpha, const Matrix& a); Matrix& AssignDifferenceOf(const Matrix& a, const ElemType alpha); Matrix& operator-=(const Matrix& a); - Matrix operator-(const Matrix& a) const; + Matrix operator-(const Matrix& a) const; Matrix& AssignDifferenceOf(const Matrix& a, const Matrix& b); Matrix& operator*=(const ElemType alpha); - Matrix operator*(const ElemType alpha) const; + Matrix operator*(const ElemType alpha) const; Matrix& AssignProductOf(const ElemType alpha, const Matrix& a); - Matrix operator*(const Matrix& a) const; + Matrix operator*(const Matrix& a) const; Matrix& AssignProductOf(const Matrix& a, const bool transposeA, const Matrix& b, const bool transposeB); // this = a * b Matrix& Assign1x1ProductOf(const Matrix& a1x1, const Matrix& b); // this = a * b, where a is 1x1 Matrix& operator/=(ElemType alpha); - Matrix operator/(ElemType alpha) const; + Matrix operator/(ElemType alpha) const; Matrix& operator^=(ElemType alpha); // element-wise power - Matrix operator^(ElemType alpha) const; // element-wise power + Matrix operator^(ElemType alpha) const; // element-wise power Matrix& AssignElementPowerOf(const Matrix& a, const ElemType power); // TODO: There are several functions below that perform an in-place operation From d9fecc5fbc779ce7f7a677267d6059e5da1af0d9 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 11 Mar 2016 17:00:13 -0800 Subject: [PATCH 09/26] WhereNode should keep its result on the CPU --- Source/ComputationNetworkLib/ReshapingNodes.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Source/ComputationNetworkLib/ReshapingNodes.cpp b/Source/ComputationNetworkLib/ReshapingNodes.cpp index 
dc1734a44987..36e93d943f2b 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.cpp +++ b/Source/ComputationNetworkLib/ReshapingNodes.cpp @@ -143,7 +143,8 @@ template for (size_t t = 0; t < seq.GetNumTimeSteps(); t++) buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType)indexSequence[t]; } - Value().SetValue(outMBLayout->GetNumParallelSequences(), outMBLayout->GetNumTimeSteps(), Input(0)->Value().GetDeviceId(), buf.data(), MatrixFormat::matrixFormatColMajor); + // the result will be kept in CPUDEVICE, since most likely we will access it again in PackedIndexNode + Value().SetValue(outMBLayout->GetNumParallelSequences(), outMBLayout->GetNumTimeSteps(), CPUDEVICE, buf.data(), MatrixFormat::matrixFormatColMajor); } template From a15d8d86a2911155934215e8a19432a247f2b047 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 11 Mar 2016 18:23:09 -0800 Subject: [PATCH 10/26] added new node PackedIndex(); bug fix in stock output writer, must skip gap sequences --- .../CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs | 4 ++ Source/Common/Include/Sequences.h | 9 ++- .../ComputationNetworkBuilder.cpp | 4 +- .../ComputationNetworkLib/ComputationNode.cpp | 6 +- .../ComputationNetworkLib/ReshapingNodes.cpp | 57 ++++++++++++++++++- Source/ComputationNetworkLib/ReshapingNodes.h | 37 +++++++++++- 6 files changed, 109 insertions(+), 8 deletions(-) diff --git a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs index 97498816251d..9ae9ef974646 100644 --- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs +++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs @@ -64,12 +64,14 @@ Delay = PastValue BatchNormalization(input, scale, bias, runMean, runInvStdDev, eval, spatial, normalizationTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runInvStdDev) /*plus the function args*/ ] ClassBasedCrossEntropyWithSoftmax(labelClassDescriptorVectorSequence, mainInputInfo, mainWeight, classLogProbsBeforeSoftmax, tag='') = new ComputationNode [ operation = 'ClassBasedCrossEntropyWithSoftmax' ; inputs = (labelClassDescriptorVectorSequence : mainInputInfo : mainWeight : classLogProbsBeforeSoftmax) /*plus the function args*/ ] ColumnElementTimes(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNode [ operation = 'ColumnElementTimes' ; inputs = (aVectorSequence : anotherVectorSequence) /*plus the function args*/ ] +// TODO: ColumnElementTimes = ElementTimes CosDistance(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNode [ operation = 'CosDistance' ; inputs = (aVectorSequence : anotherVectorSequence) /*plus the function args*/ ] CosDistanceWithNegativeSamples(aVectorSequence, anotherVectorSequence, numShifts, numNegSamples, tag='') = new ComputationNode [ operation = 'CosDistanceWithNegativeSamples' ; inputs = (aVectorSequence : anotherVectorSequence : numShifts : numNegSamples) /*plus the function args*/ ] Cosine(x, tag='') = new ComputationNode [ operation = 'Cosine' ; inputs = x /*plus the function args*/ ] CrossEntropy(refProbVectorSequence, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropy' ; inputs = (refProbVectorSequence : outProbVectorSequence) /*plus the function args*/ ] CrossEntropyWithSoftmax(labelVectorSequence, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropyWithSoftmax' ; inputs = (labelVectorSequence : outProbVectorSequence) /*plus the 
function args*/ ] DiagTimes(diagonalMatrixAsColumnVector, matrix, tag='') = new ComputationNode [ operation = 'DiagTimes' ; inputs = (diagonalMatrixAsColumnVector : matrix) /*plus the function args*/ ] +// TODO: DiagTimes = ElementTimes Dropout(activationVectorSequence, tag='') = new ComputationNode [ operation = 'Dropout' ; inputs = activationVectorSequence /*plus the function args*/ ] ElementTimes(aMatrix, anotherMatrix, tag='') = new ComputationNode [ operation = 'ElementTimes' ; inputs = (aMatrix : anotherMatrix) /*plus the function args*/ ] ErrorPrediction(labelVectorSequence, outVectorSequence, tag='') = new ComputationNode [ operation = 'ErrorPrediction' ; inputs = (labelVectorSequence : outVectorSequence) /*plus the function args*/ ] @@ -84,11 +86,13 @@ MatrixL2Reg(matrix, tag='') = new ComputationNode [ operation = 'MatrixL2Reg' ; Mean(dataVectorSequence, tag='') = new ComputationNode [ operation = 'Mean' ; inputs = dataVectorSequence /*plus the function args*/ ] Minus(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'Minus' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ] Negate(input, tag='') = new ComputationNode [ operation = 'Negate' ; inputs = input /*plus the function args*/ ] +PackedIndex(targetObject, indexSequence, tag='') = new ComputationNode [ operation = 'PackedIndex' ; inputs = (targetObject : indexSequence) /*plus the function args*/ ] PerDimMeanVarDeNormalization(dataVectorSequence, meanVector, invStdDevVector, tag='') = new ComputationNode [ operation = 'PerDimMeanVarDeNormalization' ; inputs = (dataVectorSequence : meanVector : invStdDevVector) /*plus the function args*/ ] PerDimMeanVarNormalization(dataVectorSequence, meanVector, invStdDevVector, tag='') = new ComputationNode [ operation = 'PerDimMeanVarNormalization' ; inputs = (dataVectorSequence : meanVector : invStdDevVector) /*plus the function args*/ ] Plus(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'Plus' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ] RectifiedLinear(z, tag='') = new ComputationNode [ operation = 'RectifiedLinear' ; inputs = z /*plus the function args*/ ] Scale(scalarScalingFactor, matrix, tag='') = new ComputationNode [ operation = 'Scale' ; inputs = (scalarScalingFactor : matrix) /*plus the function args*/ ] +// TODO: Scale = ElementTimes Sigmoid(z, tag='') = new ComputationNode [ operation = 'Sigmoid' ; inputs = z /*plus the function args*/ ] Softmax(z, tag='') = new ComputationNode [ operation = 'Softmax' ; inputs = z /*plus the function args*/ ] Hardmax(z, tag='') = new ComputationNode [ operation = 'Hardmax' ; inputs = z /*plus the function args*/ ] diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index 98f98aa51866..72b2c02ffdd7 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -328,7 +328,7 @@ struct MBLayout } // find a sequence by its id - const SequenceInfo &FindSequence(UniqueSequenceId seqId) const + const SequenceInfo& FindSequence(UniqueSequenceId seqId) const { for (const auto &seqInfo : m_sequences) if (seqInfo.seqId == seqId) @@ -373,11 +373,14 @@ struct MBLayout { if (t > seq.GetNumTimeSteps()) LogicError("GetColumnIndex: t out of sequence bounds."); - ptrdiff_t tIn = (ptrdiff_t)t + seq.tBegin; + if (seq.s > GetNumParallelSequences()) + LogicError("GetColumnIndex: seq.s out of sequence bounds."); // can only happen if 'seq' does not come out of our own m_sequences array, which is verboten + ptrdiff_t tIn = 
(ptrdiff_t)t + seq.tBegin; // shifted time index if (tIn < 0 || (size_t)tIn >= GetNumTimeSteps()) LogicError("GetColumnIndex: Attempted to access a time step that is accessing a portion of a sequence that is not included in current minibatch."); // we may encounter this for truncated BPTT size_t col = (size_t)tIn * GetNumParallelSequences() + seq.s; - return (size_t)col; + assert(col < GetNumCols()); + return col; } private: diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp index a623d8da72bd..3a99c5f7fce5 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp @@ -67,10 +67,11 @@ static shared_ptr> CreateStandardNode(const std::wstri else if (nodeType == OperationNameOf(MinusNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(NegateNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(NoiseContrastiveEstimationNode)) return New>(forward<_Types>(_Args)...); + else if (nodeType == OperationNameOf(PackedIndexNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(PastValueNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(PerDimMeanVarNormalizationNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(PerDimMeanVarDeNormalizationNode)) return New>(forward<_Types>(_Args)...); - else if (nodeType == OperationNameOf(TransposeDimensionsNode)) return New>(forward<_Types>(_Args)...); + else if (nodeType == OperationNameOf(TransposeDimensionsNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(PlusNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(ReconcileMBLayoutNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(RectifiedLinearNode)) return New>(forward<_Types>(_Args)...); @@ -99,6 +100,7 @@ static shared_ptr> CreateStandardNode(const std::wstri // legacy names we also support for back compat of model-files else if (nodeType == L"ColumnElementTimes") return New>(forward<_Types>(_Args)...); else if (nodeType == L"Delay") return New>(forward<_Types>(_Args)...); + // TODO: DiagTimes is also an alias of ElementTimes; current separate implementation is unnecessary. else if (nodeType == L"PerDimMeanVarNormalizationNode") return New>(forward<_Types>(_Args)...); else if (nodeType == L"PerDimMeanVarDeNormalizationNode") return New>(forward<_Types>(_Args)...); else if (nodeType == L"RowElementTimes") return New>(forward<_Types>(_Args)...); diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp index 387ee95969fa..01d398e762b8 100644 --- a/Source/ComputationNetworkLib/ComputationNode.cpp +++ b/Source/ComputationNetworkLib/ComputationNode.cpp @@ -318,6 +318,8 @@ void ComputationNode::WriteMinibatchWithFormatting(FILE* f, size_t onl for (size_t s = 0; s < sequences.size(); s++) { const auto& seqInfo = sequences[s]; + if (seqInfo.seqId == GAP_SEQUENCE_ID) // nothing in gaps to print + continue; size_t tBegin = seqInfo.tBegin >= 0 ? seqInfo.tBegin : 0; size_t tEnd = seqInfo.tEnd <= width ? seqInfo.tEnd : width; @@ -370,7 +372,7 @@ void ComputationNode::WriteMinibatchWithFormatting(FILE* f, size_t onl fprintfOrDie(f, "%s", sampleSeparator.c_str()); if (j == jstop) { - fprintf(f, "... 
(%d more)", (int)(jend - jstop)); // 'nuff said
+                fprintf(f, "...+%d", (int)(jend - jstop)); // 'nuff said
                 break;
             }
             for (size_t i = 0; i < iend; i++)
             {
                 if (i > 0)
                     fprintfOrDie(f, "%s", elementSeparator.c_str());
                 if (i == istop)
                 {
-                    fprintf(f, "...");
+                    fprintf(f, "...+%d", (int)(iend - istop));
                     break;
                 }
                 else if (formatChar == 'f') // print as real number
diff --git a/Source/ComputationNetworkLib/ReshapingNodes.cpp b/Source/ComputationNetworkLib/ReshapingNodes.cpp
index 36e93d943f2b..3bef16b20063 100644
--- a/Source/ComputationNetworkLib/ReshapingNodes.cpp
+++ b/Source/ComputationNetworkLib/ReshapingNodes.cpp
@@ -144,7 +144,7 @@ template <class ElemType>
             buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType)indexSequence[t];
     }
     // the result will be kept in CPUDEVICE, since most likely we will access it again in PackedIndexNode
-    Value().SetValue(outMBLayout->GetNumParallelSequences(), outMBLayout->GetNumTimeSteps(), CPUDEVICE, buf.data(), MatrixFormat::matrixFormatColMajor);
+    Value().SetValue(1, outMBLayout->GetNumCols(), CPUDEVICE, buf.data(), MatrixFormat::matrixFormatColMajor);
 }
 
 template <class ElemType>
@@ -173,4 +173,59 @@ template <class ElemType>
 template class WhereNode<float>;
 template class WhereNode<double>;
 
+template <class ElemType>
+/*virtual*/ void PackedIndexNode<ElemType>::ForwardPropNonLooping() /*override*/
+{
+    let& targetMBLayout = Input(TARGETDATA)->GetMBLayout(); // only used for index conversion
+    let& indexMBLayout = Input(INDEXDATA)->GetMBLayout();
+    let& index = Input(INDEXDATA)->Value(); // per-seq index values that are to be mapped
+    auto& result = Value(); // packed index values as mapped to targetData's layout
+    // loop over targetSequences
+    // Input matrix contains time indices for each sequence that refer to frames inside that sequence.
+    // We replace every per-sequence index by the resolved column index w.r.t. the same MBLayout.
+    let& targetSequences = targetMBLayout->GetAllSequences();
+    for (size_t i = 0; i < targetSequences.size(); i++)
+    {
+        let& targetSeq = targetSequences[i];
+        if (targetSeq.seqId == GAP_SEQUENCE_ID)
+            continue;
+        let& indexSeq = indexMBLayout->FindSequence(targetSeq.seqId); // find corresponding entry in indexMBLayout
+        for (size_t tIndex = 0; tIndex < indexSeq.GetNumTimeSteps(); tIndex++) // map all index values in index sequence
+        {
+            let jIndex = indexMBLayout->GetColumnIndex(indexSeq, tIndex); // map time index to actual location in the matrix storage object
+            let tTarget = (size_t)index(0, jIndex); // the new time location (relative to target sequence)
+            let jTarget = targetMBLayout->GetColumnIndex(targetSeq, tTarget); // map new time index as well. This performs a range check.
+            result(0, jIndex) = (ElemType)jTarget;
+        }
+    }
+}
+
+template <class ElemType>
+/*virtual*/ void PackedIndexNode<ElemType>::BackpropToNonLooping(size_t /*inputIndex*/) /*override*/
+{
+    // we cannot backprop through a condition
+    // Can we?
+    return;
+}
+
+template <class ElemType>
+/*virtual*/ void PackedIndexNode<ElemType>::Validate(bool isFinalValidationPass) /*override*/
+{
+    ComputationNodeBase::Validate(isFinalValidationPass);
+
+    // inherit both MBLayout and sample dimension (scalar) from indexData
+    // Because we map (per-seq) index sequence to (packed) index sequence. Target is only for index calculation.
+    m_pMBLayout = Input(INDEXDATA)->GetMBLayout();
+    if (isFinalValidationPass && (!Input(INDEXDATA)->HasMBLayout() || !Input(TARGETDATA)->HasMBLayout()))
+        LogicError("%ls %ls operation requires both inputs to be minibatch data (must have MBLayouts).", NodeName().c_str(), OperationName().c_str());
+
+    if (isFinalValidationPass && Input(INDEXDATA)->GetSampleLayout().GetNumElements() != 1)
+        InvalidArgument("%ls %ls operation requires the second argument (indexData) to be a scalar sequence.", NodeName().c_str(), OperationName().c_str());
+
+    SetDims(Input(INDEXDATA));
+}
+
+template class PackedIndexNode<float>;
+template class PackedIndexNode<double>;
+
 }}}
diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h
index 273d8260ea9f..89faa76e01b8 100644
--- a/Source/ComputationNetworkLib/ReshapingNodes.h
+++ b/Source/ComputationNetworkLib/ReshapingNodes.h
@@ -572,7 +572,7 @@ template class RowRepeatNode<float>;
 template class RowRepeatNode<double>;
 
 // -----------------------------------------------------------------------
-// WhereNode -- extract indices of non-0 values in a sequence
+// WhereNode(cond) -- extract indices of non-0 values in a sequence
 // As this implies a runtime-value dependent reduction in dimension, it can
 // only be applied to time sequences, and not other tensor dimensions.
 // The result will have a different MBLayout reflecting the shortened result sequences.
@@ -605,6 +605,41 @@ class WhereNode : public ComputationNodeNonLooping<ElemType>, public NumInputs<1>
     std::vector<std::pair<size_t, size_t>> m_placementBuffer; // [sequenceIndex] assigned location for a sequence
 };
 
+// -----------------------------------------------------------------------
+// PackedIndexNode(targetObject, indexSequence) -- convert sequence indices
+// to internal packed column indices w.r.t. targetObject.
+// Intended use is
+//  - Gather  (cond, x) = GatherPacked  (PackedIndex (x, Where (xCond)), x)
+//  - Scatter (cond, y) = ScatterPacked (yCond, PackedIndex (y, Where (yCond)), y)
+// This maps sequence-specific time indices t to GetColumnIndex(seq,t),
+// as input for subsequent GatherPacked() or ScatterPacked() operations.
+// -----------------------------------------------------------------------
+
+template <class ElemType>
+class PackedIndexNode : public ComputationNodeNonLooping<ElemType>, public NumInputs<2>
+{
+    typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
+    static const std::wstring TypeName() { return L"PackedIndex"; }
+
+    // our inputs
+    static const size_t TARGETDATA = 0;
+    static const size_t INDEXDATA = 1;
+
+public:
+    DeclareConstructorFromConfigWithNumInputs(PackedIndexNode);
+    PackedIndexNode(DEVICEID_TYPE deviceId, const wstring& name) :
+        Base(deviceId, name)
+    {
+        m_learningRateMultiplier = 0.0f; // we cannot backprop; this will disable it
+        // TODO: This ^^ is a bit of a hack. Do we need a better mechanism for nodes to tell that they cannot backprop? We will have more of those.
+        // This might even not work, need to track down how this is inferred/propagated upwards. It is really only for LearnableParameters.
+ } + + virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; + virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t /*inputIndex*/) override; + virtual void Validate(bool isFinalValidationPass) override; +}; + // ----------------------------------------------------------------------- // DiagonalNode -- extract diagonal elements of a square matrix into a row vector // ----------------------------------------------------------------------- From 2575da305ad1bd297fdaca02b08fe1d3de849913 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Sat, 12 Mar 2016 16:42:32 -0800 Subject: [PATCH 11/26] implemented GatherPackedNode and ScatterPackedNode, so far CPU only --- .../CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs | 25 ++- Source/Common/Include/Sequences.h | 61 +++++- .../ComputationNetworkBuilder.cpp | 2 + .../ComputationNetworkLib/ComputationNode.h | 6 + .../ComputationNetworkLib/ReshapingNodes.cpp | 181 +++++++++++------- Source/ComputationNetworkLib/ReshapingNodes.h | 111 ++++++++--- Source/Math/CPUMatrix.cpp | 101 +++++++++- Source/Math/CPUMatrix.h | 20 +- Source/Math/Matrix.cpp | 78 ++++++-- Source/Math/Matrix.h | 5 +- 10 files changed, 452 insertions(+), 138 deletions(-) diff --git a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs index 9ae9ef974646..ece5ffd5412f 100644 --- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs +++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs @@ -61,6 +61,7 @@ AveragePooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSu ColumnwiseCrossProduct = KhatriRaoProduct // deprecated ClassificationError = ErrorPrediction Delay = PastValue + BatchNormalization(input, scale, bias, runMean, runInvStdDev, eval, spatial, normalizationTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runInvStdDev) /*plus the function args*/ ] ClassBasedCrossEntropyWithSoftmax(labelClassDescriptorVectorSequence, mainInputInfo, mainWeight, classLogProbsBeforeSoftmax, tag='') = new ComputationNode [ operation = 'ClassBasedCrossEntropyWithSoftmax' ; inputs = (labelClassDescriptorVectorSequence : mainInputInfo : mainWeight : classLogProbsBeforeSoftmax) /*plus the function args*/ ] ColumnElementTimes(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNode [ operation = 'ColumnElementTimes' ; inputs = (aVectorSequence : anotherVectorSequence) /*plus the function args*/ ] @@ -76,6 +77,7 @@ Dropout(activationVectorSequence, tag='') = new ComputationNode [ operation = 'D ElementTimes(aMatrix, anotherMatrix, tag='') = new ComputationNode [ operation = 'ElementTimes' ; inputs = (aMatrix : anotherMatrix) /*plus the function args*/ ] ErrorPrediction(labelVectorSequence, outVectorSequence, tag='') = new ComputationNode [ operation = 'ErrorPrediction' ; inputs = (labelVectorSequence : outVectorSequence) /*plus the function args*/ ] Exp(x, tag='') = new ComputationNode [ operation = 'Exp' ; inputs = x /*plus the function args*/ ] +GatherPacked(indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'GatherPacked' ; inputs = (indexSequence : sourceData) /*plus the function args*/ ] GMMLogLikelihood(unnormalizedPriorVector, meansAsRows, logStdDevAsRows, dataVectorSequence, tag='') = new ComputationNode [ operation = 'GMMLogLikelihood' ; inputs = (unnormalizedPriorVector : meansAsRows : logStdDevAsRows : dataVectorSequence) /*plus 
the function args*/ ] InvStdDev(dataVectorSequence, tag='') = new ComputationNode [ operation = 'InvStdDev' ; inputs = dataVectorSequence /*plus the function args*/ ] KhatriRaoProduct(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'KhatriRaoProduct' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ] @@ -93,6 +95,7 @@ Plus(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'Plus' RectifiedLinear(z, tag='') = new ComputationNode [ operation = 'RectifiedLinear' ; inputs = z /*plus the function args*/ ] Scale(scalarScalingFactor, matrix, tag='') = new ComputationNode [ operation = 'Scale' ; inputs = (scalarScalingFactor : matrix) /*plus the function args*/ ] // TODO: Scale = ElementTimes +ScatterPacked(cond, indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'ScatterPacked' ; inputs = (cond : indexSequence : sourceData) /*plus the function args*/ ] Sigmoid(z, tag='') = new ComputationNode [ operation = 'Sigmoid' ; inputs = z /*plus the function args*/ ] Softmax(z, tag='') = new ComputationNode [ operation = 'Softmax' ; inputs = z /*plus the function args*/ ] Hardmax(z, tag='') = new ComputationNode [ operation = 'Hardmax' ; inputs = z /*plus the function args*/ ] @@ -185,19 +188,21 @@ Sequences = [ Map (lambda, x) = lambda (x) // that one's easy # Reverse (x) is a C++ node currently called TimeReverse - Filter (pred, x) = x // TODO: Implement this as a C++ node. + # Gather and Scatter + # We go through 3 nodes each to take advantage of x + Gather (cond, x) = GatherPacked ( PackedIndex (x, Where (cond)), x) + Scatter (cond, y) = ScatterPacked (cond, PackedIndex (y, Where (cond)), y) # sequence-altering LINQ-like operators # These generate new data packing (MBLayouts) # TakeWhile and DropWhile TakeWhile (predicate, x) = Filter ( _WhilePredicate (PastValue, predicate), x) - DropWhile (predicate, x) = Filter (!_WhilePredicate (PastValue, predicate), x) - # Skip, SkipWhile--same? + SkipWhile (predicate, x) = Filter (!_WhilePredicate (PastValue, predicate), x) _WhilePredicate (DelayFn, predicate, input) = [ - whilePredicate = Boolean.And (DelayFn (whilePredicate, defaultHiddenActivation=Boolean.True), predicate) - ].whilePredicate + whilePredicateRec = Boolean.And (DelayFn (whilePredicateRec, defaultHiddenActivation=Boolean.True), predicate) + ].whilePredicateRec # TODO: do we need operations from the back? # First and Take @@ -206,14 +211,18 @@ Sequences = [ Take (N, x) = _Take (PastValue, N, x) _Take (DelayFn, N, x) = [ selected = Loop._IsWithin (DelayFn, N, x) - out = Filter (selected, x) + out = Gather (selected, x) ].out Skip (N, x) = _Skip (PastValue, N, x) _Skip (DelayFn, N, x) = [ // TODO: merge with _Take selected = Loop._IsWithin (DelayFn, N, x) - out = Filter (!selected, x) + out = Gather (!selected, x) ].out - ElementAt (n, x) = First (Skip (n, x)) // not efficient, as it filters twice. Better AND the predicates. TODO: what if n is out of range? ElementAtOrDefault + ElementAt (n, x) = [ // not efficient, as it filters twice. Better AND the predicates. TODO: what if n is out of range? ElementAtOrDefault + startMask = Skip (n, x) // ...000111... + mask = startMask - PastValue (0, startMask) // ...000100... + out = Gather (mask, x) + ] Single (predicate, x) = x #FirstOrDefault (x) = ? // can empty sequences exist or even be represented by CNTK? 
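The Gather/Scatter rewrite above composes three nodes: Where() yields the per-sequence time indices of the nonzero condition values, PackedIndex() resolves those to flat column indices of the packed minibatch matrix, and GatherPacked()/ScatterPacked() copy columns accordingly. The following is a minimal single-sequence C++ sketch of that pipeline (illustrative only, not CNTK code; with a single sequence, the MBLayout column mapping degenerates to the identity):

#include <cstdio>
#include <vector>

int main()
{
    std::vector<float> x    = { 10, 20, 30, 40, 50 };
    std::vector<bool>  cond = { true, false, true, false, true };

    // Where (cond): time indices of the nonzero condition values
    std::vector<size_t> where;
    for (size_t t = 0; t < cond.size(); t++)
        if (cond[t])
            where.push_back(t);

    // PackedIndex (x, where): per-sequence time index -> packed column index;
    // with a single sequence this is the identity mapping
    std::vector<size_t> packedIndex(where);

    // GatherPacked (packedIndex, x): copy out the selected columns
    std::vector<float> gathered;
    for (size_t j : packedIndex)
        gathered.push_back(x[j]);

    for (float v : gathered)
        printf("%g ", v); // prints: 10 30 50
    printf("\n");
    return 0;
}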
diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index 72b2c02ffdd7..829c54c173ff 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -175,6 +175,60 @@ struct MBLayout m_writable = true; } + // packing algorithm + // - width: maximum width of structure; set to maximum over sequence lengths + // - inputSequences: vector of input SequenceInfo records (only seqId and GetNumTimeSteps() are used) + // - [out] *this: MBLayout that describes the created packed sequence set + // - placement, rowAllocations: temp buffers (passed in to be able to optimize memory allocations) + template + void InitAsPackedSequences(const SequenceInfoVector& inputSequences, + /*temp buffer*/std::vector>& placement, + /*temp buffer*/std::vector rowAllocations) + { + placement.resize(inputSequences.size()); // [sequence index] result goes here (entries are invalid for gaps) + // determine width of MBLayout + size_t width = 0; + for (size_t i = 0; i < inputSequences.size(); i++) + if (inputSequences[i].seqId == GAP_SEQUENCE_ID) + continue; + else if (width < inputSequences[i].GetNumTimeSteps()) + width = inputSequences[i].GetNumTimeSteps(); + // allocate + rowAllocations.clear(); // [row] we build rows one by one + for (size_t i = 0; i < inputSequences.size(); i++) + { + if (inputSequences[i].seqId == GAP_SEQUENCE_ID) + continue; + let len = inputSequences[i].GetNumTimeSteps(); + // first see if we find a row that has enough space + // TODO: Should we use a proper priority_queue? + size_t s; + for (s = 0; s < rowAllocations.size(); s++) + if (rowAllocations[s] + len <= width) + break; // yep, it fits + // if we did not find a row that fits, create a new one + if (s == rowAllocations.size()) + rowAllocations.push_back(0); + // sequence goes to (s, rowAllocations[s]) + placement[i] = make_pair(s, rowAllocations[s]); + // and allocate it + rowAllocations[s] += len; + } + // create MBLayout + Init(rowAllocations.size(), width); + for (size_t i = 0; i < inputSequences.size(); i++) + { + if (inputSequences[i].seqId == GAP_SEQUENCE_ID) + continue; + size_t s, tBegin; tie + (s, tBegin) = placement[i]; + AddSequence(inputSequences[i].seqId, s, (ptrdiff_t)tBegin, tBegin + inputSequences[i].GetNumTimeSteps()); + } + // need to fill the gaps as well + for (size_t s = 0; s < rowAllocations.size(); s++) + AddGap(s, (size_t)rowAllocations[s], width); + } // ------------------------------------------------------------------- // accessors // ------------------------------------------------------------------- @@ -1003,7 +1057,7 @@ static inline std::pair TensorSliceWithMBLayou // 'Reduce' style operations--the criterion nodes and gradient computation--call this. // Warning: The layout used here must match the matrix. E.g. don't pass a child's matrix from a criterion node (use Input(x)->MaskMissing{Values,Gradient}ColumnsToZero() instead).
template -static inline void MaskMissingColumnsTo(Matrix &matrixToMask, const MBLayoutPtr &pMBLayout, const FrameRange &fr, ElemType val) +static inline void MaskMissingColumnsTo(Matrix& matrixToMask, const MBLayoutPtr& pMBLayout, const FrameRange& fr, ElemType val) { if (pMBLayout && pMBLayout->HasGaps(fr)) { @@ -1013,11 +1067,12 @@ static inline void MaskMissingColumnsTo(Matrix &matrixToMask, const MB auto matrixSliceToMask = DataWithMBLayoutFor(matrixToMask, fr, pMBLayout); TensorView(matrixSliceToMask).DoMaskNegativeOf(0, TensorView(matrixSliceToMask), TensorView(maskSlice), 1); val; #else - const auto &maskMatrix = pMBLayout->GetColumnsValidityMask(matrixToMask.GetDeviceId()); + const auto& maskMatrix = pMBLayout->GetColumnsValidityMask(matrixToMask.GetDeviceId()); auto maskSlice = DataWithMBLayoutFor(maskMatrix, fr, pMBLayout); auto matrixSliceToMask = DataWithMBLayoutFor(matrixToMask, fr, pMBLayout); matrixSliceToMask.MaskColumnsValue(maskSlice, val); #endif } } -} } } + +}}} diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp index 3a99c5f7fce5..2fed249ca72a 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp @@ -52,6 +52,7 @@ static shared_ptr> CreateStandardNode(const std::wstri else if (nodeType == OperationNameOf(ErrorPredictionNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(ExpNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(FutureValueNode)) return New>(forward<_Types>(_Args)...); + else if (nodeType == OperationNameOf(GatherPackedNode)) return New>(forward<_Types>(_Args)...); #ifdef COMING_SOON else if (nodeType == OperationNameOf(GMMLogLikelihoodNode)) return New>(forward<_Types>(_Args)...); #endif @@ -79,6 +80,7 @@ static shared_ptr> CreateStandardNode(const std::wstri else if (nodeType == OperationNameOf(RowRepeatNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(RowSliceNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(RowStackNode)) return New>(forward<_Types>(_Args)...); + else if (nodeType == OperationNameOf(ScatterPackedNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(SequenceWithSoftmaxNode)) return New>(forward<_Types>(_Args)...); #ifdef COMING_SOON else if (nodeType == OperationNameOf(SequenceDecoderNode)) return New>(forward<_Types>(_Args)...); diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index b42e1098ca17..a7a9c521db22 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -1057,6 +1057,12 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot MaskMissingColumnsToZero(*m_gradient, m_pMBLayout, fr); } + // for index vectors: Invalid entries must be set to -1. 
+ void MaskMissingValueColumnsTo(const FrameRange& fr, ElemType val) + { + MaskMissingColumnsTo(*m_value, m_pMBLayout, fr, val); + } + // for debugging, set the gaps to NaN instead (to track whether it bubbles up somewhere) void InvalidateMissingValueColumns(const FrameRange& fr) override final { diff --git a/Source/ComputationNetworkLib/ReshapingNodes.cpp b/Source/ComputationNetworkLib/ReshapingNodes.cpp index 3bef16b20063..eaf2217057a0 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.cpp +++ b/Source/ComputationNetworkLib/ReshapingNodes.cpp @@ -27,61 +27,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Where(bitVector) -- extract indices of non-0 values in a sequence // ----------------------------------------------------------------------- -// TODO: move to MBLayout as a static method -// packing algorithm -// - width: maximum width of structure; set to maximum over sequence lengths -// - inputSequences: vector of input SequenceInfo records (only seqId and GetNumTimeSteps() are used) -// - [out] *pMBLayout: MBLayout that describes the created packed sequence set -// - placement, rowAllocations: temp buffers (passed in to be able to optimize memory allocations) -template -static void PackSequences(const SequenceInfoVector& inputSequences, - /*ref->out*/MBLayoutPtr pMBLayout, - /*temp buffer*/std::vector>& placement, - /*temp buffer*/std::vector rowAllocations) -{ - placement.resize(inputSequences.size()); // [sequence index] result goes here (entries are invalid for gaps) - // determine width of MBLayout - size_t width = 0; - for (size_t i = 0; i < inputSequences.size(); i++) - if (inputSequences[i].seqId == GAP_SEQUENCE_ID) - continue; - else if (width < inputSequences[i].GetNumTimeSteps()) - width = inputSequences[i].GetNumTimeSteps(); - // allocate - rowAllocations.clear(); // [row] we build rows one by one - for (size_t i = 0; i < inputSequences.size(); i++) - { - if (inputSequences[i].seqId == GAP_SEQUENCE_ID) - continue; - let len = inputSequences[i].GetNumTimeSteps(); - // first see if we find a row that has enough space - size_t s; - for (s = 0; s < rowAllocations.size(); s++) - if (rowAllocations[s] + len <= width) - break; // yep, it fits - // we did not find a s that fit then create a new one - if (s == rowAllocations.size()) - rowAllocations.push_back(0); - // sequence goes to (s, rowAllocations[s]) - placement[i] = make_pair(s, rowAllocations[s]); - // and allocate it - rowAllocations[s] += len; - } - // create MBLayout - pMBLayout->Init(rowAllocations.size(), width); - for (size_t i = 0; i < inputSequences.size(); i++) - { - if (inputSequences[i].seqId == GAP_SEQUENCE_ID) - continue; - size_t s, tBegin; tie - (s, tBegin) = placement[i]; - pMBLayout->AddSequence(inputSequences[i].seqId, s, (ptrdiff_t)tBegin, tBegin + inputSequences[i].GetNumTimeSteps()); - } - // need to fill the gaps as well - for (size_t s = 0; s < rowAllocations.size(); s++) - pMBLayout->AddGap(s, (size_t)rowAllocations[s], width); -} - // wrapper class to pass MBLayout sequence vector to PackSequences() struct SequenceLengthVector { @@ -131,7 +76,7 @@ template } // create a new MBLayout let& outMBLayout = GetMBLayout(); - PackSequences(SequenceLengthVector(sequences, indexSequences), outMBLayout, /*temp*/m_placementBuffer, /*temp*/m_rowAllocationsBuffer); + outMBLayout->InitAsPackedSequences(SequenceLengthVector(sequences, indexSequences), /*temp*/m_placementBuffer, /*temp*/m_rowAllocationsBuffer); // copy to output vector buf(outMBLayout->GetNumCols(), numeric_limits::quiet_NaN()); 
// STL cannot easily avoid initializing, so we might as well init with NaN for gaps for (size_t i = 0; i < sequences.size(); i++) @@ -173,29 +118,33 @@ template template class WhereNode; template class WhereNode; +// ----------------------------------------------------------------------- +// PackedIndexNode(targetObject, indexSequence) -- map sequence indices to packed column indices +// ----------------------------------------------------------------------- + template /*virtual*/ void PackedIndexNode::ForwardPropNonLooping() /*override*/ { - let& targetMBLayout = Input(TARGETDATA)->GetMBLayout(); // only used for index conversion + let& sourceMBLayout = Input(SOURCEDATA)->GetMBLayout(); // only used for index conversion let& indexMBLayout = Input(INDEXDATA)->GetMBLayout(); let& index = Input(INDEXDATA)->Value(); // per-seq index values that are to be mapped - auto& result = Value(); // packed index values as mapped to targetData's layout - // loop over targetSequences + auto& result = Value(); // packed index values as mapped to sourceData's layout + // loop over sourceSequences // Input matrix contains time indices for each sequence that refer to frames inside that sequence. // We replace every per-sequence index by the resolved column index w.r.t. the same MBLayout. - let& targetSequences = targetMBLayout->GetAllSequences(); - for (size_t i = 0; i < targetSequences.size(); i++) + let& sourceSequences = sourceMBLayout->GetAllSequences(); + for (size_t i = 0; i < sourceSequences.size(); i++) { - let& targetSeq = targetSequences[i]; - if (targetSeq.seqId == GAP_SEQUENCE_ID) + let& sourceSeq = sourceSequences[i]; + if (sourceSeq.seqId == GAP_SEQUENCE_ID) continue; - let& indexSeq = indexMBLayout->FindSequence(targetSeq.seqId); // find corresponding entry in indexMBLayout + let& indexSeq = indexMBLayout->FindSequence(sourceSeq.seqId); // find corresponding entry in indexMBLayout for (size_t tIndex = 0; tIndex < indexSeq.GetNumTimeSteps(); tIndex++) // map all index values in index sequence { let jIndex = indexMBLayout->GetColumnIndex(indexSeq, tIndex); // map time index to actual location in the matrix storage object - let tTarget = (size_t)index(0, jIndex); // the new time location (relative to target sequence) - let jTarget = targetMBLayout->GetColumnIndex(targetSeq, tTarget); // map new time index as well. This performs a range check. - result(0, jIndex) = (ElemType)jTarget; + let tSource = (size_t)index(0, jIndex); // the new time location (relative to source sequence) + let jSource = sourceMBLayout->GetColumnIndex(sourceSeq, tSource); // map new time index as well. This performs a range check. + result(0, jIndex) = (ElemType)jSource; } } } @@ -216,7 +165,7 @@ template // inherit both MBLayout and sample dimension (scalar) from indexData // Because we map (per-seq) index sequence to (packed) index sequence. Target is only for index calculation.
m_pMBLayout = Input(INDEXDATA)->GetMBLayout(); - if (isFinalValidationPass && (!Input(INDEXDATA)->HasMBLayout() || !Input(TARGETDATA)->HasMBLayout())) + if (isFinalValidationPass && (!Input(INDEXDATA)->HasMBLayout() || !Input(SOURCEDATA)->HasMBLayout())) LogicError("%ls %ls operation requires both inputs to be minibatch data (must have MBLayouts).", NodeName().c_str(), OperationName().c_str()); if (isFinalValidationPass && Input(INDEXDATA)->GetSampleLayout().GetNumElements() != 1) @@ -228,4 +177,100 @@ template template class PackedIndexNode; template class PackedIndexNode; +// ----------------------------------------------------------------------- +// GatherPackedNode(packedIndex, sourceData) -- gather operation +// ----------------------------------------------------------------------- + +template +/*virtual*/ void GatherPackedNode::ForwardPropNonLooping() /*override*/ +{ + Input(INDEXDATA)->MaskMissingValueColumnsTo(FrameRange(Input(INDEXDATA)->GetMBLayout()), -1); // indicates an invalid column to Gather/Scatter + let& index = Input(INDEXDATA)->Value(); // column indices to copy from + let& source = Input(SOURCEDATA)->Value(); // source data to copy + auto& output = Value(); // output goes here + output.DoGatherColumnsOf(/*beta=*/0, index, source, /*alpha=*/1); +} + +template +/*virtual*/ void GatherPackedNode::BackpropToNonLooping(size_t inputIndex) /*override*/ +{ + if (inputIndex == SOURCEDATA) + { + let& index = Input(INDEXDATA)->Value(); // column indices to copy from + auto& sourceGradient = Input(SOURCEDATA)->Gradient(); // source to propagate the gradient input + auto& outputGradient = Gradient(); // output gradient to propagate + sourceGradient.DoScatterColumnsOf(/*beta=*/1, index, outputGradient, /*alpha=*/1); + } +} + +template +/*virtual*/ void GatherPackedNode::Validate(bool isFinalValidationPass) /*override*/ +{ + ComputationNodeBase::Validate(isFinalValidationPass); + + // inherit MBLayout from indexData + m_pMBLayout = Input(INDEXDATA)->GetMBLayout(); + if (isFinalValidationPass && (!Input(INDEXDATA)->HasMBLayout() || !Input(SOURCEDATA)->HasMBLayout())) + LogicError("%ls %ls operation requires both inputs to be minibatch data (must have MBLayouts).", NodeName().c_str(), OperationName().c_str()); + + if (isFinalValidationPass && Input(INDEXDATA)->GetSampleLayout().GetNumElements() != 1) + InvalidArgument("%ls %ls operation requires the first argument (indexData) to be a scalar sequence.", NodeName().c_str(), OperationName().c_str()); + + // inherit tensor dimension from sourceData + SetDims(Input(SOURCEDATA)); +} + +template class GatherPackedNode; +template class GatherPackedNode; + +// ----------------------------------------------------------------------- +// ScatterPackedNode(layoutData, packedIndex, sourceData) -- scatter operation +// ----------------------------------------------------------------------- + +template +/*virtual*/ void ScatterPackedNode::ForwardPropNonLooping() /*override*/ +{ + if (*Input(INDEXDATA)->GetMBLayout() != *Input(SOURCEDATA)->GetMBLayout()) + InvalidArgument("%ls %ls operation requires the minibatch layout of index and source data to be the same.", NodeName().c_str(), OperationName().c_str()); + Input(INDEXDATA)->MaskMissingValueColumnsTo(FrameRange(Input(INDEXDATA)->GetMBLayout()), -1); // indicates an invalid column to Gather/Scatter + let& index = Input(INDEXDATA)->Value(); // column indices to copy from + let& source = Input(SOURCEDATA)->Value(); // source data to copy + auto& output = Value(); // output goes here + 
output.DoScatterColumnsOf(/*beta=*/0, index, source, /*alpha=*/1); +} + +template +/*virtual*/ void ScatterPackedNode::BackpropToNonLooping(size_t inputIndex) /*override*/ +{ + if (inputIndex == SOURCEDATA) + { + let& index = Input(INDEXDATA)->Value(); // column indices to copy from + auto& sourceGradient = Input(SOURCEDATA)->Gradient(); // source to propagate the gradient input + auto& outputGradient = Gradient(); // output gradient to propagate + sourceGradient.DoGatherColumnsOf(/*beta=*/1, index, outputGradient, /*alpha=*/1); + } +} + +template +/*virtual*/ void ScatterPackedNode::Validate(bool isFinalValidationPass) /*override*/ +{ + ComputationNodeBase::Validate(isFinalValidationPass); + + // inherit MBLayout from layoutData (that's the only thing we use it for) + m_pMBLayout = Input(LAYOUTDATA)->GetMBLayout(); + if (isFinalValidationPass && (!Input(LAYOUTDATA)->HasMBLayout() || !Input(INDEXDATA)->HasMBLayout() || !Input(SOURCEDATA)->HasMBLayout())) + LogicError("%ls %ls operation requires all inputs to be minibatch data (must have MBLayouts).", NodeName().c_str(), OperationName().c_str()); + + if (isFinalValidationPass && Input(INDEXDATA)->GetSampleLayout().GetNumElements() != 1) + InvalidArgument("%ls %ls operation requires the second argument (indexData) to be a scalar sequence.", NodeName().c_str(), OperationName().c_str()); + + // TODO: We also know that indexData and sourceData must have the same MBLayout. But that is checked at runtime. + + // inherit tensor dimension from sourceData + SetDims(Input(SOURCEDATA)); +} + +template class ScatterPackedNode; +template class ScatterPackedNode; + }}} diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h index 89faa76e01b8..dbade27997b3 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.h +++ b/Source/ComputationNetworkLib/ReshapingNodes.h @@ -46,12 +46,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { template class ReshapeNode : public UnaryElementWiseNode { - typedef UnaryElementWiseNode Base; - UsingUnaryElementwiseNodeBaseMembers; - static const std::wstring TypeName() - { - return L"Reshape"; - } + typedef UnaryElementWiseNode Base; UsingUnaryElementwiseNodeBaseMembers; + static const std::wstring TypeName() { return L"Reshape"; } public: ReshapeNode(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& replacementSampleLayout = TensorShape(), int beginDim = 1, int endDim = 0) @@ -185,12 +181,8 @@ template class ReshapeNode; template class ReconcileMBLayoutNode : public ComputationNode, public NumInputs<2> { - typedef ComputationNode Base; - UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() - { - return L"ReconcileMBLayout"; - } + typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() { return L"ReconcileMBLayout"; } public: DeclareConstructorFromConfigWithNumInputs(ReconcileMBLayoutNode); @@ -241,13 +233,13 @@ template class ReconcileMBLayoutNode; // ----------------------------------------------------------------------- // RowSliceNode (input) // This node extracts a slice of the first tensor dimension (row). +// TODO: Extend to specifying the axis. Time slicing would have to be done in BrainScript using Gather. 
// ----------------------------------------------------------------------- template class RowSliceNode : public ComputationNode, public NumInputs<1> { - typedef ComputationNode Base; - UsingComputationNodeMembersBoilerplate; + typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; static const std::wstring TypeName() { return L"RowSlice"; } public: @@ -351,8 +343,7 @@ template class RowSliceNode; template class RowStackNode : public ComputationNode // note: not deriving from NumInputs<> like most other nodes, because this one takes a variable number of inputs { - typedef ComputationNode Base; - UsingComputationNodeMembersBoilerplate; + typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; static const std::wstring TypeName() { return L"RowStack"; } public: @@ -492,12 +483,8 @@ template class RowStackNode; template class RowRepeatNode : public ComputationNode, public NumInputs<1> { - typedef ComputationNode Base; - UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() - { - return L"RowRepeat"; - } + typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() { return L"RowRepeat"; } public: RowRepeatNode(DEVICEID_TYPE deviceId, const wstring& name, size_t numRepeats = 1) @@ -589,13 +576,12 @@ class WhereNode : public ComputationNodeNonLooping, public NumInputs<1 WhereNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name) { - m_learningRateMultiplier = 0.0f; // we cannot backprop; this will disable it - // TODO: This ^^ is a bit of a hack. Do we need a better mechanism for nodes to tell that they cannot backprop? We will have more of those. - // This might even not work, need to track down how this is inferred/propagated upwards. It is really only for LearnableParameters. } virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t /*inputIndex*/) override; + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } virtual void Validate(bool isFinalValidationPass) override; private: @@ -610,7 +596,7 @@ class WhereNode : public ComputationNodeNonLooping, public NumInputs<1 // to internal packed column indices w.r.t. targetObject. // Intended use is // - Gather (cond, x) = GatherPacked (PackedIndex (x, Where (xCond)), x) -// - Scatter (cond, y) = ScatterPacked (PackedIndex (y, Where (yCond)), y) +// - Scatter (cond, y) = ScatterPacked (yCond, PackedIndex (y, Where (yCond)), y) // This maps sequence-specific time indices t to GetColumnIndex(seq,t), // as input for subsequent GatherPacked() or ScatterPacked() operations. // ----------------------------------------------------------------------- @@ -622,7 +608,7 @@ class PackedIndexNode : public ComputationNodeNonLooping, public NumIn static const std::wstring TypeName() { return L"PackedIndex"; } // our inputs - static const size_t TARGETDATA = 0; + static const size_t SOURCEDATA = 0; static const size_t INDEXDATA = 1; public: @@ -630,13 +616,78 @@ class PackedIndexNode : public ComputationNodeNonLooping, public NumIn PackedIndexNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name) { - m_learningRateMultiplier = 0.0f; // we cannot backprop; this will disable it - // TODO: This ^^ is a bit of a hack. Do we need a better mechanism for nodes to tell that they cannot backprop? 
We will have more of those. - // This might even not work, need to track down how this is inferred/propagated upwards. It is really only for LearnableParameters. } virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t /*inputIndex*/) override; + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } virtual void Validate(bool isFinalValidationPass) override; }; +// ----------------------------------------------------------------------- +// GatherPackedNode(packedIndex, sourceData) -- gather operation +// Copies a subset of samples pointed to by packedIndex from sourceData. +// Sequence lengths are equal to those from packedIndex. +// PackedIndex must have been created with the PackedIndex() node, and is +// otherwise opaque to users. +// ----------------------------------------------------------------------- + +template +class GatherPackedNode : public ComputationNodeNonLooping, public NumInputs<2> +{ + typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() { return L"GatherPacked"; } + + // our inputs + static const size_t INDEXDATA = 0; + static const size_t SOURCEDATA = 1; + +public: + DeclareConstructorFromConfigWithNumInputs(GatherPackedNode); + GatherPackedNode(DEVICEID_TYPE deviceId, const wstring& name) : + Base(deviceId, name) + { + } + + virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; + virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t inputIndex) override; + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override { return childIndex == INDEXDATA; } + virtual void Validate(bool isFinalValidationPass) override; +}; + +// ----------------------------------------------------------------------- +// ScatterPackedNode(layoutData, packedIndex, sourceData) -- scatter operation +// Copies sourceData to sample positions pointed to by packedIndex. +// The first arg, 'layoutData', is used only to determine sequence lengths, +// and should be the same data that was fed to Where(). +// PackedIndex must have been created with the PackedIndex() node, and is +// otherwise opaque to users. 
+// ----------------------------------------------------------------------- + +template +class ScatterPackedNode : public ComputationNodeNonLooping, public NumInputs<3> +{ + typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() { return L"ScatterPacked"; } + + // our inputs + static const size_t LAYOUTDATA = 0; + static const size_t INDEXDATA = 1; + static const size_t SOURCEDATA = 2; + +public: + DeclareConstructorFromConfigWithNumInputs(ScatterPackedNode); + ScatterPackedNode(DEVICEID_TYPE deviceId, const wstring& name) : + Base(deviceId, name) + { + } + + virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; + virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t inputIndex) override; + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override { return childIndex == INDEXDATA; } virtual void Validate(bool isFinalValidationPass) override; }; diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index e7b20d426e4d..6b411dc4929a 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -677,6 +677,85 @@ CPUMatrix& CPUMatrix::AssignTransposeOf(const CPUMatrix +static void ScaleAndAddColumn(ElemType beta, ElemType* dst, const ElemType* src, size_t numRows, ElemType alpha) +{ + if (alpha != 1) // rare case: just do the full thing + for (size_t i = 0; i < numRows; i++) + dst[i] = beta * dst[i] + alpha * src[i]; + else if (beta == 1) // used in backprop + for (size_t i = 0; i < numRows; i++) + dst[i] += src[i]; + else if (beta == 0) // plain assignment + memcpy(dst, src, sizeof(ElemType) * numRows); + else // alpha=1, arbitrary beta: also rare case + for (size_t i = 0; i < numRows; i++) + dst[i] = beta * dst[i] + src[i]; +} + +// *this[:,j] = a[:,m[j]] * alpha + *this[:,j] * beta +template +CPUMatrix& CPUMatrix::DoGatherColumnsOf(ElemType beta, const CPUMatrix& m, const CPUMatrix& a, ElemType alpha) +{ + if (m.GetNumRows() != 1) // index is 1-dimensional only + InvalidArgument("DoGatherColumnsOf: Map must be a row vector."); + + if (beta) + VerifySize(a.GetNumRows(), m.GetNumCols()); + else + Resize(a.GetNumRows(), m.GetNumCols()); + + auto& us = *this; +//#pragma omp parallel for // TODO: Depending in circumstance, it may be more efficient to parallelize over rows. 
+ foreach_column(jOut, us) + { + auto jInF = m(0, jOut); // this is the column we need to get + if (jInF < 0) // negative index means gap + continue; + size_t jIn = (size_t)jInF; + if (jIn >= a.GetNumCols()) + InvalidArgument("DoGatherColumnsOf: Map out of bounds."); + ScaleAndAddColumn(beta, &us(0,jOut), &a(0,jIn), us.GetNumRows(), alpha); + } + + return *this; +} + +// *this[:,m[j]] = a[:,j] * alpha + *this[:,m[j]] * beta +template +CPUMatrix& CPUMatrix::DoScatterColumnsOf(ElemType beta, const CPUMatrix& m, const CPUMatrix& a, ElemType alpha) +{ + if (m.GetNumRows() != 1) // index is 1-dimensional only + InvalidArgument("DoScatterColumnsOf: Map must be a row vector."); + if (m.GetNumCols() != a.GetNumCols()) + InvalidArgument("DoScatterColumnsOf: Map must have width of input vector."); + if (a.GetNumRows() != GetNumRows()) + InvalidArgument("DoScatterColumnsOf: Output must have same height as input vector."); + + auto& us = *this; + + // pre-scale with beta upfront + // Scatter may add more than one source column to the same target, so we must pre-scale with beta, and then just keep adding. + Scale(beta, us); // if beta is 0, then this will be a memset() + +#pragma omp parallel for // TODO: Depending in circumstance, it may be more efficient to parallelize over rows. + foreach_column(jIn, a) + { + auto jOutF = m(0, jIn); // this is the column we copy/add into + if (jOutF < 0) // negative index means gap + continue; + size_t jOut = (size_t)jOutF; + if (jOut >= GetNumCols()) + InvalidArgument("DoScatterColumnsOf: Map out of bounds."); + ScaleAndAddColumn(beta, &us(0, jOut), &a(0, jIn), us.GetNumRows(), alpha); + } + + return *this; +} + template void CPUMatrix::SetValue(const ElemType v) { @@ -4629,7 +4708,7 @@ void CPUMatrix::AssignScaledDifference(const CPUMatrix& alph /// Input matrix /// Resulting matrix, user is responsible for allocating this template -void CPUMatrix::Scale(ElemType alpha, const CPUMatrix& a, CPUMatrix& c) +/*static*/ void CPUMatrix::Scale(ElemType alpha, const CPUMatrix& a, CPUMatrix& c) { if (a.IsEmpty()) LogicError("Scale: Input matrix a is empty."); @@ -4640,6 +4719,12 @@ void CPUMatrix::Scale(ElemType alpha, const CPUMatrix& a, CP assert(m > 0 && n > 0); // converting from size_t to int may cause overflow c.Resize(m, n); + if (alpha == 0) + { + memset(c.m_pArray, 0, sizeof(ElemType) * c.GetNumElements()); + return; + } + long size = (long) c.GetNumElements(); #pragma omp parallel for // four-way unrolling for (long i = 0; i < (size & ~3); i += 4) { c.m_pArray[i] = alpha * a.m_pArray[i]; c.m_pArray[i + 1] = alpha * a.m_pArray[i + 1]; c.m_pArray[i + 2] = alpha * a.m_pArray[i + 2]; c.m_pArray[i + 3] = alpha * a.m_pArray[i + 3]; } - // handle remaining stuffs + // remaining elements for (long i = size & ~3; i < size; i++) { c.m_pArray[i] = alpha * a.m_pArray[i]; } @@ -4661,7 +4746,7 @@ void CPUMatrix::Scale(ElemType alpha, const CPUMatrix& a, CP /// Scalar /// Input matrix template -void CPUMatrix::Scale(ElemType alpha, CPUMatrix& a) +/*static*/ void CPUMatrix::Scale(ElemType alpha, CPUMatrix& a) { if (a.IsEmpty()) LogicError("Scale: Input matrix a is empty."); @@ -4673,10 +4758,14 @@ void CPUMatrix::Scale(ElemType alpha, CPUMatrix& a) assert(m > 0 && n > 0 && len > 0); // converting from size_t to int may cause overflow - if (sizeof(ElemType) == sizeof(double)) + if (alpha == 0 && incx == 1) + { + memset(a.m_pArray, 0, sizeof(ElemType) * len); + } + else if (sizeof(ElemType) == sizeof(double)) { #ifdef USE_ACML - dscal(len, alpha, 
reinterpret_cast(a.m_pArray), incx); // TODO: Use overloads. #else cblas_dscal(len, alpha, reinterpret_cast(a.m_pArray), incx); #endif @@ -4696,7 +4785,7 @@ void CPUMatrix::Scale(ElemType alpha, CPUMatrix& a) /// 1x1 matrix /// Input matrix template -void CPUMatrix::Scale(CPUMatrix alpha, CPUMatrix& a) +/*static*/ void CPUMatrix::Scale(CPUMatrix alpha, CPUMatrix& a) { if (a.IsEmpty()) LogicError("Scale: Input matrix a is empty."); diff --git a/Source/Math/CPUMatrix.h b/Source/Math/CPUMatrix.h index 2a8f7dbd3619..3de2abfb1825 100644 --- a/Source/Math/CPUMatrix.h +++ b/Source/Math/CPUMatrix.h @@ -53,6 +53,7 @@ class MATH_API CPUMatrix : public BaseMatrix using B::GetNumRows; using B::GetNumCols; using B::SetOwnBuffer; + using B::VerifySize; size_t BufferSize() const { @@ -120,35 +121,38 @@ class MATH_API CPUMatrix : public BaseMatrix CPUMatrix Transpose(); CPUMatrix& AssignTransposeOf(const CPUMatrix& a); + CPUMatrix& DoGatherColumnsOf (ElemType beta, const CPUMatrix& m, const CPUMatrix& a, ElemType alpha); + CPUMatrix& DoScatterColumnsOf(ElemType beta, const CPUMatrix& m, const CPUMatrix& a, ElemType alpha); + CPUMatrix& operator+=(const ElemType alpha); - CPUMatrix operator+(const ElemType alpha) const; + CPUMatrix operator+(const ElemType alpha) const; CPUMatrix& AssignSumOf(const ElemType alpha, const CPUMatrix& a); CPUMatrix& operator+=(const CPUMatrix& a); - CPUMatrix operator+(const CPUMatrix& a) const; + CPUMatrix operator+(const CPUMatrix& a) const; CPUMatrix& AssignSumOf(const CPUMatrix& a, const CPUMatrix& b); CPUMatrix& operator-=(const ElemType alpha); - CPUMatrix operator-(const ElemType alpha) const; + CPUMatrix operator-(const ElemType alpha) const; CPUMatrix& AssignDifferenceOf(const ElemType alpha, const CPUMatrix& a); CPUMatrix& AssignDifferenceOf(const CPUMatrix& a, const ElemType alpha); CPUMatrix& operator-=(const CPUMatrix& a); - CPUMatrix operator-(const CPUMatrix& a) const; + CPUMatrix operator-(const CPUMatrix& a) const; CPUMatrix& AssignDifferenceOf(const CPUMatrix& a, const CPUMatrix& b); CPUMatrix& operator*=(const ElemType alpha); - CPUMatrix operator*(const ElemType alpha) const; + CPUMatrix operator*(const ElemType alpha) const; CPUMatrix& AssignProductOf(const ElemType alpha, const CPUMatrix& a); - CPUMatrix operator*(const CPUMatrix& a) const; + CPUMatrix operator*(const CPUMatrix& a) const; CPUMatrix& AssignProductOf(const CPUMatrix& a, const bool transposeA, const CPUMatrix& b, const bool transposeB); CPUMatrix& operator/=(ElemType alpha); - CPUMatrix operator/(ElemType alpha) const; + CPUMatrix operator/(ElemType alpha) const; CPUMatrix& operator^=(ElemType alpha); // element-wise power - CPUMatrix operator^(ElemType alpha) const; // element-wise power + CPUMatrix operator^(ElemType alpha) const; // element-wise power CPUMatrix& AssignElementPowerOf(const CPUMatrix& a, const ElemType power); CPUMatrix& ElementMultiplyWith(const CPUMatrix& a); diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index 4c425586bc17..d884aba884b4 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -1069,6 +1069,43 @@ Matrix& Matrix::AssignTransposeOf(const Matrix& a) return *this; } +// *this[:,j] = a[:,m[j]] * alpha + *this[:,j] * beta +// m has width of 'this' and contains values w.r.t. 'a' +// Invalid entries (gap columns) are denoted by m(0,j) == -1. 
+template +Matrix& Matrix::DoGatherColumnsOf(ElemType beta, const Matrix& m, const Matrix& a, ElemType alpha) +{ + DecideAndMoveToRightDevice(*this, m, a); // TODO: only move target if beta != 0 + + DISPATCH_MATRIX_ON_FLAG(&a, + this, + m_CPUMatrix->DoGatherColumnsOf(beta, *m.m_CPUMatrix, *a.m_CPUMatrix, alpha), + NOT_IMPLEMENTED, //m_GPUMatrix->DoGatherColumnsOf(beta, *m.m_GPUMatrix, *a.m_GPUMatrix, alpha), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED); + + return *this; +} + +// *this[:,m[j]] = a[:,j] * alpha + *this[:,m[j]] * beta +// m has width of 'a' and contains values w.r.t. 'this' +// Unlike gather, for scatter, 'this' must have been sized already. +// Invalid entries (gap columns) are denoted by m(0,j) == -1. +template +Matrix& Matrix::DoScatterColumnsOf(ElemType beta, const Matrix& m, const Matrix& a, ElemType alpha) +{ + DecideAndMoveToRightDevice(*this, m, a); // TODO: only move target if beta != 0 + + DISPATCH_MATRIX_ON_FLAG(&a, + this, + m_CPUMatrix->DoScatterColumnsOf(beta, *m.m_CPUMatrix, *a.m_CPUMatrix, alpha), + NOT_IMPLEMENTED, //m_GPUMatrix->DoScatterColumnsOf(beta, *m.m_GPUMatrix, *a.m_GPUMatrix, alpha), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED); + + return *this; +} + // set all elements of a matrix to a scalar value // For sparse matrices, the only allowed value is 0. template @@ -1318,7 +1355,7 @@ void Matrix::SetGaussianRandomValue(const ElemType mean, const ElemTyp InvalidArgument("SetUniformRandomValue: sigma must be a positive value."); if (IsEmpty()) - LogicError("SetUniformRandomValue: Matrix is empty."); + return; DISPATCH_MATRIX_ON_FLAG(this, this, @@ -3424,6 +3461,9 @@ int Matrix::GetDeviceId() const return m_GPUSparseMatrix->GetComputeDeviceId()); } +// TODO: Move the shared core functions to the front of this source file. +// BUGBUG: This performs a copy operation even for the output matrix that gets overwritten right away. +// We should (1) define which is the output and (2) whether it will be completely overwritten (so we won't actually copy it). // bring two matrices onto the same device // If different and preferred devices are the same, move to preferred device. // Otherwise GPU takes precedence over CPU, and if both are GPU move to a's device. 
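For reference, the column semantics stated in the comments above can be reduced to a self-contained sketch on one-row matrices (GatherColumns/ScatterColumns are illustrative stand-ins for DoGatherColumnsOf/DoScatterColumnsOf, not the CNTK implementation; as in the patch, the index row m uses -1 to mark gap columns, and scatter pre-scales the target by beta because several source columns may land on the same target column):

#include <cstdio>
#include <vector>

// us[j] = a[m[j]] * alpha + us[j] * beta, skipping gap entries (m[j] == -1)
static void GatherColumns(float beta, const std::vector<float>& m,
                          const std::vector<float>& a, float alpha,
                          std::vector<float>& us)
{
    if (beta == 0)
        us.assign(m.size(), 0); // beta == 0: (re-)allocate the output, like Resize() in the patch
    for (size_t j = 0; j < m.size(); j++)
        if (m[j] >= 0) // -1 denotes a gap: leave that output column untouched
            us[j] = a[(size_t)m[j]] * alpha + us[j] * beta;
}

// us[m[j]] = a[j] * alpha + us[m[j]] * beta, skipping gap entries (m[j] == -1)
static void ScatterColumns(float beta, const std::vector<float>& m,
                           const std::vector<float>& a, float alpha,
                           std::vector<float>& us)
{
    for (auto& v : us) // pre-scale with beta up front: several j may hit the same target column
        v *= beta;
    for (size_t j = 0; j < m.size(); j++)
        if (m[j] >= 0) // -1 denotes a gap: contributes nothing
            us[(size_t)m[j]] += a[j] * alpha;
}

int main()
{
    std::vector<float> a = { 10, 20, 30, 40 };
    std::vector<float> m = { 2, -1, 0 }; // gather columns 2 and 0 of 'a'; the middle entry is a gap
    std::vector<float> us;
    GatherColumns(/*beta=*/0, m, a, /*alpha=*/1, us);     // us   = { 30, 0, 10 }

    std::vector<float> back(a.size(), 0);
    ScatterColumns(/*beta=*/0, m, us, /*alpha=*/1, back); // back = { 10, 0, 30, 0 }
    printf("%g %g %g %g\n", back[0], back[1], back[2], back[3]);
    return 0;
}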
@@ -4454,13 +4494,21 @@ template void Matrix::Scale(ElemType alpha, const Matrix& a, Matrix& c) { DecideAndMoveToRightDevice(c, a); + c.SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); - DISPATCH_MATRIX_ON_FLAG(&c, - &c, - CPUMatrix::Scale(alpha, *a.m_CPUMatrix, *c.m_CPUMatrix), - GPUMatrix::Scale(alpha, *a.m_GPUMatrix, *c.m_GPUMatrix), - NOT_IMPLEMENTED, * c.m_GPUSparseMatrix = (*a.m_GPUSparseMatrix) * alpha); + if (alpha == 0) + { + c.Resize(a); + c.SetValue(0); // this is a little faster, and also does not propagate NaNs, which we'd expect from 'beta' parameters + return; + } + else + DISPATCH_MATRIX_ON_FLAG(&c, + &c, + CPUMatrix::Scale(alpha, *a.m_CPUMatrix, *c.m_CPUMatrix), + GPUMatrix::Scale(alpha, *a.m_GPUMatrix, *c.m_GPUMatrix), + NOT_IMPLEMENTED, * c.m_GPUSparseMatrix = (*a.m_GPUSparseMatrix) * alpha); } /// Matrix-scalar multiply with col-major matrices: a = alpha * a @@ -4469,15 +4517,17 @@ void Matrix::Scale(ElemType alpha, const Matrix& a, Matrix void Matrix::Scale(ElemType alpha, Matrix& a) { - if (a.IsEmpty()) + if (alpha == 0) + a.SetValue(0); // this is a little faster, and also does not propagate NaNs, which we'd expect from 'beta' parameters + else if (a.IsEmpty()) return; - - DISPATCH_MATRIX_ON_FLAG(&a, - &a, - CPUMatrix::Scale(alpha, *a.m_CPUMatrix), - GPUMatrix::Scale(alpha, *a.m_GPUMatrix), - NOT_IMPLEMENTED, - GPUSparseMatrix::Scale(alpha, *a.m_GPUSparseMatrix)); + else + DISPATCH_MATRIX_ON_FLAG(&a, + &a, + CPUMatrix::Scale(alpha, *a.m_CPUMatrix), + GPUMatrix::Scale(alpha, *a.m_GPUMatrix), + NOT_IMPLEMENTED, + GPUSparseMatrix::Scale(alpha, *a.m_GPUSparseMatrix)); } /// Matrix scalar matrix multiply with col-major matrices: a = alpha[0,0] * a diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index a900d0042dbb..d0540a6f4640 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -176,7 +176,7 @@ class MATH_API Matrix : public MatrixBase ElemType RmsProp(Matrix& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier); void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve = 10000, bool growOnly = true); // by default we only reallocate if need to grow - void Resize(const Matrix& other) + void Resize(const Matrix& other) // TODO: Should this carry over numNZElemToReserve for sparse matrices? { Resize(other.GetNumRows(), other.GetNumCols()); } @@ -255,6 +255,9 @@ class MATH_API Matrix : public MatrixBase Matrix Transpose(); // This method doesn't change state of Matrix. 
It should be a const function Matrix& AssignTransposeOf(const Matrix& a); + Matrix& DoGatherColumnsOf (ElemType beta, const Matrix& m, const Matrix& a, ElemType alpha); + Matrix& DoScatterColumnsOf(ElemType beta, const Matrix& m, const Matrix& a, ElemType alpha); + Matrix& operator+=(const ElemType alpha); Matrix operator+(const ElemType alpha) const; Matrix& AssignSumOf(const ElemType alpha, const Matrix& a); From 0c62fb9f7f0c84aaf5205f9e2ebe72d197487b60 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Sat, 12 Mar 2016 19:04:45 -0800 Subject: [PATCH 12/26] GPU version of Gather() and Scatter() --- Source/Common/Include/Sequences.h | 9 +- .../ComputationNetworkLib/ReshapingNodes.cpp | 1 + Source/Math/CPUMatrix.cpp | 4 +- Source/Math/GPUMatrix.cu | 134 ++++++++++++++++-- Source/Math/GPUMatrix.h | 3 + Source/Math/Matrix.cpp | 36 ++--- Source/Math/Matrix.h | 11 +- 7 files changed, 162 insertions(+), 36 deletions(-) diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index 829c54c173ff..ed63912fe8a9 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -1062,12 +1062,13 @@ static inline void MaskMissingColumnsTo(Matrix& matrixToMask, const MB if (pMBLayout && pMBLayout->HasGaps(fr)) { #if 0 // in the future we can use the tensor lib to implement this - const auto & maskMatrix = pMBLayout->GetColumnsValidMask(); - auto maskSlice = DataWithMBLayoutFor(maskMatrix, fr, pMBLayout); - auto matrixSliceToMask = DataWithMBLayoutFor(matrixToMask, fr, pMBLayout); - TensorView(matrixSliceToMask).DoMaskNegativeOf(0, TensorView(matrixSliceToMask), TensorView(maskSlice), 1); val; + const auto & maskMatrix = pMBLayout->GetColumnsValidMask(); + auto maskSlice = DataWithMBLayoutFor(maskMatrix, fr, pMBLayout); + auto matrixSliceToMask = DataWithMBLayoutFor(matrixToMask, fr, pMBLayout); + TensorView(matrixSliceToMask).DoMaskNegativeOf(0, TensorView(matrixSliceToMask), TensorView(maskSlice), 1); val; #else const auto& maskMatrix = pMBLayout->GetColumnsValidityMask(matrixToMask.GetDeviceId()); + maskMatrix.TransferToDeviceIfNotThere(matrixToMask.GetDeviceId(), /*ismoved=*/ false, /*emptyTransfer=*/ false, /*updatePreferredDevice=*/ false); auto maskSlice = DataWithMBLayoutFor(maskMatrix, fr, pMBLayout); auto matrixSliceToMask = DataWithMBLayoutFor(matrixToMask, fr, pMBLayout); matrixSliceToMask.MaskColumnsValue(maskSlice, val); diff --git a/Source/ComputationNetworkLib/ReshapingNodes.cpp b/Source/ComputationNetworkLib/ReshapingNodes.cpp index eaf2217057a0..9f07164d63b2 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.cpp +++ b/Source/ComputationNetworkLib/ReshapingNodes.cpp @@ -89,6 +89,7 @@ template buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType)indexSequence[t]; } // the result will be kept in CPUDEVICE, since most likely we will access it again in PackedIndexNode + Value().TransferToDeviceIfNotThere(CPUDEVICE, /*isBeingMoved=*/ true, /*emptyTransfer=*/ true, /*updatePreferredDevice=*/ true); Value().SetValue(1, outMBLayout->GetNumCols(), CPUDEVICE, buf.data(), MatrixFormat::matrixFormatColMajor); } diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index 6b411dc4929a..e0fd090ab25a 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -709,7 +709,7 @@ CPUMatrix& CPUMatrix::DoGatherColumnsOf(ElemType beta, const Resize(a.GetNumRows(), m.GetNumCols()); auto& us = *this; -//#pragma omp parallel for // TODO: Depending in circumstance, it may be more efficient to parallelize over rows. 
+#pragma omp parallel for // TODO: Depending in circumstance, it may be more efficient to parallelize over rows. foreach_column(jOut, us) { auto jInF = m(0, jOut); @@ -750,7 +750,7 @@ CPUMatrix& CPUMatrix::DoScatterColumnsOf(ElemType beta, cons size_t jOut = (size_t)jOutF; if (jOut >= GetNumCols()) InvalidArgument("DoScatterColumnsOf: Map out of bounds."); - ScaleAndAddColumn(beta, &us(0, jOut), &a(0, jIn), us.GetNumRows(), alpha); + ScaleAndAddColumn(/*beta=*/(ElemType)1, &us(0, jOut), &a(0, jIn), us.GetNumRows(), alpha); } return *this; diff --git a/Source/Math/GPUMatrix.cu b/Source/Math/GPUMatrix.cu index 8c31ec4c9a50..f0f3fbc4e3e7 100644 --- a/Source/Math/GPUMatrix.cu +++ b/Source/Math/GPUMatrix.cu @@ -918,11 +918,120 @@ GPUMatrix& GPUMatrix::AssignTransposeOf(const GPUMatrix +__global__ void _doGatherColumnsOf(ElemType* us, size_t usStride, const ElemType beta, const ElemType* m, size_t mStride, const ElemType* a, size_t aStride, size_t aCols, const ElemType alpha) +{ + size_t i = threadIdx.x; // index into 'us' and 'a' + size_t jOut = blockIdx.x; // index into 'us' and 'm' + + auto jInF = m[jOut * mStride]; // this is the column we need to get + if (jInF < 0) // negative index means gap + return; + size_t jIn = (size_t)jInF; + if (jIn >= aCols) + return; // actually a failure + + const ElemType& ra = a[i + jIn * aStride]; + ElemType& rus = us[i + jOut * usStride]; + + ElemType res = ra * alpha; + if (beta != 0) + res += rus * beta; + rus = res; +} + +// *this[:,j] = a[:,m[j]] * alpha + *this[:,j] * beta +template +GPUMatrix& GPUMatrix::DoGatherColumnsOf(ElemType beta, const GPUMatrix& m, const GPUMatrix& a, ElemType alpha) +{ + if (m.GetNumRows() != 1) // index is 1-dimensional only + InvalidArgument("DoGatherColumnsOf: Map must be a row vector."); + + if (beta) + VerifySize(a.GetNumRows(), m.GetNumCols()); + else + Resize(a.GetNumRows(), m.GetNumCols()); + + if (m.GetComputeDeviceId() != a.GetComputeDeviceId() || GetComputeDeviceId() != a.GetComputeDeviceId()) + InvalidArgument("All matrices must be on the same GPU"); + a.PrepareDevice(); + + SyncGuard syncGuard; + _doGatherColumnsOf << > >(m_pArray, GetNumRows(), beta, m.m_pArray, 1, a.m_pArray, a.GetNumRows(), a.GetNumCols(), alpha); + + return *this; +} + +template +__global__ void _doScatterColumnsOf(ElemType* us, size_t usStride, size_t usCols, const ElemType* m, size_t mStride, const ElemType* a, size_t aStride, const ElemType alpha) +{ + size_t i = threadIdx.x; // index into 'a' and 'us' + size_t jIn = blockIdx.x; // index into 'a' and 'm' + + auto jOutF = m[jIn * mStride]; // this is the column we copy/add into + if (jOutF < 0) // negative index means gap + return; + size_t jOut = (size_t)jOutF; + if (jOut >= usCols) + return; // actually a failure + + const ElemType& ra = a[i + jIn * aStride]; + ElemType& rus = us[i + jOut * usStride]; + + ElemType res = ra * alpha; +#if 0 // this is not the reason. Some stupid bad index. 
+ rus += res; +#else + atomicAdd(&rus, res); +#endif + // Note: atomicAdd() is supposed to be fast in case of no conflict (the simple case of Scatter()) +} + +// little helper for debugging +template +static void Peek(const GPUMatrix& m, const char* which) +{ + size_t rows = m.GetNumRows(); + size_t cols = m.GetNumCols(); + ElemType buf[100] = { 0 }; + size_t n = min(rows * cols, _countof(buf)); + cudaMemcpy(buf, m.BufferPointer(), sizeof(ElemType) * n, cudaMemcpyDeviceToHost); + UNUSED(which); UNUSED(rows); UNUSED(cols); sin(1.0f); // set breakpoint here +} + +// *this[:,m[j]] = a[:,j] * alpha + *this[:,m[j]] * beta +template +GPUMatrix& GPUMatrix::DoScatterColumnsOf(ElemType beta, const GPUMatrix& m, const GPUMatrix& a, ElemType alpha) +{ + if (m.GetNumRows() != 1) // index is 1-dimensional only + InvalidArgument("DoScatterColumnsOf: Map must be a row vector."); + if (m.GetNumCols() != a.GetNumCols()) + InvalidArgument("DoScatterColumnsOf: Map must have width of input vector."); + if (a.GetNumRows() != GetNumRows()) + InvalidArgument("DoScatterColumnsOf: Output must have same height as input vector."); + + if (m.GetComputeDeviceId() != a.GetComputeDeviceId() || GetComputeDeviceId() != a.GetComputeDeviceId()) + InvalidArgument("All matrices must be on the same GPU"); + a.PrepareDevice(); + + auto& us = *this; + //Peek(us, "us"); Peek(m, "m"); Peek(a, "a"); + + // pre-scale with beta upfront + // Scatter may add more than one source column to the same target, so we must pre-scale with beta, and then just keep adding. + Scale(beta, us); // if beta is 0, then this will be a memset() + + SyncGuard syncGuard; + _doScatterColumnsOf << > >(m_pArray, GetNumRows(), GetNumCols(), m.m_pArray, 1, a.m_pArray, a.GetNumRows(), alpha); + + return *this; +} + template void GPUMatrix::SetValue(const ElemType v) { if (IsEmpty()) - LogicError("SetValue: Matrix is empty."); + return; CUDA_LONG N = (CUDA_LONG) GetNumElements(); @@ -2979,7 +3088,7 @@ void GPUMatrix::Multiply(const GPUMatrix& a, const GPUMatrix /// Input matrix /// Resulting matrix, user is responsible for allocating this template -void GPUMatrix::ScaleAndAdd(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) +/*static*/ void GPUMatrix::ScaleAndAdd(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) { if (a.GetComputeDeviceId() != c.GetComputeDeviceId()) { @@ -2987,6 +3096,8 @@ void GPUMatrix::ScaleAndAdd(ElemType alpha, const GPUMatrix& } else { + if (a.IsEmpty() && c.IsEmpty()) + return; a.PrepareDevice(); if (a.IsEmpty() || c.IsEmpty()) LogicError("ScaleAndAdd: one of the input matrices is empty."); @@ -3088,7 +3199,7 @@ void GPUMatrix::ScaleAndAdd(ElemType alpha, const GPUMatrix& /// Input matrix /// Resulting matrix, user is responsible for allocating this template -void GPUMatrix::ScaleAndAdd(ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) +/*static*/ void GPUMatrix::ScaleAndAdd(ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) { if (a.GetComputeDeviceId() != c.GetComputeDeviceId() || a.GetComputeDeviceId() != b.GetComputeDeviceId()) { @@ -3096,6 +3207,8 @@ void GPUMatrix::ScaleAndAdd(ElemType alpha, const GPUMatrix& } else { + if (a.IsEmpty() && b.IsEmpty()) + return; a.PrepareDevice(); if (a.IsEmpty() || b.IsEmpty()) LogicError("ScaleAndAdd: one of the input matrices is empty."); @@ -3321,8 +3434,14 @@ void GPUMatrix::AddElementToElement(const GPUMatrix& a, cons } template -void GPUMatrix::Scale(ElemType alpha, GPUMatrix& a) +/*static*/ void GPUMatrix::Scale(ElemType alpha, GPUMatrix& a) { + if 
(alpha == 0) // if 0 then do not access the value, so that we can use this to multiply uninitialized matrices with beta=0 + { + CUDA_CALL(cudaMemset(a.m_pArray, 0, a.m_numRows * a.m_numCols * sizeof(ElemType))); + return; + } + cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); if (sizeof(ElemType) == sizeof(float)) { @@ -3341,7 +3460,7 @@ void GPUMatrix::Scale(ElemType alpha, GPUMatrix& a) } template -void GPUMatrix::Scale(GPUMatrix& alpha, GPUMatrix& a) +/*static*/ void GPUMatrix::Scale(GPUMatrix& alpha, GPUMatrix& a) { if (alpha.GetNumElements() != 1) { @@ -3366,11 +3485,8 @@ void GPUMatrix::Scale(GPUMatrix& alpha, GPUMatrix& } template // c = alpha * a -void GPUMatrix::Scale(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) +/*static*/ void GPUMatrix::Scale(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) { - if (a.IsEmpty()) - LogicError("Scale: Input matrix a is empty."); - c = a; Scale(alpha, c); } diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h index 822db1be5f5e..39b9d74ca0ef 100644 --- a/Source/Math/GPUMatrix.h +++ b/Source/Math/GPUMatrix.h @@ -210,6 +210,9 @@ class MATH_API GPUMatrix : public BaseMatrix GPUMatrix Transpose() const; GPUMatrix& AssignTransposeOf(const GPUMatrix& a); + GPUMatrix& DoGatherColumnsOf (ElemType beta, const GPUMatrix& m, const GPUMatrix& a, ElemType alpha); + GPUMatrix& DoScatterColumnsOf(ElemType beta, const GPUMatrix& m, const GPUMatrix& a, ElemType alpha); + GPUMatrix& operator+=(const ElemType alpha); GPUMatrix operator+(const ElemType alpha) const; GPUMatrix& AssignSumOf(const ElemType alpha, const GPUMatrix& a); diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index d884aba884b4..4d99d26b81bf 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -1080,7 +1080,7 @@ Matrix& Matrix::DoGatherColumnsOf(ElemType beta, const Matri DISPATCH_MATRIX_ON_FLAG(&a, this, m_CPUMatrix->DoGatherColumnsOf(beta, *m.m_CPUMatrix, *a.m_CPUMatrix, alpha), - NOT_IMPLEMENTED, //m_GPUMatrix->DoGatherColumnsOf(beta, *m.m_GPUMatrix, *a.m_GPUMatrix, alpha), + m_GPUMatrix->DoGatherColumnsOf(beta, *m.m_GPUMatrix, *a.m_GPUMatrix, alpha), NOT_IMPLEMENTED, NOT_IMPLEMENTED); @@ -1099,7 +1099,7 @@ Matrix& Matrix::DoScatterColumnsOf(ElemType beta, const Matr DISPATCH_MATRIX_ON_FLAG(&a, this, m_CPUMatrix->DoScatterColumnsOf(beta, *m.m_CPUMatrix, *a.m_CPUMatrix, alpha), - NOT_IMPLEMENTED, //m_GPUMatrix->DoScatterColumnsOf(beta, *m.m_GPUMatrix, *a.m_GPUMatrix, alpha), + m_GPUMatrix->DoScatterColumnsOf(beta, *m.m_GPUMatrix, *a.m_GPUMatrix, alpha), NOT_IMPLEMENTED, NOT_IMPLEMENTED); @@ -1167,10 +1167,12 @@ template void Matrix::MaskColumnsValue(const Matrix& columnsMask, ElemType val) { if (GetNumCols() != columnsMask.GetNumCols()) - RuntimeError("Matrix and column mask must have equal number of columns"); + RuntimeError("MaskColumnsValue: Matrix and column mask must have equal number of columns."); - if (GetDeviceId() != columnsMask.GetDeviceId()) - RuntimeError("Matrix and column mask must be on the same device"); + if (GetCurrentMatrixLocation() == CPU && (columnsMask.GetCurrentMatrixLocation() == CPU || columnsMask.GetCurrentMatrixLocation() == BOTH)) + ; // OK + else if (GetDeviceId() != columnsMask.GetDeviceId() && columnsMask.GetCurrentMatrixLocation() != BOTH) + RuntimeError("MaskColumnsValue: Matrix and column mask must be on the same device."); DISPATCH_MATRIX_ON_FLAG(this, this, @@ -3470,7 +3472,8 @@ int Matrix::GetDeviceId() const // The inputs are only distinguished in that a's GPU takes precedence over b's in case 
they differ. // TODO: This is called somewhat inconsistently, sometimes with a=*this, sometimes with b=*this. template -void Matrix::DecideAndMoveToRightDevice(const Matrix& a, const Matrix& b) +template +void Matrix::DecideAndMoveToRightDevice(const Matrix& a, const Matrix& b) { int deviceIdA = a.GetDeviceId(), deviceIdB = b.GetDeviceId(); if (deviceIdA == deviceIdB) @@ -3541,21 +3544,21 @@ void Matrix::DecideAndMoveToRightDevice(const Matrix& a, con } template -void Matrix::_transferToDevice(int to_id, bool ismoved /*= true*/, bool emptyTransfer /* = false*/) const +void Matrix::_transferToDevice(int to_id, bool isBeingMoved /*= true*/, bool emptyTransfer /* = false*/) const { int from_id = GetDeviceId(); if (to_id == from_id) // nothing to do return; if (OwnBuffer()) - _transferFromDeviceToDevice(from_id, to_id, ismoved, emptyTransfer); + _transferFromDeviceToDevice(from_id, to_id, isBeingMoved, emptyTransfer); else RuntimeError("Cannot move externally owned matrices to the preferred device."); } // this function performs data transfer and updates data location, but not the device that is stored with it template -void Matrix::_transferFromDeviceToDevice(int from_id, int to_id, bool ismoved /*= true*/, bool emptyTransfer /* = false*/) const +void Matrix::_transferFromDeviceToDevice(int from_id, int to_id, bool isBeingMoved /*= true*/, bool emptyTransfer /* = false*/) const { if (from_id < 0) from_id = CPUDEVICE; @@ -3606,7 +3609,7 @@ void Matrix::_transferFromDeviceToDevice(int from_id, int to_id, bool m_GPUSparseMatrix->SetValue(*m_CPUSparseMatrix); } - if (ismoved) + if (isBeingMoved) { delete m_CPUSparseMatrix; m_CPUSparseMatrix = NULL; @@ -3632,7 +3635,7 @@ void Matrix::_transferFromDeviceToDevice(int from_id, int to_id, bool m_GPUSparseMatrix->CopyToCPUSparseMatrix(*m_CPUSparseMatrix); } - if (ismoved) + if (isBeingMoved) { delete m_GPUSparseMatrix; m_GPUSparseMatrix = NULL; @@ -3666,7 +3669,7 @@ void Matrix::_transferFromDeviceToDevice(int from_id, int to_id, bool { m_GPUMatrix = new GPUMatrix(to_id); } - if (ismoved) + if (isBeingMoved) { delete m_CPUMatrix; m_CPUMatrix = NULL; @@ -3698,7 +3701,7 @@ void Matrix::_transferFromDeviceToDevice(int from_id, int to_id, bool m_CPUMatrix = new CPUMatrix(); } - if (ismoved) + if (isBeingMoved) { delete m_GPUMatrix; m_GPUMatrix = NULL; @@ -3718,9 +3721,9 @@ void Matrix::_transferFromDeviceToDevice(int from_id, int to_id, bool } template -void Matrix::TransferFromDeviceToDevice(int from_id, int to_id, bool ismoved, bool emptyTransfer/* = false*/, bool updatePreferredDevice/* = true*/) const +void Matrix::TransferFromDeviceToDevice(int from_id, int to_id, bool isBeingMoved, bool emptyTransfer/* = false*/, bool updatePreferredDevice/* = true*/) const { - _transferFromDeviceToDevice(from_id, to_id, ismoved, emptyTransfer); + _transferFromDeviceToDevice(from_id, to_id, isBeingMoved, emptyTransfer); if (updatePreferredDevice) m_preferredDeviceId = GetDeviceId(); } @@ -5126,7 +5129,8 @@ template char* Matrix::BufferPointer() const; template int Matrix::GetDeviceId() const; template size_t Matrix::GetNumElements() const; template Matrix Matrix::ColumnSlice(size_t startColumn, size_t numCols) const; -template void Matrix::_transferToDevice(int id_to, bool ismoved, bool emptyTransfer) const; +template void Matrix::_transferToDevice(int id_to, bool isBeingMoved, bool emptyTransfer) const; +template void Matrix::TransferToDeviceIfNotThere(int id_to, bool isBeingMoved, bool emptyTransfer, bool updatePreferredDevice) const; template size_t 
Matrix::GetNumRows() const; template size_t Matrix::GetNumCols() const; template void Matrix::SetValue(const char); diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index d0540a6f4640..687b74b272a2 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -74,10 +74,11 @@ class MATH_API Matrix : public MatrixBase mutable int m_devicesTransferedTo[2]; // TODO: what is this for? Seems only diagnostics // Moves matrix from device id_from to device with id_to. This method doesn't change preferred device Id - void _transferFromDeviceToDevice(int id_from, int id_to, bool ismoved = true, bool emptyTransfer = false) const; + void _transferFromDeviceToDevice(int id_from, int id_to, bool isBeingMoved = true, bool emptyTransfer = false) const; // Moves matrix from current device to device with id_to. This method doesn't change preferred device Id - void _transferToDevice(int id_to, bool ismoved = true, bool emptyTransfer = false) const; - static void DecideAndMoveToRightDevice(const Matrix& a, const Matrix& b); + void _transferToDevice(int id_to, bool isBeingMoved = true, bool emptyTransfer = false) const; + template + static void DecideAndMoveToRightDevice(const Matrix& a, const Matrix& b); static void DecideAndMoveToRightDevice(const Matrix& a, const Matrix& b, const Matrix& c); static void DecideAndMoveToRightDevice(const Matrix& a, const Matrix& b, const Matrix& c, const Matrix& d); static void CopyElementsFromDenseToSparse(CPUMatrix& from, CPUSparseMatrix& dest); @@ -132,9 +133,9 @@ class MATH_API Matrix : public MatrixBase void SetPreferredDeviceId(DEVICEID_TYPE preferredDeviceId) { m_preferredDeviceId = preferredDeviceId; } // Moves matrix from device id_from to device with id_to. // If emptyTransfer=true, then no data is ever moved, just corresponding GPU/CPU matrices are deleted and then created using empty constructor - void TransferFromDeviceToDevice(int id_from, int id_to, bool ismoved = false, /*if false then keep source and set location to BOTH*/ bool emptyTransfer = false, bool updatePreferredDevice = true) const; + void TransferFromDeviceToDevice(int id_from, int id_to, bool isBeingMoved = false, /*if false then keep source and set location to BOTH*/ bool emptyTransfer = false, bool updatePreferredDevice = true) const; // Same as TransferFromDeviceToDevice() but moves only if it is currently not on the target device - void TransferToDeviceIfNotThere(int id_to, bool ismoved = false, bool emptyTransfer = false, bool updatePreferredDevice = true) const; + void TransferToDeviceIfNotThere(int id_to, bool isBeingMoved = false, bool emptyTransfer = false, bool updatePreferredDevice = true) const; CurrentDataLocation GetCurrentMatrixLocation() const { return m_currentDataLocation; }; void SwitchToMatrixType(MatrixType newMatrixType, MatrixFormat newMatrixFormat, bool keepValues); // sets matrix type between dense and sparse size_t GetNumRows() const; From 7b539bdbefad32d7c86c10edbc5a74e389af468d Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Sun, 13 Mar 2016 09:56:07 -0700 Subject: [PATCH 13/26] (fixed NoGPU.cpp for last commit) --- Source/Math/NoGPU.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Source/Math/NoGPU.cpp b/Source/Math/NoGPU.cpp index 47a6c2392cf2..13993c4b6e37 100644 --- a/Source/Math/NoGPU.cpp +++ b/Source/Math/NoGPU.cpp @@ -921,6 +921,18 @@ GPUMatrix& GPUMatrix::AssignTransposeOf(const GPUMatrix +GPUMatrix& GPUMatrix::DoGatherColumnsOf(ElemType beta, const GPUMatrix& m, const GPUMatrix& a, ElemType alpha) +{ + return *this; +} + +template 
+GPUMatrix& GPUMatrix::DoScatterColumnsOf(ElemType beta, const GPUMatrix& m, const GPUMatrix& a, ElemType alpha) +{ + return *this; +} + template void GPUMatrix::SetValue(const ElemType v) { From 31f81c5a1be5c4e4002ac2d46ad11689e925071b Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Sun, 13 Mar 2016 11:40:39 -0700 Subject: [PATCH 14/26] minor refactoring of m_nameToNodeMap; new class ComputationEnvironment through which the network can share information with all nodes --- .../ComputationEnvironment.h | 32 ++++++ .../ComputationNetwork.cpp | 8 +- .../ComputationNetwork.h | 57 +++++++--- .../ComputationNetworkEditing.cpp | 102 ++++++++++-------- .../ComputationNetworkLib.vcxproj | 1 + .../ComputationNetworkLib.vcxproj.filters | 6 ++ .../ComputationNetworkLib/ComputationNode.h | 16 ++- Source/Math/GPUMatrix.h | 1 + 8 files changed, 158 insertions(+), 65 deletions(-) create mode 100644 Source/ComputationNetworkLib/ComputationEnvironment.h diff --git a/Source/ComputationNetworkLib/ComputationEnvironment.h b/Source/ComputationNetworkLib/ComputationEnvironment.h new file mode 100644 index 000000000000..d860208d3431 --- /dev/null +++ b/Source/ComputationNetworkLib/ComputationEnvironment.h @@ -0,0 +1,32 @@ +// +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. +// + +#pragma once + +#include "Basics.h" + +#include + +namespace Microsoft { namespace MSR { namespace CNTK { + +// =========================================================================== +// ComputationEnvironment -- computation graph and operations +// =========================================================================== + +enum class NetworkOperationMode +{ + unspecified, + training, + inferring, + precomputing +}; + +struct ComputationEnvironment +{ + NetworkOperationMode networkOperationMode = NetworkOperationMode::unspecified; +}; +typedef shared_ptr ComputationEnvironmentPtr; + +}}} diff --git a/Source/ComputationNetworkLib/ComputationNetwork.cpp b/Source/ComputationNetworkLib/ComputationNetwork.cpp index c0f8d5e3464c..6898a406cdc4 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.cpp +++ b/Source/ComputationNetworkLib/ComputationNetwork.cpp @@ -64,7 +64,11 @@ void ComputationNetwork::ClearNetwork() // Once we allow that (BrainScript editing), we need proper cycle detectors. Luckily, we know our cycles, so it won't be too hard. // Or just use weak ptrs. for (auto& iter : m_nameToNodeMap) - iter.second->DetachInputs(); + { + auto& node = iter.second; + node->SetEnvironment(nullptr); + node->DetachInputs(); + } m_nameToNodeMap.clear(); @@ -1022,7 +1026,7 @@ void ComputationNetwork::PerformSVDecomposition(const map& SVDCo redVT.ColumnElementMultiplyWith(redS); // Step 2. create two new Parameter nodes and one Times node - wstring leftChildName = name + L"-U"; + wstring leftChildName = name + L"-U"; // BUGBUG: With BrainScript, node names must be proper identifiers/variable expressions. We can't have '-' in node names.
wstring rightChildName = name + L"-V"; shared_ptr> pLeft = AddNodeToNetWithElemType(New>(m_deviceId, leftChildName, m, r)); shared_ptr> pRight = AddNodeToNetWithElemType(New>(m_deviceId, rightChildName, r, n)); diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 074061ad688c..87c5b7061f89 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -12,6 +12,7 @@ #include "ComputationNode.h" #include "ScriptableObjects.h" +#include "ComputationEnvironment.h" #include #include @@ -43,10 +44,11 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // construction // ----------------------------------------------------------------------- - ComputationNetwork() - : m_randomSeedOffset(0), - m_isCompiled(false), - m_pMBLayout(make_shared()) + ComputationNetwork() : + m_randomSeedOffset(0), + m_isCompiled(false), + m_pMBLayout(make_shared()), + m_environment(make_shared()) { } ComputationNetwork(DEVICEID_TYPE deviceId) @@ -283,6 +285,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // this counts the actual number of frames in a minibatch (not counting gaps in parallel sequences) // TODO: Instead of passing numAllSamples in here, we should determine it from the inputs in case of no layout. Or simply forbid this case. + // BUGBUG: With variable-length sequences, this can no longer be a network method. size_t GetNumSamplesWithLabel(const size_t numAllSamples) const { if (m_pMBLayout) @@ -329,7 +332,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb void ReplaceLeafNode(wstring oldNodeName, ComputationNodeBasePtr newNode); void ReplaceFinalCriterionNode(wstring oldNodeName, ComputationNodeBasePtr newNode); void AddFeatureNode(ComputationNodeBasePtr featureNode); - void RemoveFeatureNode(ComputationNodeBasePtr featureNode); + ComputationNodeBasePtr RemoveFeatureNode(ComputationNodeBasePtr featureNode); void SetLearnableNodesBelowLearningRateMultiplier(const float learningRateMultiplier, const ComputationNodeBasePtr& rootNode = nullptr); void SetBatchNormalizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode = nullptr); @@ -566,15 +569,14 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // TODO: move these close to where they are used // add a node to m_nameToNodeMap[], which is our node holder + // This only adds the node to the network's node set, without considering linkage. // Duplicate node names are rejected. ComputationNodeBasePtr AddNodeToNet(const ComputationNodeBasePtr& nodePtr) { - // found - // TODO: use .insert() and test result.second == false means not inserted since already exists - if (m_nameToNodeMap.find(nodePtr->NodeName()) != m_nameToNodeMap.end()) - RuntimeError("Duplicated computation node name."); - - m_nameToNodeMap[nodePtr->NodeName()] = nodePtr; + auto result = m_nameToNodeMap.insert(make_pair(nodePtr->NodeName(), nodePtr)); + if (!result.second) + RuntimeError("AddNodeToNet: Duplicated computation node name."); + nodePtr->SetEnvironment(m_environment); return nodePtr; // allows e.g. return AddNodeToNet(New...); } // TODO: not very nice--need to fix way more outside to get this right @@ -592,6 +594,27 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // return nodePtr; // allows e.g. 
return AddNodeToNetAndAttachInputs(New..., inputs); } + // add a node to the network unless it's already there + ComputationNodeBasePtr AddNodeToNetIfNotYet(const ComputationNodeBasePtr& nodePtr) + { + auto result = m_nameToNodeMap.insert(make_pair(nodePtr->NodeName(), nodePtr)); + if (!result.second && result.first->second != nodePtr) // if there's already one under this name, it better be nodePtr + RuntimeError("AddNodeToNetIfNotYet: Duplicated computation node name."); + nodePtr->SetEnvironment(m_environment); // (note: redundant if already part of the network) + return nodePtr; // allows e.g. return AddNodeToNet(New...); + } + + // remove a node from the network's node set + // This does NOT update any links referencing it, or node groups. + // TODO: We should verify that indeed this node is not referenced by other nodes or node groups, + // and that this node does not reference any node inside the network. + ComputationNodeBasePtr RemoveNodeFromNet(const ComputationNodeBasePtr& node) + { + node->SetEnvironment(nullptr); + m_nameToNodeMap.erase(node->NodeName()); + return node; + } + public: // ----------------------------------------------------------------------- // evaluation // ----------------------------------------------------------------------- @@ -847,7 +870,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb m_randomSeedOffset = value; } -protected: +private://protected: DEVICEID_TYPE m_deviceId; // TODO: is this shared by all nodes? unsigned long m_randomSeedOffset; @@ -869,9 +892,12 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // used for sentence boundary information passed from reader to reset RNN state // specify how the minibatch is packed for each sample - // TODO: This will change once we allow for multiple inconsistent layouts. + // BUGBUG: With variable-length inconsistent layouts, this can no longer be a network property. MBLayoutPtr m_pMBLayout; // note that this must be installed before doing anything that needs it (default leaves a nullptr) + // environment information that nodes may want to inquire about, e.g. to know whether we are training + ComputationEnvironmentPtr m_environment; + private: // ----------------------------------------------------------------------- // the following members are all result of post-processing by CompileNetwork() @@ -908,8 +934,5 @@ template class Matrix; // TODOs: // - automatic inference of time window w.r.t. delay nodes (and related nodes such as a temporal pooling) // - have overrides of RuntimeError etc.
in ComputationNode, which prepend the error string with the node name and operation -// - code prettification: -// - sort all node implementations' methods into the same order; esp, ForwardProp() comes before partial -// - sort important nodes first; move unused/experimental nodes into source files named accordingly -} } } +}}} diff --git a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp index 5707878e4dec..e022291bf408 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp @@ -103,30 +103,26 @@ void ComputationNetwork::CopyInputs(const std::wstring fromName, std::wstring to // RenameNode - Rename a node to another name // nodeNameOrig - original node name // nodeNameNew - new node name -void ComputationNetwork::RenameNode(const std::wstring& nodeNameOrig, const std::wstring& nodeNameNew) +void ComputationNetwork::RenameNode(const std::wstring& nodeNameOrig, const std::wstring& newNodeName) { - InvalidateCompiledNetwork(); - - ComputationNodeBasePtr nodeToRename = GetNodeFromName(nodeNameOrig); + RenameNode(GetNodeFromName(nodeNameOrig), newNodeName); } - auto iter = m_nameToNodeMap.find(nodeNameNew); +void ComputationNetwork::RenameNode(ComputationNodeBasePtr node, const std::wstring& newNodeName) +{ + // make sure the new name is not already used + auto iter = m_nameToNodeMap.find(newNodeName); if (iter != m_nameToNodeMap.end()) // found RuntimeError("RenameNode: Target name already exists."); - // rename the node and update the mapping table - nodeToRename->SetNodeName(nodeNameNew); - m_nameToNodeMap.erase(nodeNameOrig); - m_nameToNodeMap[nodeNameNew] = nodeToRename; -} + InvalidateCompiledNetwork(); -void ComputationNetwork::RenameNode(ComputationNodeBasePtr node, const std::wstring& newNodeName) -{ - // TODO: check if new name exists - m_nameToNodeMap.erase(node->NodeName()); - node->SetNodeName(newNodeName); - AddNodeToNet(node); + RemoveNodeFromNet(node); // take it out temporarily + node->SetNodeName(newNodeName); // change the name + AddNodeToNet(node); // and put it back } +// deletes a node from the network including setting all input links to it to null, and removing it from the node groups void ComputationNetwork::DeleteNode(const std::wstring& nodeName) { InvalidateCompiledNetwork(); @@ -165,20 +161,24 @@ void ComputationNetwork::DeleteNode(const std::wstring& nodeName) // Note: the necessary update of m_allSEQNodes is handled by the InvalidateCompiledNetwork() call above // delete the node itself - m_nameToNodeMap.erase(nodeName); // this will deref the node and possibly deallocate it + RemoveNodeFromNet(nodeToDelete); } -// change the node associated with nodeName to newNode; used in the KL-reg based adaptation to reduce feature copy -// need to update all the mappings as well childrens +// replace a named node by newNode of the same type under the same name, including moving over all network links +// This is used in the KL-reg based adaptation to reduce feature copy +// need to update all the mappings as well as the children. void ComputationNetwork::ChangeNode(wstring nodeName, ComputationNodeBasePtr newNode) { - InvalidateCompiledNetwork(); - ComputationNodeBasePtr oldNode = GetNodeFromName(nodeName); + + if (newNode->NodeName() != nodeName) // TODO: This was not tested for earlier; I hope no code depends on this.
+ InvalidArgument("ChangeNode: newNode must have the same name as the old node."); if (oldNode->OperationName() != newNode->OperationName()) - InvalidArgument("newNode must have the same type as the old node."); + InvalidArgument("ChangeNode: newNode must have the same type as the old node."); - // change children + InvalidateCompiledNetwork(); + + // change all nodes to have old node as input to point to the new node instead for (auto nodeIter = m_nameToNodeMap.begin(); nodeIter != m_nameToNodeMap.end(); nodeIter++) { ComputationNodeBasePtr node = nodeIter->second; @@ -187,12 +187,18 @@ void ComputationNetwork::ChangeNode(wstring nodeName, ComputationNodeBasePtr new node->SetInput(i, newNode); } - // change name map - m_nameToNodeMap[nodeName] = newNode; + // change all inputs of this new node to share the old one's inputs for (int i = 0; i < oldNode->GetNumInputs(); i++) - newNode->SetInput(i, oldNode->GetInputs()[i]); + { + newNode->SetInput(i, oldNode->GetInputs()[i]); // TODO: use AttachInput()? + //oldNode->SetInput(i, nullptr); // BUGBUG: old node should no longer point into the network + } + + // replace the node in the network + RemoveNodeFromNet(oldNode); + AddNodeToNet(newNode); - // change other maps + // also update node groups for (auto groupIter : GetAllNodeGroups()) { auto& group = *groupIter; @@ -204,13 +210,17 @@ void ComputationNetwork::ChangeNode(wstring nodeName, ComputationNodeBasePtr new // replace the old node with the current node, assuming the old node is a leaf node // need to update those nodes who use oldNode as their child +// TODO: Can this be called with a node that's already part of the network? This is currently allowed, but should it? +// BUGBUG: Seems ChangeNode() also updates node groups. Why doesn't this function? +// BUGBUG: What if newNode is the one referenced by oldNodeName? +// BUGBUG: Or what if an unrelated node of the same name exists? void ComputationNetwork::ReplaceLeafNode(wstring oldNodeName, ComputationNodeBasePtr newNode) { InvalidateCompiledNetwork(); ComputationNodeBasePtr oldNode = GetNodeFromName(oldNodeName); - // change the input of those nodes whose child is oldNode + // relink the input of those nodes whose child is oldNode to point to the new one instead for (auto nodeIter = m_nameToNodeMap.begin(); nodeIter != m_nameToNodeMap.end(); nodeIter++) { ComputationNodeBasePtr node = nodeIter->second; @@ -218,18 +228,19 @@ void ComputationNetwork::ReplaceLeafNode(wstring oldNodeName, ComputationNodeBas if (node->GetInputs()[i] == oldNode) node->SetInput(i, newNode); } - m_nameToNodeMap[newNode->GetName()] = newNode; - // now the old node becomes a orphan node , remove it - DeleteNode(oldNodeName); - // RemoveOrphanNode(oldNode); + // add the new, remove the old + AddNodeToNetIfNotYet(newNode); + DeleteNode(oldNodeName); // TODO: can this just be RemoveNodeFromNet()? } +// add a new criterion node and at the same time orphan the previous one (it won't be removed) +// BUGBUG: Can this operate on both new and existing nodes? void ComputationNetwork::ReplaceFinalCriterionNode(wstring oldNodeName, ComputationNodeBasePtr newNode) { InvalidateCompiledNetwork(); - // Checks if the node is a criterion node. 
+ // checks if the node is a criterion node int index = -1; for (int i = 0; i < m_finalCriteria.size(); ++i) { @@ -242,32 +253,32 @@ void ComputationNetwork::ReplaceFinalCriterionNode(wstring oldNodeName, Computat if (index == -1) RuntimeError("ReplaceFinalCriterionNode: the node to be replaced is not a criterion node."); - // Replaces children. + // replace children for (int i = 0; i < newNode->GetNumInputs(); ++i) { if (m_nameToNodeMap.find(newNode->GetInputs()[i]->NodeName()) == m_nameToNodeMap.end()) RuntimeError("Child node does not exist."); newNode->SetInput(i, m_nameToNodeMap[newNode->GetInputs()[i]->NodeName()]); + // TODO: Remove the strange indirection through nameToNodeMap, just use the ptr directly? } - // Addes it to criterion node list. + // add it to the network + AddNodeToNetIfNotYet(newNode); + + // add it to criterion node list m_finalCriteria[index] = newNode; - m_nameToNodeMap[newNode->NodeName()] = newNode; } void ComputationNetwork::AddFeatureNode(ComputationNodeBasePtr featureNode) { InvalidateCompiledNetwork(); - wstring nodeName = featureNode->NodeName(); - if (NodeNameExists(nodeName)) - RuntimeError("AddFeatureNode: feature node already exists."); - m_nameToNodeMap[nodeName] = featureNode; + AddNodeToNet(featureNode); m_features.push_back(featureNode); } -// We only remove the node, not delete it. -void ComputationNetwork::RemoveFeatureNode(ComputationNodeBasePtr featureNode) +// We only remove the node from the net, not destroy it. +ComputationNodeBasePtr ComputationNetwork::RemoveFeatureNode(ComputationNodeBasePtr featureNode) { InvalidateCompiledNetwork(); @@ -275,7 +286,7 @@ void ComputationNetwork::RemoveFeatureNode(ComputationNodeBasePtr featureNode) if (!NodeNameExists(nodeName)) RuntimeError("RemoveFeatureNode: feature node does not exist."); - // Removes links.
+ // removes links for (auto nodeIter = m_nameToNodeMap.begin(); nodeIter != m_nameToNodeMap.end(); ++nodeIter) { ComputationNodeBasePtr node = nodeIter->second; @@ -295,7 +306,7 @@ void ComputationNetwork::RemoveFeatureNode(ComputationNodeBasePtr featureNode) if (search != m_features.end()) m_features.erase(search); - m_nameToNodeMap.erase(nodeName); + return RemoveNodeFromNet(featureNode); } // sets m_learningRateMultiplier in all LearnableParameters feeding into the passed rootNode @@ -360,4 +371,5 @@ void ComputationNetwork::SetBatchNormalizationNodesBelowEvalMode(const bool eval } } } -} } } + +}}} diff --git a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj index 2c08f30899a9..82e7df227d79 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj +++ b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj @@ -137,6 +137,7 @@ + diff --git a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters index ac39ae84bf79..214a243a350a 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters +++ b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters @@ -126,6 +126,9 @@ Nodes + + Environment + @@ -149,5 +152,8 @@ {7d838fa4-b5a1-4b8a-b37d-823fb026055b} + + {ed685e39-b7dd-4546-a865-149664fa71a4} + \ No newline at end of file diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index a7a9c521db22..c80866c6e5d5 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -11,6 +11,7 @@ #include "Sequences.h" #include "TensorShape.h" #include "MatrixPool.h" +#include "ComputationEnvironment.h" #include #include @@ -610,6 +611,14 @@ protected: public: // ...the following should be protected, but nodes inquire ab // return true if the node's value should be computed before the normal training. e.g., mean and invStd of input features. virtual bool /*IComputationNode::*/ RequiresPreCompute() const { return false; } + const ComputationEnvironment& Environment() + { + if (!m_environment) + LogicError("Environment: No environment has been set."); + return *m_environment; + } + void SetEnvironment(ComputationEnvironmentPtr environment) { m_environment = environment; } + // ----------------------------------------------------------------------- // validation // ----------------------------------------------------------------------- @@ -816,6 +825,10 @@ protected: public: // ...the following should be protected, but nodes inquire ab TensorShape m_sampleLayout; // sample layout MBLayoutPtr m_pMBLayout; + // environment information + // This structure is shared with the ComputationNetwork that this node lives in + ComputationEnvironmentPtr m_environment; + // flags related to gradient propagation float m_learningRateMultiplier; // update parameters? Only used for LearnableParameters. --TODO: Should we make this a member of LearnableParameters actually? And require a type cast? Currently it is read out for all leaves. 
bool m_gradientInitialized; // indicates whether the gradient matrix has been resized and initialized to 0 @@ -1962,4 +1975,5 @@ class BinaryElementWiseNode : public ComputationNode, public NumInputs #define UsingBinaryElementwiseNodeBaseMembers UsingComputationNodeMembersBoilerplate; #pragma endregion base computation class -} } } + +}}} diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h index 39b9d74ca0ef..f0387c6f7108 100644 --- a/Source/Math/GPUMatrix.h +++ b/Source/Math/GPUMatrix.h @@ -112,6 +112,7 @@ class MATH_API GPUMatrix : public BaseMatrix using BaseMatrix::GetArray; using BaseMatrix::GetNumRows; using BaseMatrix::GetNumCols; + using BaseMatrix::VerifySize; private: static cublasHandle_t s_cuHandle[MaxGpus]; From 3cd0308aa127694f75a99532f421402c7675bae5 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Sun, 13 Mar 2016 15:34:36 -0700 Subject: [PATCH 15/26] new node EnvironmentInput, which exposes the isTraining property to graph operations --- .../CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs | 1 + .../ComputationEnvironment.h | 45 +++++++-- .../ComputationNetwork.h | 21 +++-- .../ComputationNetworkBuilder.cpp | 1 + .../ComputationNetworkEvaluation.cpp | 3 + .../ComputationNetworkScripting.cpp | 9 +- .../ComputationNetworkLib/ComputationNode.h | 3 +- .../InputAndParamNodes.h | 94 ++++++++++++++++++- .../ComputationNetworkLib/PreComputeNodes.h | 2 + Source/ComputationNetworkLib/TrainingNodes.h | 20 ++-- Source/SGDLib/SGD.cpp | 14 ++- Source/SGDLib/SimpleEvaluator.h | 2 + Source/SGDLib/SimpleOutputWriter.h | 4 + 13 files changed, 173 insertions(+), 46 deletions(-) diff --git a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs index ece5ffd5412f..94108f9d3853 100644 --- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs +++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs @@ -36,6 +36,7 @@ Input(dims, tag='feature') = new ComputationNode [ operation = 'InputValue' ; sh SparseInput(dims, tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ] ImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', tag='feature') = new ComputationNode [ operation = 'InputValue' ; isImage = true /*plus the function args*/ ] SparseImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = true /*plus the function args*/ ] +EnvironmentInput(propertyName, tag='') = new ComputationNode [ operation = 'EnvironmentInput' /*plus the function args*/ ] Constant(val, rows = 1, cols = 1, tag='') = Parameter(rows, cols, learningRateMultiplier = 0, init = 'fixedValue', value = val) PastValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ] FutureValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'FutureValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ] diff --git a/Source/ComputationNetworkLib/ComputationEnvironment.h b/Source/ComputationNetworkLib/ComputationEnvironment.h index d860208d3431..58c2c8e2f301 100644 --- a/Source/ComputationNetworkLib/ComputationEnvironment.h +++ b/Source/ComputationNetworkLib/ComputationEnvironment.h @@ -6,27 +6,56 @@ #pragma once #include "Basics.h" - #include namespace Microsoft 
{ namespace MSR { namespace CNTK { // =========================================================================== -// ComputationEnvironment -- computation graph and operations +// ComputationEnvironment -- global network properties of interest to nodes // =========================================================================== +// mode that the network is currently used in, which affects node behavior enum class NetworkOperationMode { - unspecified, - training, - inferring, - precomputing + training, // training mode specifically means nodes should behave like training (e.g. Dropout should be active) + inferring, // inferring (e.g. BatchNorm should not update mean estimates) + preComputing // precomputation is a part of training where most nodes should behave like they are inferring }; +// class to store global properties of the network that are of interest to the nodes +// For example, a network can be in 'training' or 'inference' mode, which affects what nodes like Dropout and BN do, +// or what the seq-2-seq decoder feedback signal is. struct ComputationEnvironment { - NetworkOperationMode networkOperationMode = NetworkOperationMode::unspecified; + // networkOperationMode tells whether we are training or inferring, which affects some nodes' behavior + NetworkOperationMode networkOperationMode = NetworkOperationMode::inferring; // by default, a network is always able to infer + bool IsTraining() const { return networkOperationMode == NetworkOperationMode::training; } + bool IsPreComputing() const { return networkOperationMode == NetworkOperationMode::preComputing; } + + // more properties should be added here as needed +}; +typedef std::shared_ptr ComputationEnvironmentPtr; + +// RAII wrapper for setting and reverting ComputationEnvironment::networkOperationMode +// E.g. ScopedNetworkOperationMode modeGuard(net, NetworkOperationMode::training); +// will set the mode until the end of the scope, and then revert to its old value automatically. 
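+// A minimal usage sketch (this is how SGD wraps a training epoch; 'net' is assumed to be a ComputationNetworkPtr): +// { +// ScopedNetworkOperationMode modeGuard(net, NetworkOperationMode::training); +// /* minibatch loop runs here; nodes now see Environment().IsTraining() == true */ +// } // leaving the scope restores the previous mode +// The guard is deliberately non-copyable (its copy assignment is deleted below), so the saved mode is restored exactly once.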
+class ScopedNetworkOperationMode +{ + ComputationEnvironment& m_environment; + NetworkOperationMode m_previousNetworkOperationMode; + void operator=(const ScopedNetworkOperationMode&) = delete; +public: + template // using template to avoid dependency + ScopedNetworkOperationMode(const std::shared_ptr& net, NetworkOperationMode networkOperationMode) : + m_environment(net->Environment()) + { + m_previousNetworkOperationMode = m_environment.networkOperationMode; + m_environment.networkOperationMode = networkOperationMode; + } + ~ScopedNetworkOperationMode() // destructor restores the previous mode + { + m_environment.networkOperationMode = m_previousNetworkOperationMode; + } }; -typedef shared_ptr ComputationEnvironmentPtr; }}} diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 87c5b7061f89..7dd540e14dcd 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -385,10 +385,17 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb return nodes; } + // ----------------------------------------------------------------------- + // environment properties + // ----------------------------------------------------------------------- + + ComputationEnvironment& Environment() const { return *m_environment; } + // ----------------------------------------------------------------------- // functions to pass on specific SGD options to nodes // ----------------------------------------------------------------------- + // TODO: Why are all these static, but then take a network as the first argument? --> make them class members template static void SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, unsigned long& dropOutSeed); @@ -398,7 +405,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb template static void SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, - const double& hsmoothingWeight, + const double& hsmoothingWeight, // TODO: Why are all these passed by reference? const double& frameDropThresh, const bool& doreferencealign, const double& amf = 14.0f, @@ -406,6 +413,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb const double& wp = 0.0f, const double& bMMIfactor = 0.0f, const bool& sMBR = false); + static void SetMaxTempMemSizeForCNN(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const size_t maxTempMemSizeInSamples); // ----------------------------------------------------------------------- @@ -474,12 +482,6 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb return m_nameToNodeMap.size(); } - // TODO: could be a dup - std::map& GetNameToNodeMap() // specially for ExperimentalNetworkBuilder; don't use this otherwise - { - return m_nameToNodeMap; - } - std::vector GetAllNodes() const { std::vector nodes; @@ -595,13 +597,14 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb } // add a node to the network unless it's already there - ComputationNodeBasePtr AddNodeToNetIfNotYet(const ComputationNodeBasePtr& nodePtr) + // Returns false if the node was already there. 
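+ // (Callers can use that return value to skip nodes they have already visited; the BrainScript network constructor uses it that way.)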
+ bool AddNodeToNetIfNotYet(const ComputationNodeBasePtr& nodePtr) { auto result = m_nameToNodeMap.insert(make_pair(nodePtr->NodeName(), nodePtr)); if (!result.second && result.first->second != nodePtr) // if there's already one under this name, it better be nodePtr RuntimeError("AddNodeToNetIfNotYet: Duplicated computation node name."); nodePtr->SetEnvironment(m_environment); // (note: redundant if already part of the network) - return nodePtr; // allows e.g. return AddNodeToNet(New...); + return result.second; } // remove a node from the network's node set diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp index 2fed249ca72a..72eb3792b4c6 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp @@ -49,6 +49,7 @@ static shared_ptr> CreateStandardNode(const std::wstri else if (nodeType == OperationNameOf(DropoutNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(DummyCriterionNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(ElementTimesNode)) return New>(forward<_Types>(_Args)...); + else if (nodeType == OperationNameOf(EnvironmentInputNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(ErrorPredictionNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(ExpNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(FutureValueNode)) return New>(forward<_Types>(_Args)...); diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index 957c257da5fa..7e1c3de72218 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -66,6 +66,9 @@ static bool SetGradientToScalarOne(ComputationNodeBasePtr nodep) // - Backprop() for the training criterion void ComputationNetwork::Backprop(const ComputationNodeBasePtr rootNode) // training criterion to compute the gradients for { + if (!Environment().IsTraining()) + LogicError("Backprop: Requires the network to be in training mode."); + // reset all gradients to zero (actually, internally, this is lazy, but we don't care here) ZeroGradients(rootNode); diff --git a/Source/ComputationNetworkLib/ComputationNetworkScripting.cpp b/Source/ComputationNetworkLib/ComputationNetworkScripting.cpp index e7f1be1803a9..b11f944cc5c7 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkScripting.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkScripting.cpp @@ -65,12 +65,9 @@ ComputationNetwork::ComputationNetwork(const IConfigRecordPtr configp) workList.pop_front(); // add to set - let res = m_nameToNodeMap.insert(make_pair(node->NodeName(), node)); - if (!res.second) // not inserted: we already got this one - if (res.first->second == node) - continue; // the same - else // oops, a different node with the same name - LogicError("ComputationNetwork: multiple nodes with the same NodeName() '%ls'", node->NodeName().c_str()); + let wasAdded = AddNodeToNetIfNotYet(node); + if (!wasAdded) // node already there (above will fail if there is a different node with the same name) + continue; // If node derives from ILateAttachingNode() then it has unresolved inputs. Resolve them now. // This may generate a whole new load of nodes, including nodes which in turn have late init.
diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index c80866c6e5d5..976c83cd72db 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -611,7 +611,7 @@ protected: public: // ...the following should be protected, but nodes inquire ab // return true if the node's value should be computed before the normal training. e.g., mean and invStd of input features. virtual bool /*IComputationNode::*/ RequiresPreCompute() const { return false; } - const ComputationEnvironment& Environment() + const ComputationEnvironment& Environment() const { if (!m_environment) LogicError("Environment: No environment has been set."); @@ -1799,6 +1799,7 @@ protected: using Base::DetermineElementwiseTensorRank; \ using Base::DumpNodeInfo; \ using Base::EnumerateNodes; \ + using Base::Environment; \ using Base::ForwardProp; \ using Base::GetAsMatrixNumCols; \ using Base::GetAsMatrixNumRows; \ diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 4e015b4239ab..e836e1f63279 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -253,11 +253,13 @@ class LearnableParameter : public ComputationNode, public NumInputs<0> { } - virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) override + virtual void /*ComputationNode::*/ ForwardProp(const FrameRange&) override { } - virtual void /*ComputationNode::*/ ForwardProp(const FrameRange&) override + + virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) override { + LogicError("%ls %ls operation is a leaf node. BackpropTo() should never be called.", NodeName().c_str(), OperationName().c_str()); } virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override @@ -410,10 +412,12 @@ class InputValueBase : public ComputationNode, public NumInputs<0> virtual void /*ComputationNode::*/ ForwardProp(const FrameRange&) override { + // we have been filled by the Reader } + virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) { - LogicError("InputValueBase::BackpropTo() should never be called."); + LogicError("%ls is a leaf node. BackpropTo() should never be called.", NodeName().c_str()); } virtual void DumpNodeInfo(const bool printValues, const bool printMetadata, File& fstream) const override @@ -510,6 +514,90 @@ class SparseInputValue : public InputValueBase template class SparseInputValue; template class SparseInputValue; +// ----------------------------------------------------------------------- +// EnvironmentInput (propertyName) -- read out environment properties +// Such as whether we are currently training or evaluating, which can affect +// behavior, such as seq-2-seq decoding. 
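+// A hypothetical BrainScript usage sketch: isTrainingFlag = EnvironmentInput('isTraining') +// would yield a 1x1 value that reads 1 while the network is training and 0 otherwise. +// (The variable name is illustrative only; 'isTraining' is the one property supported below.)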
+// ----------------------------------------------------------------------- + +template +class EnvironmentInputNode : public ComputationNodeNonLooping, public NumInputs<0> +{ + typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() { return L"EnvironmentInput"; } + +public: + EnvironmentInputNode(DEVICEID_TYPE deviceId, const wstring& name, const wstring& propertyName = L"") : + Base(deviceId, name), m_propertyName(propertyName) + { + } + EnvironmentInputNode(const ScriptableObjects::IConfigRecordPtr configp) + : EnvironmentInputNode(configp->Get(L"deviceId"), L"", configp->Get(L"propertyName")) + { + } + + virtual void Save(File& fstream) const override + { + Base::Save(fstream); + fstream << m_propertyName; + } + + virtual void Load(File& fstream, size_t modelVersion) override + { + Base::Load(fstream, modelVersion); + fstream >> m_propertyName; + } + +private: + ElemType ReadOutVariable() const + { + const auto& e = Environment(); + if (m_propertyName == L"isTraining") + return (ElemType)e.IsTraining(); + else + InvalidArgument("EnvironmentInput: There is no environment property '%ls'", m_propertyName.c_str()); + } + +public: + // TODO: No one else overrides this method. So is this the right mechanism? + // On the other hand, we are also the only leaf that needs to update itself. + virtual bool /*ComputationNodeBase::*/ IsOutOfDateWrtInputs() const override { return true; } + + virtual void /*IComputationNode::*/ BeginForwardProp() override + { + // We are a leaf, so UpdateFunctionValuesSize() won't be called for us. + UpdateFunctionValuesSize(); + Base::BeginForwardProp(); + } + + virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override + { + ElemType val = ReadOutVariable(); + Value().VerifySize(1, 1); + Value().SetValue(val); + } + + virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t /*inputIndex*/) override + { + LogicError("%ls %ls operation is a leaf node. BackpropTo() should never be called.", NodeName().c_str(), OperationName().c_str()); + } + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + + virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override + { + ReadOutVariable(); // read out the value once, with the purpose of validating the propertyName + Base::Validate(isFinalValidationPass); + // this node does not hold mini-batch data + m_pMBLayout = nullptr; + // for now, anything this node returns is a scalar + SetDims(TensorShape(1), false); + } + +private: + wstring m_propertyName; +}; + // ----------------------------------------------------------------------- // LookupTableNode (embedding matrix, bag-of-word representation of the inputs) // Implements an embedding. The input vector can consist of multiple stacked diff --git a/Source/ComputationNetworkLib/PreComputeNodes.h b/Source/ComputationNetworkLib/PreComputeNodes.h index 96ae72479476..6a8e5e8c8532 100644 --- a/Source/ComputationNetworkLib/PreComputeNodes.h +++ b/Source/ComputationNetworkLib/PreComputeNodes.h @@ -46,6 +46,8 @@ class PreComputedNodeBase : public ComputationNodeNonLooping /*ComputationNode*/ // This is used for resetting and updating from accumulators.
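+ // Note: it may only be called while the network is in preComputing mode (enforced below).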
virtual void /*IPreComputeNode::*/ MarkComputed(const bool hasComputed) override { + if (!Environment().IsPreComputing()) + LogicError("MarkComputed: Network must be in preComputing mode."); m_hasComputed = hasComputed; } diff --git a/Source/ComputationNetworkLib/TrainingNodes.h b/Source/ComputationNetworkLib/TrainingNodes.h index 1450f1787dae..9d988e212cbc 100644 --- a/Source/ComputationNetworkLib/TrainingNodes.h +++ b/Source/ComputationNetworkLib/TrainingNodes.h @@ -370,12 +370,8 @@ template class CrossEntropyNode; template class MatrixL1RegNode : public ComputationNodeNonLooping /*ComputationNode*/, public NumInputs<1> { - typedef ComputationNodeNonLooping Base; - UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() - { - return L"MatrixL1Reg"; - } + typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() { return L"MatrixL1Reg"; } public: DeclareConstructorFromConfigWithNumInputs(MatrixL1RegNode); @@ -384,7 +380,7 @@ class MatrixL1RegNode : public ComputationNodeNonLooping /*ComputationNode*/GetMBLayout()); assert(inputIndex == 0); @@ -463,12 +459,8 @@ template class MatrixL1RegNode; template class MatrixL2RegNode : public ComputationNodeNonLooping /*ComputationNode*/, public NumInputs<1> { - typedef ComputationNodeNonLooping Base; - UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() - { - return L"MatrixL2Reg"; - } + typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() { return L"MatrixL2Reg"; } public: DeclareConstructorFromConfigWithNumInputs(MatrixL2RegNode); @@ -477,7 +469,7 @@ class MatrixL2RegNode : public ComputationNodeNonLooping /*ComputationNode*/GetMBLayout()); assert(inputIndex == 0); diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index 82404e2f2b5b..5b086aa709a7 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -732,6 +732,8 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, /*in/out*/ size_t& totalSamplesSeen, std::string prefixMsg) { + ScopedNetworkOperationMode modeGuard(net, NetworkOperationMode::training); + double totalTimeInMBs = 0; // use double since timer has sub-microsecond time resolution double epochCriterionLastMBs = 0; @@ -764,8 +766,6 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, { m_pMASGDHelper->OnEpochStart(learnableNodes); } - - std::vector*> learnParamsGradients; if (useGradientAggregation) @@ -1287,13 +1287,15 @@ bool SGD::PreCompute(ComputationNetworkPtr net, fprintf(stderr, "\tNodeName: %ls\n", (node->NodeName()).c_str()); // compute + ScopedNetworkOperationMode modeGuard(net, NetworkOperationMode::preComputing); + // trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , requestDataSize); // trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , m_epochSize); // only based on one epoch // [1/12/2015 erw] to support large dataset, we usually partition whole dataset into several epoch's, // so we need to use all the data to do precomputing if (m_useAllDataForPreComputedNode) // using all the data trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0); - else // using only one epoch + else // using only one epoch. Note: One epoch is often enough for feature mean/stddev, but not for estimating priors. 
trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, m_epochSize); net->StartEvaluateMinibatchLoop(nodes); @@ -2179,10 +2181,12 @@ bool SGD::GradientCheck(ComputationNetworkPtr net, const std::list& learnableNodes, int npos) { - vector errMsgs; + ScopedNetworkOperationMode modeGuard(net, NetworkOperationMode::training); net->StartEvaluateMinibatchLoop(criterionNodes[npos]); + vector errMsgs; // TODO: These are created but actually not returned, only their count is checked. + // gradient checking for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++) { @@ -2264,7 +2268,7 @@ bool SGD::GradientCheck(ComputationNetworkPtr net, } } - return errMsgs.size() == 0; + return errMsgs.empty(); } template class SGD; diff --git a/Source/SGDLib/SimpleEvaluator.h b/Source/SGDLib/SimpleEvaluator.h index be31c0fae794..0e734dda0c5d 100644 --- a/Source/SGDLib/SimpleEvaluator.h +++ b/Source/SGDLib/SimpleEvaluator.h @@ -47,6 +47,8 @@ class SimpleEvaluator // returns evaluation node values per sample determined by evalNodeNames (which can include both training and eval criterion nodes) vector Evaluate(IDataReader* dataReader, const vector& evalNodeNames, const size_t mbSize, const size_t testSize = requestDataSize) { + ScopedNetworkOperationMode modeGuard(m_net, NetworkOperationMode::inferring); + // determine nodes to evaluate std::vector evalNodes; diff --git a/Source/SGDLib/SimpleOutputWriter.h b/Source/SGDLib/SimpleOutputWriter.h index e6a8b057287b..812adbf4cd11 100644 --- a/Source/SGDLib/SimpleOutputWriter.h +++ b/Source/SGDLib/SimpleOutputWriter.h @@ -87,6 +87,8 @@ class SimpleOutputWriter void WriteOutput(IDataReader& dataReader, size_t mbSize, IDataWriter& dataWriter, const std::vector& outputNodeNames, size_t numOutputSamples = requestDataSize, bool doUnitTest = false) { + ScopedNetworkOperationMode modeGuard(m_net, NetworkOperationMode::inferring); + std::vector outputNodes = DetermineOutputNodes(outputNodeNames); std::vector inputNodes = DetermineInputNodes(outputNodes); @@ -190,6 +192,8 @@ class SimpleOutputWriter // TODO: Remove code dup with above function by creating a fake Writer object and then calling the other function. 
void WriteOutput(IDataReader& dataReader, size_t mbSize, std::wstring outputPath, const std::vector& outputNodeNames, const WriteFormattingOptions & formattingOptions, size_t numOutputSamples = requestDataSize) { + ScopedNetworkOperationMode modeGuard(m_net, NetworkOperationMode::inferring); + std::vector outputNodes = DetermineOutputNodes(outputNodeNames); std::vector inputNodes = DetermineInputNodes(outputNodes); From abfc23e041b89f2c2ddc32ec0e79fba58ebba4c4 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Mon, 14 Mar 2016 20:01:31 -0700 Subject: [PATCH 16/26] writeWordAndClassInfo can now also write a regular labelMapping file; WriteMinibatchWithFormatting() now supports packed sequences --- Source/ActionsLib/OtherActions.cpp | 111 ++++++++++-------- .../ComputationNetworkLib/ComputationNode.cpp | 68 +++++------ .../LMSequenceReader/SequenceReader.cpp | 2 + 3 files changed, 99 insertions(+), 82 deletions(-) diff --git a/Source/ActionsLib/OtherActions.cpp b/Source/ActionsLib/OtherActions.cpp index 39cb6ac45ccc..1d7629600341 100644 --- a/Source/ActionsLib/OtherActions.cpp +++ b/Source/ActionsLib/OtherActions.cpp @@ -205,13 +205,18 @@ template void DoParameterSVD(const ConfigParameters& config); // DoWriteWordAndClassInfo() - implements CNTK "writeWordAndClass" command // =========================================================================== -// BUGBUG: This should compare both elements (first one is the word name). This current version leads to different sorting and thus class definitions with VS and gcc. +// compare functor for sorting by the second element of a pair +// TODO: just use a lambda template struct compare_second { bool operator()(const T& lhs, const T& rhs) const { - return lhs.second < rhs.second; + // BUGBUG: This should compare both elements (first one is the word name). This current version leads to different sorting and thus class definitions with VS and gcc. + //if (lhs.second == rhs.second) // if second element + // return lhs.first < rhs.first; + //else + return lhs.second < rhs.second; } }; @@ -242,8 +247,9 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) int cutoff = config(L"cutoff", "1"); string inputFile = config(L"inputFile"); // training text file without - string outputVocabFile = config(L"outputVocabFile"); - string outputWord2Cls = nbrCls > 0 ? config(L"outputWord2Cls") : string(); + string outputMappingFile = config(L"outputMappingFile", ""); // if specified then write a regular mapping file + string outputVocabFile = config(L"outputVocabFile"); + string outputWord2Cls = nbrCls > 0 ? config(L"outputWord2Cls") : string(); string outputCls2Index = nbrCls > 0 ?
config(L"outputCls2Index") : string(); string unkWord = config(L"unk", ""); @@ -254,14 +260,16 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) if (beginSequence.empty() || endSequence.empty()) InvalidArgument("Please specify parameters 'beginSequence' and 'endSequence'."); - std::cerr << "Vocabulary file: " << outputVocabFile << std::endl; + if (!outputMappingFile.empty()) + cerr << "Mapping file --> " << outputVocabFile << endl; + cerr << "Vocabulary file --> " << outputVocabFile << endl; if (nbrCls > 0) { - std::cerr << "Word-to-class map: " << outputWord2Cls << std::endl; - std::cerr << "Class-to-index map: " << outputCls2Index << std::endl; + cerr << "Word-to-class map --> " << outputWord2Cls << endl; + cerr << "Class-to-index map --> " << outputCls2Index << endl; } - std::cerr << std::endl; - + cerr << endl; + // check whether we are already up-to-date bool makeMode = config(L"makeMode", true); if (makeMode) @@ -274,7 +282,7 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) } if (done) { - std::cerr << "All output files up to date.\n"; + cerr << "All output files up to date.\n"; return; } } @@ -285,19 +293,12 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) ifstream fp(inputFile.c_str()); // TODO: use class File, as to support pipes if (!fp) RuntimeError("Failed to open input file: %s", inputFile.c_str()); - cerr << "Reading input file inputFile: " << inputFile << std::endl; + cerr << "Reading input file inputFile: " << inputFile << endl; if (nbrCls > 0) cls2idx.Resize(nbrCls, 1); -#if 1 - std::unordered_map v_count; -#else - // TODO: For unknown reasons, this gives a very different result (PPL of 500 instead of 190). Should be tracked down. - std::map v_count; - v_count[beginSequence] = 0; // get these into the table upfront into position 0 (and 1 if different) - v_count[endSequence] = 0; -#endif + unordered_map v_count; // process input line by line string str; @@ -323,18 +324,14 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) } fp.close(); - std::cerr << "Vocabulary size " << v_count.size() << ".\n"; - - std::vector m_words; - std::set m_remained_words; - std::unordered_map m_index; + cerr << "Vocabulary size " << v_count.size() << ".\n"; - std::vector m_count; - std::vector m_class; // class index of each word + vector m_words; + set m_remained_words; + unordered_map m_index; - typedef std::pair stringdouble; - std::priority_queue, compare_second> - q(compare_second(), std::vector(v_count.begin(), v_count.end())); + vector m_count; + vector m_class; // class index of each word size_t wordCountLessCutoff = v_count.size(); if (cutoff > 0) @@ -348,24 +345,30 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) if (vocabSize > wordCountLessCutoff) { - std::cerr << "Warning: actual vocabulary size is less than required." << endl; - std::cerr << "\t\tRequired vocabulary size:" << vocabSize << endl; - std::cerr << "\t\tActual vocabulary size:" << v_count.size() << endl; - std::cerr << "\t\tActual vocabulary size after cutoff:" << wordCountLessCutoff << endl; - std::cerr << "\t\tWe will change to actual vocabulary size: " << wordCountLessCutoff << endl; + cerr << "Warning: actual vocabulary size is less than required." 
<< endl; + cerr << "\t\tRequired vocabulary size:" << vocabSize << endl; + cerr << "\t\tActual vocabulary size:" << v_count.size() << endl; + cerr << "\t\tActual vocabulary size after cutoff:" << wordCountLessCutoff << endl; + cerr << "\t\tWe will change to actual vocabulary size: " << wordCountLessCutoff << endl; vocabSize = wordCountLessCutoff; } + + // form classes + // Implements an algorithm by Mikolov --TODO: get the reference wrd2cls.Resize(vocabSize, 1); - std::unordered_map removed; - double unkCount = 0; + typedef pair stringdouble; + unordered_map removed; // note: std::map is supposedly faster + double unkCount = 0; // TODO: why double? size_t size = 0; size_t actual_vocab_size = vocabSize - 1; - while (size < actual_vocab_size && !q.empty()) + priority_queue, compare_second> + q(compare_second(), vector(v_count.begin(), v_count.end())); + while (size < actual_vocab_size && !q.empty()) // ==for (q=...; cond; q.pop()) { size++; - std::string word = q.top().first; - double freq = q.top().second; + string word = q.top().first; + double freq = q.top().second; // TODO: why double? if (word == unkWord) { unkCount += freq; @@ -380,8 +383,6 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) q.pop(); } removed[unkWord] = unkCount; - std::priority_queue, compare_second> - p(compare_second(), std::vector(removed.begin(), removed.end())); m_count.resize(removed.size()); double total = 0; double dd = 0; @@ -396,11 +397,13 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) double df = 0; size_t class_id = 0; - m_class.resize(p.size()); + m_class.resize(removed.size()); + priority_queue, compare_second> + p(compare_second(), vector(removed.begin(), removed.end())); while (!p.empty()) { - std::string word = p.top().first; + string word = p.top().first; double freq = p.top().second; if (nbrCls > 0) { @@ -423,9 +426,19 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) p.pop(); } - std::ofstream ofvocab; + // write the files + if (!outputMappingFile.empty()) + { + msra::files::make_intermediate_dirs(s2ws(outputMappingFile)); + ofstream ofmapping(outputMappingFile.c_str()); + for (size_t i = 0; i < m_index.size(); i++) + ofmapping << m_words[i] << endl; + ofmapping.close(); + cerr << "Created label-mapping file with " << v_count.size() << " entries.\n"; + } + msra::files::make_intermediate_dirs(s2ws(outputVocabFile)); - ofvocab.open(outputVocabFile.c_str()); + ofstream ofvocab(outputVocabFile.c_str()); for (size_t i = 0; i < m_index.size(); i++) { if (nbrCls > 0) @@ -436,10 +449,10 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) cls2idx(clsIdx, 0) = (ElemType) i; // the left boundary of clsIdx prevClsIdx = m_class[i]; } - ofvocab << " " << i << "\t " << m_count[i] << "\t" << m_words[i] << "\t" << clsIdx << std::endl; + ofvocab << " " << i << "\t " << m_count[i] << "\t" << m_words[i] << "\t" << clsIdx << endl; } ofvocab.close(); - std::cerr << "Created vocabulary file with " << v_count.size() << " entries.\n"; + cerr << "Created vocabulary file with " << v_count.size() << " entries.\n"; if (nbrCls > 0) { @@ -452,7 +465,7 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) for (size_t r = 0; r < wrd2cls.GetNumRows(); r++) owfp << (int) wrd2cls(r, 0) << endl; owfp.close(); - std::cerr << "Created word-to-class map with " << wrd2cls.GetNumRows() << " entries.\n"; + cerr << "Created word-to-class map with " << wrd2cls.GetNumRows() << " entries.\n"; msra::files::make_intermediate_dirs(s2ws(outputCls2Index)); ofstream 
 
     if (nbrCls > 0)
     {
@@ -452,7 +465,7 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config)
         for (size_t r = 0; r < wrd2cls.GetNumRows(); r++)
             owfp << (int) wrd2cls(r, 0) << endl;
         owfp.close();
-        std::cerr << "Created word-to-class map with " << wrd2cls.GetNumRows() << " entries.\n";
+        cerr << "Created word-to-class map with " << wrd2cls.GetNumRows() << " entries.\n";
 
         msra::files::make_intermediate_dirs(s2ws(outputCls2Index));
         ofstream ocfp(outputCls2Index.c_str());
@@ -461,7 +474,7 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config)
         for (size_t r = 0; r < cls2idx.GetNumRows(); r++)
             ocfp << (int) cls2idx(r, 0) << endl;
         ocfp.close();
-        std::cerr << "Created class-to-index map with " << cls2idx.GetNumRows() << " entries.\n";
+        cerr << "Created class-to-index map with " << cls2idx.GetNumRows() << " entries.\n";
     }
 }
 
diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp
index 01d398e762b8..1a3fcf792368 100644
--- a/Source/ComputationNetworkLib/ComputationNode.cpp
+++ b/Source/ComputationNetworkLib/ComputationNode.cpp
@@ -298,11 +298,13 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, size_t onl
                                                              const string& sequenceSeparator, const string& sequencePrologue, const string& sequenceEpilogue, const string& elementSeparator, const string& sampleSeparator,
                                                              const string& valueFormatString) const
 {
-    // get it (into a flat CPU-side vector)
+    // get minibatch matrix -> matData, matRows, matStride
     const Matrix<ElemType>& outputValues = Value();
-    size_t tempArraySize = 0;
-    ElemType* tempArray = nullptr;
-    outputValues.CopyToArray(tempArray, tempArraySize);
+    let matRows = outputValues.GetNumRows();
+    let matStride = matRows; // how to get from one column to the next
+    ElemType* matData = nullptr;
+    size_t matDataSize = 0;
+    outputValues.CopyToArray(matData, matDataSize);
 
     // process all sequences one by one
     auto pMBLayout = GetMBLayout();
@@ -312,19 +314,21 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, size_t onl
         pMBLayout->InitAsFrameMode(1); // treat this as if we have one single sample
         // TODO: This can be done more efficiently, if ever needed.
     }
-    const auto& sequences = pMBLayout->GetAllSequences();
-    size_t colStride = pMBLayout->GetNumParallelSequences() * outputValues.GetNumRows(); // how to get from one column to the next
-    size_t width = pMBLayout->GetNumTimeSteps();
+    let& sequences = pMBLayout->GetAllSequences();
+    let width = pMBLayout->GetNumTimeSteps();
     for (size_t s = 0; s < sequences.size(); s++)
     {
         const auto& seqInfo = sequences[s];
         if (seqInfo.seqId == GAP_SEQUENCE_ID) // nothing in gaps to print
            continue;
-        size_t tBegin = seqInfo.tBegin >= 0 ? seqInfo.tBegin : 0;
-        size_t tEnd = seqInfo.tEnd <= width ? seqInfo.tEnd : width;
+        let tBegin = seqInfo.tBegin >= 0 ? seqInfo.tBegin : 0;
+        let tEnd = seqInfo.tEnd <= width ?
seqInfo.tEnd : width; - // current sequence is a matrix with 'colStride' beginning at the following pointer - ElemType* pCurValue = tempArray + s * outputValues.GetNumRows() + seqInfo.tBegin; + // get sequence matrix -> seqData, seqRows, seqCols, seqStride + let seqData = matData + pMBLayout->GetColumnIndex(seqInfo, 0) * matStride; + auto seqRows = matRows; + let seqCols = tEnd - tBegin; + let seqStride = pMBLayout->GetNumParallelSequences() * matStride; if (s > 0) fprintfOrDie(f, "%s", sequenceSeparator.c_str()); @@ -332,40 +336,39 @@ void ComputationNode::WriteMinibatchWithFormatting(FILE* f, size_t onl // output it according to our format specification let formatChar = valueFormatString.back(); - size_t dim = outputValues.GetNumRows(); - size_t T = tEnd - tBegin; - if (isCategoryLabel) + if (isCategoryLabel) // if is category then find the max value and output its index (possibly mapped to a string) { if (formatChar == 's') // verify label dimension { if (outputValues.GetNumRows() != labelMapping.size()) - InvalidArgument("write: Row dimension %d does not match number of entries %d in labelMappingFile", (int)dim, (int)labelMapping.size()); + InvalidArgument("write: Row dimension %d does not match number of entries %d in labelMappingFile", (int)seqRows, (int)labelMapping.size()); } // update the matrix in-place from one-hot (or max) to index // find the max in each column - for (size_t j = 0; j < T; j++) + for (size_t j = 0; j < seqCols; j++) // loop over all time steps of the sequence { - double maxPos = -1; + double maxLoc = -1; double maxVal = 0; - for (size_t i = 0; i < dim; i++) + for (size_t i = 0; i < seqRows; i++) // loop over rows { - double val = pCurValue[i + j * dim * colStride]; - if (maxPos < 0 || val >= maxVal) + let val = seqData[i + j * seqStride]; + if (maxLoc < 0 || val >= maxVal) { - maxPos = (double)i; + maxLoc = (double)i; maxVal = val; } } - pCurValue[0 + j * colStride] = (ElemType)maxPos; // overwrite first element in-place + seqData[0 + j * seqStride] = (ElemType)maxLoc; // overwrite first element in-place } - dim = 1; // ignore remaining dimensions + seqRows = 1; // ignore remaining dimensions } - let iend = transpose ? dim : T; // true dimension of the data to print - let jend = transpose ? T : dim; + // bounds for printing + let iend = transpose ? seqRows : seqCols; // true dimension of the data to print + let jend = transpose ? seqCols : seqRows; let istop = transpose ? onlyUpToRow : onlyUpToT; // we stop at these dimensions (for debugging, one often needs only the first few values of those huge matrices) let jstop = transpose ? onlyUpToT : onlyUpToRow; - let istride = transpose ? 1 : colStride; - let jstride = transpose ? colStride : 1; + let istride = transpose ? 1 : seqStride; + let jstride = transpose ? 
seqStride : 1; for (size_t j = 0; j < jend; j++) { if (j > 0) @@ -384,19 +387,18 @@ void ComputationNode::WriteMinibatchWithFormatting(FILE* f, size_t onl fprintf(f, "...+%d", (int)(iend - istop)); break; } - else if (formatChar == 'f') // print as real number + double dval = seqData[i * istride + j * jstride]; + if (formatChar == 'f') // print as real number { - double dval = pCurValue[i * istride + j * jstride]; fprintfOrDie(f, valueFormatString.c_str(), dval); } else if (formatChar == 'u') // print category as integer index { - unsigned int uval = (unsigned int)pCurValue[i * istride + j * jstride]; - fprintfOrDie(f, valueFormatString.c_str(), uval); + fprintfOrDie(f, valueFormatString.c_str(), (unsigned int)dval); } else if (formatChar == 's') // print category as a label string { - size_t uval = (size_t)pCurValue[i * istride + j * jstride]; + size_t uval = (size_t)dval; assert(uval < labelMapping.size()); const char * sval = labelMapping[uval].c_str(); fprintfOrDie(f, valueFormatString.c_str(), sval); @@ -406,7 +408,7 @@ void ComputationNode::WriteMinibatchWithFormatting(FILE* f, size_t onl fprintfOrDie(f, "%s", sequenceEpilogue.c_str()); } // end loop over sequences - delete[] tempArray; + delete[] matData; } // ----------------------------------------------------------------------- diff --git a/Source/Readers/LMSequenceReader/SequenceReader.cpp b/Source/Readers/LMSequenceReader/SequenceReader.cpp index 838f56402fc8..b3759342e7dc 100644 --- a/Source/Readers/LMSequenceReader/SequenceReader.cpp +++ b/Source/Readers/LMSequenceReader/SequenceReader.cpp @@ -1506,6 +1506,7 @@ void BatchSequenceReader::InitFromConfig(const ConfigRecordType& reade } else { + fprintf(stderr, "LMSequenceReader: Label mapping will be created internally on the fly because the labelMappingFile was not found: %ls\n", labelPath.c_str()); if (wClassFile != L"") { #if 0 @@ -1538,6 +1539,7 @@ void BatchSequenceReader::InitFromConfig(const ConfigRecordType& reade } labelInfo.mapName = labelPath; labelInfo.fileToWrite = labelPath; // mapping path denotes an output: write the mapping here at the end + // BUGBUG: This facility is not functional. No file is being created. 
    }
}
 
From fcbc749a00d478a68445d7c2113469984c48645a Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Mon, 14 Mar 2016 21:05:04 -0700
Subject: [PATCH 17/26] (incorporated minor code-review feedback)

---
 Source/ComputationNetworkLib/ReshapingNodes.cpp | 12 ++++++------
 Source/ComputationNetworkLib/ReshapingNodes.h   |  2 +-
 Source/SGDLib/SGD.cpp                           |  4 ++--
 Source/SGDLib/SGD.h                             |  3 ++-
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/Source/ComputationNetworkLib/ReshapingNodes.cpp b/Source/ComputationNetworkLib/ReshapingNodes.cpp
index 9f07164d63b2..d2860fda66c6 100644
--- a/Source/ComputationNetworkLib/ReshapingNodes.cpp
+++ b/Source/ComputationNetworkLib/ReshapingNodes.cpp
@@ -32,17 +32,17 @@ struct SequenceLengthVector
 {
     typedef vector<vector<size_t>> SequenceVector;
     typedef MBLayout::SequenceInfo SequenceInfo;
-    const SequenceVector& sequenceVector;     //
-    const vector<SequenceInfo>& sequenceInfo; // original sequence info (for seqId)
-    SequenceLengthVector(const vector<SequenceInfo>& sequenceInfo, const SequenceVector& sequenceVector) : sequenceInfo(sequenceInfo), sequenceVector(sequenceVector) { }
-    size_t size() const { return sequenceInfo.size(); }
+    const SequenceVector& m_sequenceVector;     // vector of sequences (to get sequence length)
+    const vector<SequenceInfo>& m_sequenceInfo; // original sequence info (for seqId)
+    SequenceLengthVector(const vector<SequenceInfo>& sequenceInfo, const SequenceVector& sequenceVector) : m_sequenceInfo(sequenceInfo), m_sequenceVector(sequenceVector) { }
+    size_t size() const { return m_sequenceInfo.size(); }
     MBLayout::SequenceInfo operator[](size_t i) const // return a descriptor of the new sequence
     {
         SequenceInfo seq;
-        seq.seqId = sequenceInfo[i].seqId;
+        seq.seqId = m_sequenceInfo[i].seqId;
         seq.s = i;
         seq.tBegin = 0;
-        seq.tEnd = sequenceVector[i].size();
+        seq.tEnd = m_sequenceVector[i].size();
         return seq;
     }
     void operator=(const SequenceLengthVector&) = delete;
diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h
index dbade27997b3..7c186e9b421a 100644
--- a/Source/ComputationNetworkLib/ReshapingNodes.h
+++ b/Source/ComputationNetworkLib/ReshapingNodes.h
@@ -560,7 +560,7 @@ template class RowRepeatNode;
 // -----------------------------------------------------------------------
 // WhereNode(cond) -- extract indices of non-0 values in a sequence
-// As this implies a runtime-vale dependent reduction in dimension, it can
+// As this implies a runtime-value dependent reduction in dimension, it can
 // only be applied to time sequences, and not other tensor dimensions.
 // The result will have a different MBLayout reflecting the shortened result sequences.
 // -----------------------------------------------------------------------
diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp
index 5b086aa709a7..17046306c497 100644
--- a/Source/SGDLib/SGD.cpp
+++ b/Source/SGDLib/SGD.cpp
@@ -73,9 +73,9 @@ void SGD<ElemType>::Train(function createN
     // set tracing flags
     for (const auto& traceNodeName : m_traceNodeNamesReal)
-        net->GetNodeFromName(traceNodeName)->EnableNodeTracing(false);
+        net->GetNodeFromName(traceNodeName)->EnableNodeTracing(/*isCategoryLabel=*/false);
     for (const auto& traceNodeName : m_traceNodeNamesCategory)
-        net->GetNodeFromName(traceNodeName)->EnableNodeTracing(true);
+        net->GetNodeFromName(traceNodeName)->EnableNodeTracing(/*isCategoryLabel=*/true);
 
     TrainOrAdaptModel(startEpoch, net, loadNetworkFromCheckpoint, net, nullptr, trainSetDataReader, validationSetDataReader);
 }
diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h
index 70f0889a0114..2278f61da7a4 100644
--- a/Source/SGDLib/SGD.h
+++ b/Source/SGDLib/SGD.h
@@ -507,7 +507,8 @@ class SGD : public SGDParams
     wstring m_evalCriterionNodeName;
 
     // enable tracing. Nodes listed here get their m_traceNodeValue and m_traceNodeValueAsCategoryLabel flags set
-    vector<wstring> m_traceNodeNamesReal, m_traceNodeNamesCategory;
+    vector<wstring> m_traceNodeNamesReal;
+    vector<wstring> m_traceNodeNamesCategory;
 
     size_t m_prevChosenMinibatchSize;
     double m_lastFinishedEpochTrainLoss;

From 62ca8680c5ab054b1d56ce8b86362abd151543bf Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Tue, 15 Mar 2016 08:08:20 -0700
Subject: [PATCH 18/26] bug fix: Backprop() must be prepared to run on a node
 whose gradient has not yet been allocated; bug fix: HardmaxNode must not
 consider calling BackpropTo() on it an error; bug fix: DelayNodeBase must
 never deserialize the matrix row dimension

---
 .../PennTreebank/Config/S2SAutoEncoder.cntk     | 524 ++++++++----------
 .../CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs   |  13 +-
 Source/Common/Include/TensorShape.h             |   3 +-
 .../ComputationNetworkAnalysis.cpp              |   2 +-
 .../ComputationNetworkLib/ComputationNode.cpp   |  65 ++-
 .../ComputationNetworkLib/ComputationNode.h     |  42 +-
 .../ComputationNetworkLib/NonlinearityNodes.h   |  23 +-
 Source/ComputationNetworkLib/RecurrentNodes.h   |  30 +-
 Source/ComputationNetworkLib/ReshapingNodes.h   |   2 +-
 9 files changed, 329 insertions(+), 375 deletions(-)

diff --git a/Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk b/Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk
index c8d0e722da1b..5bf886bfccdd 100644
--- a/Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk
+++ b/Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk
@@ -5,9 +5,9 @@
 ####################
 
 # Command line to run in debugger:
-# configFile=$(SolutionDir)Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk RunDir=$(SolutionDir)Examples/Text/PennTreebank/_run RootDir=$(SolutionDir)Examples/Text/PennTreebank/_run DataDir=$(SolutionDir)Examples/Text/PennTreebank/Data ConfigDir=$(SolutionDir)Examples/Text/PennTreebank/Config stderr=$(SolutionDir)Examples/Text/PennTreebank/_run/Simple.log train=[SGD=[maxEpochs=1]] train=[epochSize=2048]] confVocabSize=1000 DeviceId=0 makeMode=false
+# configFile=$(SolutionDir)Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk RunDir=$(SolutionDir)Examples/Text/PennTreebank/_run RootDir=$(SolutionDir)Examples/Text/PennTreebank/_run DataDir=$(SolutionDir)Examples/Text/PennTreebank/Data ConfigDir=$(SolutionDir)Examples/Text/PennTreebank/Config stderr=$(SolutionDir)Examples/Text/PennTreebank/_run/S2SAutoEncoder.log train=[SGD=[maxEpochs=1]] confVocabSize=1000 DeviceId=-1
makeMode=false # Append this for small set: -# trainFile=ptb.small.train.txt validFile=ptb.small.valid.txt testFile=ptb.small.test.txt +# train=[epochSize=2048]] trainFile=ptb.small.train.txt validFile=ptb.small.valid.txt testFile=ptb.small.test.txt # It implements a sequence-to-sequence based auto-encoder. # It encodes an entire sentence into a flat vector, and tries to regenerate it. @@ -37,14 +37,16 @@ modelPath = "$ModelDir$/S2SAutoEncoder.dnn" # uncomment the following line to write logs to a file #stderr=$OutputDir$/rnnOutput -numCPUThreads = 1 +#numCPUThreads = 1 confVocabSize = 10000 confClassSize = 50 useStabilizer = true -trainFile = "ptb.train.txt" +#trainFile = "ptb.train.txt" +trainFile = "ptb.small.train.txt" validFile = "ptb.valid.txt" +#validFile = "ptb.small.valid.txt" testFile = "ptb.test.txt" ####################################### @@ -53,11 +55,20 @@ testFile = "ptb.test.txt" BrainScriptNetworkBuilder = [ + # import general config options from outside config values + vocabDim = $confVocabSize$ + nbrClass = $confClassSize$ + + useStabilizer = $useStabilizer$ + useEncoder = true // if false, this becomes a regular RNN + # import some namespaces - RecurrentLSTMP = BS.RNNs.RecurrentLSTMP Parameters = BS.Parameters - Loop = BS.Loop - Boolean = BS.Boolean + Constants = BS.Constants + Sequences = BS.Sequences + Loop = BS.Loop + Boolean = BS.Boolean + RecurrentLSTMP = BS.RNNs.RecurrentLSTMP # define an LSTM with a per-sequence initialization value # TODO: Not currently used. Move to BS library once tested. @@ -82,12 +93,6 @@ BrainScriptNetworkBuilder = [ ] ].lstmState.h // that's the value we return - # import general config options from outside config values - vocabDim = $confVocabSize$ - nbrClass = $confClassSize$ - - useStabilizer = $useStabilizer$ - embeddingDim = 300 hiddenDim = 200 @@ -102,7 +107,10 @@ BrainScriptNetworkBuilder = [ # embedding E = Parameters.WeightParam (vocabDim, embeddingDim) # note: this is assumed to be applied transposed, hence the swapped dimensions - Embed (x) = TransposeTimes (E, Parameters.Stabilize (x, enabled=useStabilizer)) # embeddings are linear, so better stabilize. We really should use BatchNorm. + Embed (x) = TransposeTimes (E, Parameters.Stabilize (x, enabled=useStabilizer)) # embeddings are linear, so better stabilize. We really should use BatchNorm. + #E = Parameters.WeightParam (embeddingDim, vocabDim) # note: this is assumed to be applied transposed, hence the swapped dimensions + #Embed (x) = new ComputationNode [ operation = 'LookupTable' ; inputs = (E : Parameters.Stabilize (x, enabled=useStabilizer)) ; tag = '' ] + inputEmbedded = Embed (input) labelsEmbedded = Embed (labels) @@ -124,16 +132,18 @@ BrainScriptNetworkBuilder = [ /*then*/ x, // then copy that /*else*/ FutureValue (0, result)) // else just propagate to the front ].result - thoughtVectorDim = decoderDims[decoderOutputLayer] + thoughtVectorDim = encoderDims[encoderOutputLayer] # decoder # The decoder starts with hidden state 0 # and takes as input (thoughtVector; previous word). 
+    decoderInputDim = if useEncoder then thoughtVectorDim + embeddingDim else embeddingDim
+    decoderInput    = if useEncoder then RowStack (thoughtVector : Loop.Previous (decoderFeedback)) else Loop.Previous (decoderFeedback)
     decoderOutputLayer = Length (decoderDims)-1
     decoder[i:0..decoderOutputLayer] =
         if i == 0
-        then RecurrentLSTMP (thoughtVectorDim + embeddingDim, decoderDims[i], decoderDims[i],
-                             RowStack (thoughtVector : Loop.Previous (labelsEmbedded)),
+        then RecurrentLSTMP (decoderInputDim, decoderDims[i], decoderDims[i],
+                             decoderInput,
                              enableSelfStabilization=useStabilizer)
         else RecurrentLSTMP (decoderDims[i-1], decoderDims[i], decoderDims[i],
                              decoder[i-1],
@@ -145,12 +155,34 @@ BrainScriptNetworkBuilder = [
     W(x) = Parameters.WeightParam (vocabDim, decoderDim) * Parameters.Stabilize (x, enabled=useStabilizer)
     B = Parameters.BiasParam (vocabDim)
 
-    z = W(decoderOutput) + B; // top-level input to Softmax
+    z = W(decoderOutput) + B;  // top-level input to Softmax
+
+    decoderOutputEmbedded = Embed (Hardmax (z))
+
+    # decoder feedback differs between training and test
+    isTraining = EnvironmentInput('isTraining', tag='eval')
+    #decoderFeedback = labelsEmbedded
+    # BUGBUG: This does not work:
+    decoderFeedback = Boolean.If (isTraining, labelsEmbedded, decoderOutputEmbedded)
+    # 'decoderFeedback' gets topo-sorted to the end of the loop, which is the wrong entry point, it must be way down; 'z' is the right entry point from the top
+
+    # exclude the first token, which is sentence start. Don't want to train on that.
+    CastAs (type, data) = Sequences.Scatter (Constants.OnesLike (type), data)
+
+    SkipFirst (x) = Sequences.Skip (1, x)
+    z1      = SkipFirst (z)
+    labels1 = CastAs (z1, SkipFirst (labels))
 
     # training criteria
-    # The target is the full sequence including <s> and </s>.
-    ce = CrossEntropyWithSoftmax(labels, z, tag='criterion')  // this is the training objective
-    wer = ErrorPrediction(labels, z, tag='eval')              // this also gets tracked
+    ce = CrossEntropyWithSoftmax(labels1, z1, tag='criterion')  // this is the training objective
+    wer = ErrorPrediction (labels1, z1, tag='eval')             // this also gets tracked
+
+    #indexTestVals = Plus (decoderOutput, BS.Constants.Zero, tag='eval')
+    #indexTest = RowSlice (0, 1, indexTestVals)
+    #index = Where (RectifiedLinear (indexTest), tag='eval'); // for testing: this thresholds all negative numbers to 0=false, keeping positive as !=0=true
+    #packedIndex = PackedIndex (indexTest, index, tag='eval')
+    #filtered = GatherPacked (packedIndex, indexTestVals, tag='eval')
+    #unfiltered = ScatterPacked (indexTest, packedIndex, filtered, tag='eval')
 ]
@@ -158,8 +190,10 @@ BrainScriptNetworkBuilder = [
 #######################################
 
 reader = [
-    readerType = LMSequenceReader
+    file = "$DataDir$/$trainFile$"
     #randomize = "auto"  # gets ignored
+
+    readerType = LMSequenceReader
     mode = "softmax"
     nbruttsineachrecurrentiter = 0  # 0 means auto-fill given minibatch size
     cacheBlockSize = 100000000      # read block size. This value is large enough to load entire corpus at once
 
     # word class info
     wordclass = "$ModelDir$/vocab.txt"
 
     #### write definition
     # if writerType is set, we will cache to a binary file
     # if the binary file exists, we will use it instead of parsing this file
     #writerType = BinaryReader
     wfile = $CacheDir$\sequenceSentence.bin
     # if calculated size would be bigger, that is used instead
     wsize = 256
     #wrecords - number of records we should allocate space for in the file
     # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file
     wrecords = 1000
@@ -180,7 +214,73 @@ reader = [
     #windowSize - number of records we should include in BinaryWriter window
     windowSize = 10000
 
-    file = "$DataDir$/$trainFile$"
+    # additional features sections
+    # For input labels, we need both 'features' and the first labels section (called 'inputLabelsDef' below)
+    input = [
+        dim = 0  # no (explicit) labels ...labelDim correct??
+ ### write definition + sectionType = "data" + ] + # labels sections + # TODO: seems we must specify two labels (in and out), but labelType = "none" is allowed + # labels sections --this is required, but our labels are extracted from the inLabels + inputLabelsDef = [ # BUGBUG: Make sure that this section name comes before the dummy output labels alphabetically + dim = 1 + + # vocabulary size + labelType = "category" + labelDim = "$confVocabSize$" + labelMappingFile = "$ModelDir$/vocab.wl" + beginSequence = "" + endSequence = "" + + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping = [ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=11 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category = [ + dim=11 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + outputDummy = [ + labelType = "none" + ] +] + +cvReader = [ + file = "$DataDir$/$validFile$" + #randomize = "none" # gets ignored + + # everything below here is duplicated from 'reader' + readerType = LMSequenceReader + mode = "softmax" + nbruttsineachrecurrentiter = 0 # 0 means auto-fill given minibatch size + cacheBlockSize = 100000000 # read block size. This value is large enough to load entire corpus at once + + # word class info + wordclass = "$ModelDir$/vocab.txt" + + #### write definition + # if writerType is set, we will cache to a binary file + # if the binary file exists, we will use it instead of parsing this file + #writerType = BinaryReader + wfile = $CacheDir$\sequenceSentence.bin + # if calculated size would be bigger, that is used instead + wsize = 256 + #wrecords - number of records we should allocate space for in the file + # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file + wrecords = 1000 + #windowSize - number of records we should include in BinaryWriter window + windowSize = 10000 # additional features sections # For input labels, we need both 'features' and the first labels section (called 'inputLabelsDef' below) @@ -198,8 +298,8 @@ reader = [ # vocabulary size labelType = "category" labelDim = "$confVocabSize$" - labelMappingFile = "$OutputDir$/sentenceLabels.txt" - beginSequence = "" + labelMappingFile = "$ModelDir$/vocab.wl" + beginSequence = "" endSequence = "" #### Write definition #### @@ -233,6 +333,7 @@ writeWordAndClassInfo = [ inputFile = "$DataDir$/$trainFile$" beginSequence = "" endSequence = "" + outputMappingFile = "$ModelDir$/vocab.wl" outputVocabFile = "$ModelDir$/vocab.txt" outputWord2Cls = "$ModelDir$/word2cls.txt" outputCls2Index = "$ModelDir$/cls2idx.txt" @@ -250,14 +351,14 @@ train = [ action = "train" traceLevel = 1 epochSize = 0 # (for quick tests, this can be overridden with something small) - useValidation = false # true # TODO: need to adapt cvReader as well #BrainScriptNetworkBuilder is defined in outer scope SGD = [ - minibatchSize = 128:256:512 # TODO: Why is this here and not inside SGD? - learningRatesPerSample = 0.1 - momentumPerMB = 0 + minibatchSize = 128:256:512 + learningRatesPerSample = 0.01 + #momentumPerMB = 0 + momentumAsTimeConstant = 2500 gradientClippingWithTruncation = true # TODO: clip and truncate? What is the difference? clippingThresholdPerSample = 15.0 maxEpochs = 16 @@ -265,6 +366,14 @@ train = [ gradUpdateType = "none" # FSAdaGrad? 
loadBestModel = true + # tracing (enable these for debugging) + #traceNodeNamesReal = labelsEmbedded:decoderInput:"decoder[0].lstmState._privateInnards.ht":z.Plus_left.Times_right.result:z:ce + #traceNodeNamesReal = labelsEmbedded:decoderInput:z:ce + #traceNodeNamesReal = thoughtVector.result:zMask:z:ce:wer:indexTestVals:index:packedIndex:filtered:unfiltered:isTraining + #traceNodeNamesCategory = input + + dropoutRate = 0.0 + # settings for Auto Adjust Learning Rate AutoAdjust = [ autoAdjustLR = "adjustAfterEpoch" @@ -277,115 +386,6 @@ train = [ numPrevLearnRates = 5 numBestSearchEpoch = 1 ] - - dropoutRate = 0.0 - ] - - # if a cvReader section is specified, SGD will use this to compute the CV criterion - # TODO: adapt this - _hidden_cvReader = [ - # reader to use - readerType = "LMSequenceReader" - randomize = "none" - nbruttsineachrecurrentiter = 0 # 0 means fill up the minibatch with as many parallel sequences as fit - cacheBlockSize = 2000000 # just load it all - - # word class info - wordclass = "$ModelDir$/vocab.txt" - - # if writerType is set, we will cache to a binary file - # if the binary file exists, we will use it instead of parsing this file - # writerType = "BinaryReader" - - # write definition - wfile = "$OutputDir$/sequenceSentence.valid.bin" - - # wsize - inital size of the file in MB - # if calculated size would be bigger, that is used instead - wsize = 256 - - # wrecords - number of records we should allocate space for in the file - # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file - wrecords = 1000 - - # windowSize - number of records we should include in BinaryWriter window - windowSize = "$confVocabSize$" - - file = "$DataDir$/$validFile$" - - # additional features sections - # for now store as expanded category data (including label in) - features = [ - # sentence has no features, so need to set dimension to zero - dim = 0 - # write definition - sectionType = "data" - ] - - # labels sections - # it should be the same as that in the training set - labelIn = [ - dim = 1 - - # vocabulary size - labelDim = "$confVocabSize$" - labelMappingFile = "$OutputDir$/sentenceLabels.out.txt" - - labelType = "Category" - beginSequence = "" - endSequence = "" - - # Write definition - # sizeof(unsigned) which is the label index type - elementSize = 4 - sectionType = "labels" - - mapping = [ - # redefine number of records for this section, since we don't need to save it for each data record - wrecords = 11 - # variable size so use an average string size - elementSize = 10 - sectionType = "labelMapping" - ] - - category = [ - dim = 11 - # elementSize = sizeof(ElemType) is default - sectionType = "categoryLabels" - ] - ] - - #labels sections - labels = [ - dim = 1 - - labelType = "NextWord" - beginSequence = "O" - endSequence = "O" - - # vocabulary size - labelDim = "$confVocabSize$" - labelMappingFile = "$OutputDir$/sentenceLabels.out.txt" - - # Write definition - # sizeof(unsigned) which is the label index type - elementSize = 4 - sectionType = "labels" - - mapping = [ - # redefine number of records for this section, since we don't need to save it for each data record - wrecords = 3 - # variable size so use an average string size - elementSize = 10 - sectionType = "labelMapping" - ] - - category = [ - dim = 3 - # elementSize = sizeof(ElemType) is default - sectionType = "categoryLabels" - ] - ] ] ] @@ -403,104 +403,70 @@ test = [ epochSize = 0 reader = [ - # reader to use - readerType = "LMSequenceReader" - randomize 
= "none" - nbruttsineachrecurrentiter = 0 # 0 means fill up the minibatch with as many parallel sequences as fit - cacheBlockSize = 2000000 # just load it all - + file = "$DataDir$/$testFile$" + #randomize = "none" # gets ignored + + # everything below here is duplicated from 'reader' + readerType = LMSequenceReader + mode = "softmax" + nbruttsineachrecurrentiter = 0 # 0 means auto-fill given minibatch size + cacheBlockSize = 100000000 # read block size. This value is large enough to load entire corpus at once + # word class info wordclass = "$ModelDir$/vocab.txt" - + + #### write definition # if writerType is set, we will cache to a binary file # if the binary file exists, we will use it instead of parsing this file - # writerType = "BinaryReader" - - # write definition - wfile = "$OutputDir$/sequenceSentence.bin" - # wsize - inital size of the file in MB + #writerType = BinaryReader + wfile = $CacheDir$\sequenceSentence.bin # if calculated size would be bigger, that is used instead wsize = 256 - - # wrecords - number of records we should allocate space for in the file + #wrecords - number of records we should allocate space for in the file # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file wrecords = 1000 - - # windowSize - number of records we should include in BinaryWriter window - windowSize = "$confVocabSize$" - - file = "$DataDir$/$testFile$" - + #windowSize - number of records we should include in BinaryWriter window + windowSize = 10000 + # additional features sections - # for now store as expanded category data (including label in) - features = [ - # sentence has no features, so need to set dimension to zero - dim = 0 - # write definition + # For input labels, we need both 'features' and the first labels section (called 'inputLabelsDef' below) + input = [ + dim = 0 # no (explicit) labels ...labelDim correct?? 
+ ### write definition sectionType = "data" ] - - #labels sections - labelIn = [ + # labels sections + # TODO: seems we must specify two labels (in and out), but labelType = "none" is allowed + # labels sections --this is required, but our labels are extracted from the inLabels + inputLabelsDef = [ # BUGBUG: Make sure that this section name comes before the dummy output labels alphabetically dim = 1 - + # vocabulary size + labelType = "category" labelDim = "$confVocabSize$" - labelMappingFile = "$OutputDir$/sentenceLabels.txt" - - labelType = "Category" + labelMappingFile = "$ModelDir$/vocab.wl" beginSequence = "" - endSequence = "" - - # Write definition + endSequence = "" + + #### Write definition #### # sizeof(unsigned) which is the label index type - elementSize = 4 - sectionType = "labels" - + elementSize=4 + sectionType=labels mapping = [ - # redefine number of records for this section, since we don't need to save it for each data record - wrecords = 11 - # variable size so use an average string size - elementSize = 10 - sectionType = "labelMapping" + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=11 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping ] - category = [ - dim = 11 - # elementSize = sizeof(ElemType) is default - sectionType = "categoryLabels" + dim=11 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels ] ] - - #labels sections - labels = [ - dim = 1 - labelType = "NextWord" - beginSequence = "O" - endSequence = "O" - - # vocabulary size - labelDim = "$confVocabSize$" - - labelMappingFile = "$OutputDir$/sentenceLabels.out.txt" - # Write definition - # sizeof(unsigned) which is the label index type - elementSize = 4 - sectionType = "labels" - - mapping = [ - # redefine number of records for this section, since we don't need to save it for each data record - wrecords = 3 - # variable size so use an average string size - elementSize = 10 - sectionType = "labelMapping" - ] - - category = [ - dim = 3 - # elementSize = sizeof(ElemType) is default - sectionType = "categoryLabels" - ] + outputDummy = [ + labelType = "none" ] ] ] @@ -527,10 +493,12 @@ write = [ outputPath = "$OutputDir$/Write" #outputPath = "-" # "-" will write to stdout; useful for debugging - outputNodeNames = TrainNodeClassBasedCrossEntropy # when processing one sentence per minibatch, this is the sentence posterior + outputNodeNames = z # when processing one sentence per minibatch, this is the sentence posterior format = [ - sequencePrologue = "log P(W)=" # (using this to demonstrate some formatting strings) - type = "real" + type = "category" + transpose = false + labelMappingFile = "$ModelDir$/vocab.wl" + #sequencePrologue = "log P(W)=" # (using this to demonstrate some formatting strings) ] minibatchSize = 8192 # choose this to be big enough for the longest sentence @@ -539,104 +507,70 @@ write = [ epochSize = 0 reader = [ - # reader to use - readerType = "LMSequenceReader" - randomize = "none" # BUGBUG: This is ignored. - nbruttsineachrecurrentiter = 1 # one sentence per minibatch - cacheBlockSize = 1 # workaround to disable randomization - + file = "$DataDir$/$testFile$" + #randomize = "none" # gets ignored + + # everything below here is duplicated from 'reader' + readerType = LMSequenceReader + mode = "softmax" + nbruttsineachrecurrentiter = 1 # 0 means auto-fill given minibatch size + cacheBlockSize = 1 #00000000 # read block size. 
This value is large enough to load entire corpus at once + # word class info wordclass = "$ModelDir$/vocab.txt" - + + #### write definition # if writerType is set, we will cache to a binary file # if the binary file exists, we will use it instead of parsing this file - # writerType = "BinaryReader" - - # write definition - wfile = "$OutputDir$/sequenceSentence.bin" - # wsize - inital size of the file in MB + #writerType = BinaryReader + wfile = $CacheDir$\sequenceSentence.bin # if calculated size would be bigger, that is used instead wsize = 256 - - # wrecords - number of records we should allocate space for in the file + #wrecords - number of records we should allocate space for in the file # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file wrecords = 1000 - - # windowSize - number of records we should include in BinaryWriter window - windowSize = "$confVocabSize$" - - file = "$DataDir$/$testFile$" - + #windowSize - number of records we should include in BinaryWriter window + windowSize = 10000 + # additional features sections - # for now store as expanded category data (including label in) - features = [ - # sentence has no features, so need to set dimension to zero - dim = 0 - # write definition + # For input labels, we need both 'features' and the first labels section (called 'inputLabelsDef' below) + input = [ + dim = 0 # no (explicit) labels ...labelDim correct?? + ### write definition sectionType = "data" ] - - #labels sections - labelIn = [ + # labels sections + # TODO: seems we must specify two labels (in and out), but labelType = "none" is allowed + # labels sections --this is required, but our labels are extracted from the inLabels + inputLabelsDef = [ # BUGBUG: Make sure that this section name comes before the dummy output labels alphabetically dim = 1 - + # vocabulary size + labelType = "category" labelDim = "$confVocabSize$" - labelMappingFile = "$OutputDir$/sentenceLabels.txt" - - labelType = "Category" + labelMappingFile = "$ModelDir$/vocab.wl" beginSequence = "" - endSequence = "" - - # Write definition + endSequence = "" + + #### Write definition #### # sizeof(unsigned) which is the label index type - elementSize = 4 - sectionType = "labels" - + elementSize=4 + sectionType=labels mapping = [ - # redefine number of records for this section, since we don't need to save it for each data record - wrecords = 11 - # variable size so use an average string size - elementSize = 10 - sectionType = "labelMapping" + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=11 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping ] - category = [ - dim = 11 - # elementSize = sizeof(ElemType) is default - sectionType = "categoryLabels" + dim=11 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels ] ] - - #labels sections - labels = [ - dim = 1 - labelType = "NextWord" - beginSequence = "O" - endSequence = "O" - - # vocabulary size - labelDim = "$confVocabSize$" - - labelMappingFile = "$OutputDir$/sentenceLabels.out.txt" - # Write definition - # sizeof(unsigned) which is the label index type - elementSize = 4 - sectionType = "labels" - - mapping = [ - # redefine number of records for this section, since we don't need to save it for each data record - wrecords = 3 - # variable size so use an average string size - elementSize = 10 - sectionType = "labelMapping" - ] - - category = [ - dim = 3 - # elementSize = 
sizeof(ElemType) is default - sectionType = "categoryLabels" - ] + outputDummy = [ + labelType = "none" ] ] ] diff --git a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs index 94108f9d3853..7eb279bf4e88 100644 --- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs +++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs @@ -191,8 +191,8 @@ Sequences = [ # Gather and Scatter # We go through 3 nodes each to take advantage of x - Gather (cond, x) = GatherPacked ( PackedIndex (x, Where (cond)), x) - Scatter (cond, y) = ScatterPacked (cond, PackedIndex (y, Where (cond)), y) + Gather (cond, x) = GatherPacked ( PackedIndex (x, Where (cond)), x) # 'cond' matches 'x' + Scatter (cond, y) = ScatterPacked (cond, PackedIndex (y, Where (cond)), y) # 'cond' matches the result # sequence-altering LINQ-like operators # These generate new data packing (MBLayouts) @@ -214,10 +214,10 @@ Sequences = [ selected = Loop._IsWithin (DelayFn, N, x) out = Gather (selected, x) ].out - Skip (N, x) = _Skip (PastValue, N, x) + Skip (N, x) = if N > 0 then _Skip (PastValue, N, x) else x _Skip (DelayFn, N, x) = [ // TODO: merge with _Take selected = Loop._IsWithin (DelayFn, N, x) - out = Gather (!selected, x) + out = Gather (Boolean.Not (selected), x) ].out ElementAt (n, x) = [ // not efficient, as it filters twice. Better AND the predicates. TODO: what if n is out of range? ElementAtOrDefault startMask = Skip (n, x) // ...000111... @@ -228,10 +228,9 @@ Sequences = [ #FirstOrDefault (x) = ? // can empty sequences exist or even be represented by CNTK? - #Last (x) = _Take (FutureValue, 1, x) - Average (x) = Sum (x) / Loop.Count(x) // TODO: patch opQuotient to check 0/0 = 0 - Sum (x) = FoldL (Plus, 0, x) + Sum (x) = FoldL (Plus, 0, x) + LogSum (x) = FoldL (LogPlus, 0, x) #Max (x) = FoldL (^.Max, ?, x) // TODO: name clash; need to implement ^. #Min (x) = FoldL (^.Min, ?, x) // TODO: what's the init value? All (x) = FoldL (Boolean.And, OnesLike (x), x) diff --git a/Source/Common/Include/TensorShape.h b/Source/Common/Include/TensorShape.h index cf5ef0dd3d0b..79495e2e462f 100644 --- a/Source/Common/Include/TensorShape.h +++ b/Source/Common/Include/TensorShape.h @@ -785,4 +785,5 @@ struct ImageDimensions return AsTensorShape(m_width, m_height, m_numChannels, imageLayoutKind); } }; -} } } + +}}} diff --git a/Source/ComputationNetworkLib/ComputationNetworkAnalysis.cpp b/Source/ComputationNetworkLib/ComputationNetworkAnalysis.cpp index 9cc89cffa333..4db811c7ce57 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkAnalysis.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkAnalysis.cpp @@ -91,7 +91,7 @@ void ComputationNetwork::FormRecurrentLoops(const ComputationNodeBasePtr& rootNo const auto& node = iter->m_nestedNodes[j]; for (size_t i = 0; i < node->GetNumInputs(); i++) { - if (node->Input(i)->m_loopId == node->m_loopId && GetRecurrenceSteppingDirection(node) == 0) + if (node->Input(i)->m_loopId == node->m_loopId && GetRecurrenceSteppingDirection(node) == 0/*not a Delay node*/) { // assert(node->Input(i)->m_indexInLoop == 0); // No. It seems this variable really counts the number of parents. node->Input(i)->m_indexInLoop++; // BUGBUG: this is bumping up the m_indexInLoop, but I don't think it is initialized anywhere other than PurgeStateForFormingRecurrentLoops(). i-1? 
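The FoldL-based aggregations added to CNTK.core.bs above (Sum, LogSum) are plain left folds over a sequence. A C++ analogue of the same definitions, assuming LogPlus(a, b) = log(exp(a) + exp(b)) computed stably; a sketch, not part of the patch:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // left fold, as in FoldL (f, init, x)
    template <class F>
    double FoldL(F f, double init, const std::vector<double>& x)
    {
        double acc = init;
        for (double v : x)
            acc = f(acc, v);
        return acc;
    }

    double Plus(double a, double b) { return a + b; }

    double LogPlus(double a, double b) // log(exp(a) + exp(b)), shifted by the max for stability
    {
        double m = std::max(a, b);
        return m + std::log(std::exp(a - m) + std::exp(b - m));
    }

    // Sum (x)    = FoldL (Plus, 0, x)
    // LogSum (x) = FoldL (LogPlus, 0, x)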
diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp
index 1a3fcf792368..e0f63046baa2 100644
--- a/Source/ComputationNetworkLib/ComputationNode.cpp
+++ b/Source/ComputationNetworkLib/ComputationNode.cpp
@@ -18,6 +18,59 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
 using namespace std;
 
+// -----------------------------------------------------------------------
+// subroutines for evaluation
+// -----------------------------------------------------------------------
+
+template <class ElemType>
+void ComputationNode<ElemType>::Backprop(const FrameRange& fr, bool childrenInThisLoop, bool childrenInOuterLoop) /*override*/
+{
+    // Normally our gradient matrix was created as an input of another node.
+    // This does not happen though in the special case of a node inside a loop
+    // that no consumer outside depends on. Those might get topologically sorted
+    // after nodes that propagate outside of the loop, and thus, in the last
+    // time step of the sequence, have not yet received a gradient from a parent
+    // and thus may not have had their gradient matrices allocated.
+    if (m_needsGradient)
+        LazyZeroGradient(); // set gradient to 0 if this is the first time
+
+    if (fr.IsAllFrames() && IsPartOfLoop() && childrenInThisLoop)
+        LogicError("%ls %ls operation: Backprop called with whole-batch FrameRange on node that participates in a loop", NodeName().c_str(), OperationName().c_str());
+
+    for (size_t i = 0; i < m_inputs.size(); i++)
+    {
+        ComputationNodePtr child = Input(i);
+        if (child->m_needsGradient &&
+            ((childrenInThisLoop  && child->IsPartOfLoop() == IsPartOfLoop()) ||
+             (childrenInOuterLoop && child->IsPartOfLoop() != IsPartOfLoop()) ))
+        {
+            // fprintf(stderr, "Backprop: %ls %ls operation -> child %d %ls %ls\n", NodeName().c_str(), OperationName().c_str(), (int)i, child->NodeName().c_str(), child->OperationName().c_str());
+            if (!m_needsGradient)
+                LogicError("%ls %ls operation has m_needsGradient set to false but children require it.", NodeName().c_str(), OperationName().c_str());
+#if DUMPOUTPUT
+            fprintf(stderr, "Backprop%d_%ls\n", i, NodeName().c_str());
+#endif
+            child->LazyZeroGradient(); // set gradient to 0 if this is the first time
+
+            // If we propagate from a loop to a node that is outside the loop, we are not efficient.
+            // This case is handled by SEQTraversalFlowControlNode::Backprop().
+            // The check below is to verify that.
+            if (IsPartOfLoop() && !child->IsPartOfLoop() && !fr.IsAllFrames())
+            {
+                LogicError("Backprop: Inefficiency: %ls %ls operation in loop propagates gradient to non-loop %ls %ls\n",
+                           NodeName().c_str(), OperationName().c_str(), child->NodeName().c_str(), child->OperationName().c_str());
+            }
+
+            // fprintf(stderr, "BackpropTo %d %d %ls %ls\n", (int)fr.timeIdxInSeq, (int)i, NodeName().c_str(), OperationName().c_str());
+            BackpropTo(i, fr); // this computes partial wrt to the child and sums the gradient value in the child
+        }
+#ifdef DISPLAY_DEBUG
+        else
+            fprintf(stderr, "    [%lu]: %s(%s) (no gradient needed so don't compute for)\n", i, child->OperationName().c_str(), child->NodeName().c_str());
+#endif
+    }
+}
+
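The comment at the top of Backprop() describes a lazy-allocation contract: a gradient matrix is created and zeroed only on first use, so a node that never receives a parent gradient still ends up with a valid all-zero gradient. The pattern in isolation, with stand-in types rather than the actual CNTK classes:

    #include <cstddef>
    #include <vector>

    struct Buffer
    {
        size_t rows = 0, cols = 0;
        std::vector<float> data;
        void ResizeAndZero(size_t r, size_t c) { rows = r; cols = c; data.assign(r * c, 0.0f); }
    };

    struct NodeSketch
    {
        bool m_gradientInitialized = false;
        Buffer m_value, m_gradient;

        // allocate and zero the gradient only when first needed; safe to call repeatedly
        void LazyZeroGradient()
        {
            if (m_gradientInitialized)
                return;
            m_gradient.ResizeAndZero(m_value.rows, m_value.cols); // gradient matches the value's dimensions
            m_gradientInitialized = true;
        }
    };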
 // -----------------------------------------------------------------------
 // subroutines for Validate() implementations
 // -----------------------------------------------------------------------
@@ -116,10 +169,14 @@ void ComputationNodeBase::ValidateBinaryReduce(bool isFinalValidationPass)
     ComputationNodeBase::Validate(isFinalValidationPass);
     m_pMBLayout = nullptr; // this node does not hold mini-batch data
     ValidateInferBinaryInputDims();
-    if (isFinalValidationPass &&
-        !(Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout()) && // TODO: Do we need broadcasting for these cases?
-          (Input(0)->GetMBLayout() == Input(1)->GetMBLayout() || !Input(0)->HasMBLayout() || !Input(1)->HasMBLayout())))
-        LogicError("The Matrix dimensions or MB layout in the %ls %ls operation do not match.", NodeName().c_str(), OperationName().c_str());
+    if (isFinalValidationPass)
+    {
+        // inputs must have identical layouts and must be minibatch data
+        if (!(Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout())))
+            LogicError("The Matrix dimensions in the %ls %ls operation do not match.", NodeName().c_str(), OperationName().c_str());
+        if (Input(0)->GetMBLayout() != Input(1)->GetMBLayout() || !Input(0)->HasMBLayout() || !Input(1)->HasMBLayout())
+            LogicError("The MB layouts in the %ls %ls operation do not match.", NodeName().c_str(), OperationName().c_str());
+    }
     SetDims(TensorShape(1), false);
 }
diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h
index 976c83cd72db..ba0365d05dfd 100644
--- a/Source/ComputationNetworkLib/ComputationNode.h
+++ b/Source/ComputationNetworkLib/ComputationNode.h
@@ -1339,47 +1339,7 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot
 
     // this is the entry point from Network; while it will call virtual BackpropTo() into the actual node implementation
     // TODO: move to -Base (or -Network?)
-    void Backprop(const FrameRange& fr, bool childrenInThisLoop, bool childrenInOuterLoop) override
-    {
-        if (fr.IsAllFrames() && IsPartOfLoop() && childrenInThisLoop)
-            LogicError("%ls %ls operation: Backprop called with whole-batch FrameRange on node that participates in a loop", NodeName().c_str(), OperationName().c_str());
-
-        for (size_t i = 0; i < m_inputs.size(); i++)
-        {
-            ComputationNodePtr child = Input(i);
-            if (child->m_needsGradient &&
-                (childrenInThisLoop && child->IsPartOfLoop() == IsPartOfLoop() ||
-                 childrenInOuterLoop && child->IsPartOfLoop() != IsPartOfLoop()))
-            {
-                // fprintf(stderr, "Backprop: %ls %ls operation -> child %d %ls %ls\n", NodeName().c_str(), OperationName().c_str(), (int)i, child->NodeName().c_str(), child->OperationName().c_str());
-                if (!m_needsGradient)
-                    LogicError("%ls %ls operation has m_needsGradient set to false but children require it.", NodeName().c_str(), OperationName().c_str());
-#ifdef DISPLAY_DEBUG
-                fprintf(stderr, "    [%lu]: %ls(%ls)\n", i, child->OperationName().c_str(), child->NodeName().c_str());
-#endif
-#if DUMPOUTPUT
-                fprintf(stderr, "Backprop%d_%ls\n", i, NodeName().c_str());
-#endif
-                child->LazyZeroGradient(); // set gradient to 0 if this is the first time
-
-                // If we propagate from a loop to a node that is outside the loop, we are not efficient.
-                // This case is handled by SEQTraversalFlowControlNode::Backprop().
-                // The check below is to verify that.
-                if (IsPartOfLoop() && !child->IsPartOfLoop() && !fr.IsAllFrames())
-                {
-                    LogicError("Backprop: Inefficiency: %ls %ls operation in loop propagates gradient to non-loop %ls %ls\n",
-                               NodeName().c_str(), OperationName().c_str(), child->NodeName().c_str(), child->OperationName().c_str());
-                }
-
-                // fprintf(stderr, "BackpropTo %d %d %ls %ls\n", (int)fr.timeIdxInSeq, (int)i, NodeName().c_str(), OperationName().c_str());
-                BackpropTo(i, fr); // this computes partial wrt to the child and sums the gradient value in the child
-            }
-#ifdef DISPLAY_DEBUG
-            else
-                fprintf(stderr, "    [%lu]: %s(%s) (no gradient needed so don't compute for)\n", i, child->OperationName().c_str(), child->NodeName().c_str());
-#endif
-        }
-    }
+    void Backprop(const FrameRange& fr, bool childrenInThisLoop, bool childrenInOuterLoop) override;
 
     // TODO: why of the inputs, and not the node itself?
     void /*ComputationNodeBase::*/ ZeroGradientsOfInputs() override // clears the lazy-init flags (LazyZeroGradient() actually clears the values lazily)
diff --git a/Source/ComputationNetworkLib/NonlinearityNodes.h b/Source/ComputationNetworkLib/NonlinearityNodes.h
index a2cd5421d631..a6f8f6e2cb3d 100644
--- a/Source/ComputationNetworkLib/NonlinearityNodes.h
+++ b/Source/ComputationNetworkLib/NonlinearityNodes.h
@@ -369,25 +369,18 @@ class HardmaxNode : public SoftmaxNodeBase<ElemType> /*ComputationNode<ElemType>*/
 
     /*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues) override
     {
-        gradient;
-        inputFunctionValues;
-        inputGradientValues;
-        gradientValues;
-        LogicError("Hardmax is not differentiable and is used for evaluation only.");
+        gradient; inputFunctionValues; inputGradientValues; gradientValues;
+        // Hardmax cannot back-propagate a gradient.
+        // We must not forbid this function to be called, though, since Hardmax may be running
+        // as part of a recurrent decoding loop. Sequence-to-sequence models run the Hardmax
+        // node inside the training without back-propagating into it.
     }
 
-    virtual bool OutputUsedInComputingInputNodesGradients() const override
-    {
-        return false;
-    }
-    virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override
-    {
-        return false;
-    }
+    virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
+    virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
 
     /*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
     {
-        // TODO: temp solution, we need to write a math function specifically for this
         functionValues.AssignHardmaxOf(inputFunctionValues, true);
     }
 };
@@ -395,4 +388,4 @@
 template class HardmaxNode<float>;
 template class HardmaxNode<double>;
 
-} } }
+}}}
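AssignHardmaxOf() used by ForwardPropV() above replaces each column with a one-hot encoding of its maximum. A column-major sketch of that operation (illustrative, not the CNTK math library; the >= comparison keeps the last maximum on ties, mirroring the max-finding loop elsewhere in this series):

    #include <cstddef>

    // one-hot encode the maximum of each column, in place; column-major layout
    void Hardmax(float* data, size_t rows, size_t cols)
    {
        for (size_t j = 0; j < cols; j++)
        {
            float* col = data + j * rows;
            size_t maxI = 0;
            for (size_t i = 1; i < rows; i++)
                if (col[i] >= col[maxI]) // >= keeps the last maximum on ties
                    maxI = i;
            for (size_t i = 0; i < rows; i++)
                col[i] = (i == maxI) ? 1.0f : 0.0f;
        }
    }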
diff --git a/Source/ComputationNetworkLib/RecurrentNodes.h b/Source/ComputationNetworkLib/RecurrentNodes.h
index feb7e485a20b..caa557b570d8 100644
--- a/Source/ComputationNetworkLib/RecurrentNodes.h
+++ b/Source/ComputationNetworkLib/RecurrentNodes.h
@@ -149,8 +149,11 @@ class DelayedValueNodeBase : public ComputationNode<ElemType>, public IRecurrent
         Base::Save(fstream);
 
         fstream << m_timeStep;
-        size_t colsDummy = 0;
-        fstream << GetSampleMatrixNumRows() << colsDummy; // #rows saved for legacy file format
+#if CURRENT_CNTK_MODEL_VERSION > CNTK_MODEL_VERSION_3
+        m_sampleLayout.Save(fstream);
+#else
+        fstream << (size_t)0 << (size_t)0; // used to be (rows,cols); no need since inferred in Validate(), and wrong for non-matrix tensors
+#endif
 
         fstream << m_initialActivationValue;
     }
@@ -162,14 +165,21 @@ class DelayedValueNodeBase : public ComputationNode<ElemType>, public IRecurrent
 
         fstream >> m_timeStep;
 
-        size_t rows, colsDummy;
-        fstream >> rows >> colsDummy;
+        if (modelVersion > CNTK_MODEL_VERSION_3)
+        {
+            TensorShape sampleLayout;
+            sampleLayout.Load(fstream);
+            SetDims(sampleLayout, HasMBLayout() /*may be true on reload (roll-back)*/);
+        }
+        else
+        {
+            size_t rows, colsDummy;
+            fstream >> rows >> colsDummy;
 
-        // BUGBUG: I got an error in when reloading persistent parameterse for a model that had dimension specified as 0, which did not get re-inferred correctly.
-        // We should either simply not write this parameter out at all (since it can always be inferred), or write the tensor shape.
-        if (GetSampleLayout().GetNumElements() != rows) // legacy format: if #rows matches then assume current tensor shape is up to date
-            SetDims(TensorShape(rows), HasMBLayout() /*may be true on reload (roll-back)*/); // tensor shape will be overwritten in Validate() --TODO: We should serialize it here.
-        m_delayedValue.Resize(rows, 0); // Note: If we try to access history in first minibatch, we shall crash. It would be a consequence of a missing sentence-begin flag
+            if (rows != 0 && GetSampleLayout().GetNumElements() != rows) // legacy format: if #rows matches then assume current tensor shape is up to date
+                SetDims(TensorShape(rows), HasMBLayout() /*may be true on reload (roll-back)*/); // tensor shape will be overwritten in Validate() --TODO: We should serialize it here.
+        }
+        m_delayedValue.Resize(m_sampleLayout.GetNumElements(), 0); // Note: If we try to access history in first minibatch, we shall crash. It would be a consequence of a missing sentence-begin flag
 
         if (modelVersion >= CNTK_MODEL_VERSION_2)
             fstream >> m_initialActivationValue;
@@ -1171,4 +1181,4 @@ class ShiftNode : public ComputationNode<ElemType>, public IRecurrentNode, publi
 
 #endif
 
-} } }
+}}}
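The Save/Load pair above is a standard version-gated format change: version-4 writers emit the full tensor shape, while the legacy branch still parses the old (rows, cols) pair. The same pattern reduced to a sketch with stand-in stream types and a flat shape vector:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    const uint32_t kVersion3 = 3; // legacy format stored (rows, cols); version 4 stores the shape

    void SaveShape(std::ostream& out, const std::vector<uint32_t>& shape)
    {
        uint32_t rank = (uint32_t)shape.size();
        out.write((const char*)&rank, sizeof(rank));
        out.write((const char*)shape.data(), rank * sizeof(uint32_t));
    }

    std::vector<uint32_t> LoadShape(std::istream& in, uint32_t fileVersion)
    {
        std::vector<uint32_t> shape;
        if (fileVersion > kVersion3) // new format: full tensor shape
        {
            uint32_t rank = 0;
            in.read((char*)&rank, sizeof(rank));
            shape.resize(rank);
            in.read((char*)shape.data(), rank * sizeof(uint32_t));
        }
        else // legacy format: (rows, cols); cols was never meaningful here
        {
            uint32_t rows = 0, cols = 0;
            in.read((char*)&rows, sizeof(rows));
            in.read((char*)&cols, sizeof(cols));
            shape.assign(1, rows);
        }
        return shape;
    }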
diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h
index 7c186e9b421a..def5932c48a8 100644
--- a/Source/ComputationNetworkLib/ReshapingNodes.h
+++ b/Source/ComputationNetworkLib/ReshapingNodes.h
@@ -196,7 +196,7 @@ class ReconcileMBLayoutNode : public ComputationNode<ElemType>, public NumInputs<2>
             // enforce compatibility of 'dataInput' with 'layoutInput'
             // TODO: how to deal with boundary flags?
             if (*m_pMBLayout != *Input(0)->GetMBLayout()) // this does a deep value-level comparison
-                InvalidArgument("%ls %ls operation discovered that %ls %ls operation produced an MB layout that is incompaitble with that of %ls %ls.",
+                InvalidArgument("%ls %ls operation discovered that %ls %ls operation produced an MB layout that is incompatible with that of %ls %ls.",
                                 NodeName().c_str(), OperationName().c_str(), Input(0)->NodeName().c_str(), Input(0)->OperationName().c_str(), Input(1)->NodeName().c_str(), Input(1)->OperationName().c_str());

From b2f9efb735257211b0d84fcd63548aeaf9b58ad0 Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Wed, 16 Mar 2016 08:29:43 -0700
Subject: [PATCH 19/26] bug fix: Gather/Scatter-related nodes must pass actual
 HasMBLayout() flag during validation

---
 Source/ComputationNetworkLib/ReshapingNodes.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Source/ComputationNetworkLib/ReshapingNodes.cpp b/Source/ComputationNetworkLib/ReshapingNodes.cpp
index d2860fda66c6..5c2afc243f10 100644
--- a/Source/ComputationNetworkLib/ReshapingNodes.cpp
+++ b/Source/ComputationNetworkLib/ReshapingNodes.cpp
@@ -172,7 +172,7 @@ template <class ElemType>
     if (isFinalValidationPass && Input(INDEXDATA)->GetSampleLayout().GetNumElements() != 1)
         InvalidArgument("%ls %ls operation requires the second argument (indexData) to be a scalar sequence.", NodeName().c_str(), OperationName().c_str());
 
-    SetDims(Input(INDEXDATA));
+    SetDims(Input(INDEXDATA)->GetSampleLayout(), HasMBLayout());
 }
 
 template class PackedIndexNode<float>;
 template class PackedIndexNode<double>;
@@ -218,7 +218,7 @@ template <class ElemType>
         InvalidArgument("%ls %ls operation requires the first argument (indexData) to be a scalar sequence.", NodeName().c_str(), OperationName().c_str());
 
     // inherit tensor dimension from sourceData
-    SetDims(Input(SOURCEDATA));
+    SetDims(Input(SOURCEDATA)->GetSampleLayout(), HasMBLayout());
 }
 
 template class GatherPackedNode<float>;
 template class GatherPackedNode<double>;
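PackedIndexNode and GatherPackedNode above implement the Where/Gather pipeline from CNTK.core.bs: a condition sequence yields the positions of its non-zero entries, and gathering keeps just those frames; ScatterPacked is the inverse. Flattened to plain vectors, ignoring MBLayout packing, the semantics are roughly:

    #include <cstddef>
    #include <vector>

    // Where: indices of the non-zero entries of a condition sequence
    std::vector<size_t> Where(const std::vector<float>& cond)
    {
        std::vector<size_t> idx;
        for (size_t t = 0; t < cond.size(); t++)
            if (cond[t] != 0)
                idx.push_back(t);
        return idx;
    }

    // GatherPacked: keep only the selected frames, in order
    template <class T>
    std::vector<T> Gather(const std::vector<size_t>& idx, const std::vector<T>& x)
    {
        std::vector<T> out;
        out.reserve(idx.size());
        for (size_t i : idx)
            out.push_back(x[i]);
        return out;
    }

    // ScatterPacked: inverse mapping; unselected positions keep a default value
    template <class T>
    std::vector<T> Scatter(const std::vector<size_t>& idx, const std::vector<T>& y, size_t outputLen)
    {
        std::vector<T> out(outputLen, T());
        for (size_t i = 0; i < idx.size(); i++)
            out[idx[i]] = y[i];
        return out;
    }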
@@ -268,7 +268,7 @@ template <class ElemType>
     // TODO: We also know that indexData and sourceData must have the same MBLayout. But that is checked at runtime.
 
     // inherit tensor dimension from sourceData
-    SetDims(Input(SOURCEDATA));
+    SetDims(Input(SOURCEDATA)->GetSampleLayout(), HasMBLayout());
 }
 
 template class ScatterPackedNode<float>;
 template class ScatterPackedNode<double>;

From b499cc9752d791780ba28590103f49aa164f4a7c Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Wed, 16 Mar 2016 10:03:16 -0700
Subject: [PATCH 20/26] test of Backprop change

---
 Source/ComputationNetworkLib/ComputationNode.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp
index 9d0282875803..955fee00752f 100644
--- a/Source/ComputationNetworkLib/ComputationNode.cpp
+++ b/Source/ComputationNetworkLib/ComputationNode.cpp
@@ -31,8 +31,8 @@ void ComputationNode<ElemType>::Backprop(const FrameRange& fr, bool childrenInTh
     // after nodes that propagate outside of the loop, and thus, in the last
     // time step of the sequence, have not yet received a gradient from a parent
     // and thus may not have had their gradient matrices allocated.
-    if (m_needsGradient)
-        LazyZeroGradient(); // set gradient to 0 if this is the first time
+    //if (m_needsGradient)
+    //    LazyZeroGradient(); // set gradient to 0 if this is the first time
 
     if (fr.IsAllFrames() && IsPartOfLoop() && childrenInThisLoop)
         LogicError("%ls %ls operation: Backprop called with whole-batch FrameRange on node that participates in a loop", NodeName().c_str(), OperationName().c_str());

From 36566d1bf7d6ba6de33334fe2d5ee78b0cb6c5fb Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Wed, 16 Mar 2016 14:20:14 -0700
Subject: [PATCH 21/26] BrainScriptNetworkBuilder now accepts any BS
 expression, instead of just a config record for ComputationNetwork. This
 makes it possible to use 'include' for network definitions and will enable
 network editing on the fly.

---
 Source/ActionsLib/NetworkFactory.cpp          | 28 +++++++++++++++----
 .../CNTK/BrainScript/BrainScriptEvaluator.cpp |  2 +-
 Source/CNTK/BrainScript/BrainScriptParser.cpp | 21 ++++----------
 Source/CNTK/BrainScript/BrainScriptParser.h   |  1 +
 4 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/Source/ActionsLib/NetworkFactory.cpp b/Source/ActionsLib/NetworkFactory.cpp
index d02b869829da..5fb4a02a11b5 100644
--- a/Source/ActionsLib/NetworkFactory.cpp
+++ b/Source/ActionsLib/NetworkFactory.cpp
@@ -75,12 +75,28 @@ function GetNetworkFactory(const ConfigRec
     // We interface with outer old CNTK config by taking the inner part, which we get as a string, as BrainScript.
     // We prepend a few standard definitions, and also definition of deviceId and precision, which all objects will pull out again when they are being constructed.
     // BUGBUG: We are not getting TextLocations right in this way! Do we need to inject location markers into the source? Moot once we fully switch to BS
-    wstring sourceCode = config.Exists(L"BrainScriptNetworkBuilder") ? config(L"BrainScriptNetworkBuilder") : config(L"ExperimentalNetworkBuilder");
-    auto configDirs = ConfigParameters::GetBrainScriptNetworkBuilderIncludePaths();
-    let expr = BS::ParseConfigDictFromString(L"include \'cntk.core.bs\'" // Note: Using lowercase here to match the Linux name of the CNTK exe.
-                                             + msra::strfun::wstrprintf(L"deviceId = %d ; precision = '%ls' ; network = new ComputationNetwork ", (int)deviceId, ElemTypeName<ElemType>())
-                                             + sourceCode, // source code has the form [ ... ] with brackets in the string
] with brackets in the string - move(configDirs)); // set include paths to all paths that configs were read from; no additional configurable include paths are supported by BrainScriptNetworkBuilder + wstring sourceOfNetwork = config.Exists(L"BrainScriptNetworkBuilder") ? config(L"BrainScriptNetworkBuilder") : config(L"ExperimentalNetworkBuilder"); + if (sourceOfNetwork.find_first_of(L"([") != 0) + InvalidArgument("BrainScript network description must be either a BS expression in ( ) or a config record in [ ]"); + + // set the include paths to all paths that configs were read from; no additional configurable include paths are supported by BrainScriptNetworkBuilder + auto includePaths = ConfigParameters::GetBrainScriptNetworkBuilderIncludePaths(); + + // inject additional items into the source code + // We support two ways of specifying the network in BrainScript: + // - BrainScriptNetworkBuilder = ( any BS expression that evaluates to a ComputationNetwork ) + // - BrainScriptNetworkBuilder = [ constructor parameters for a ComputationNetwork ] + if (sourceOfNetwork[0] == '[') // if [ ] form then we turn it into ComputationNetwork by constructing a ComputationNetwork from it + sourceOfNetwork = L"new ComputationNetwork " + sourceOfNetwork; + let sourceOfBS = msra::strfun::wstrprintf(L"include \'cntk.core.bs\'\n" // include our core lib. Note: Using lowercase here to match the Linux name of the CNTK exe. + L"deviceId = %d\n" // deviceId as passed in + L"precision = '%ls'\n" // 'float' or 'double' + L"network = %ls", // source code of expression that evaluates to a ComputationNetwork + (int)deviceId, ElemTypeName(), sourceOfNetwork.c_str()); + let expr = BS::ParseConfigDictFromString(sourceOfBS, move(includePaths)); + + // the rest is done in a lambda that is only evaluated when a virgin network is needed + // Note that evaluating the BrainScript *is* instantiating the network, so the evaluate call must be inside the lambda. return [expr](DEVICEID_TYPE /*deviceId*/) { // evaluate the parse tree, particularly the top-level field 'network' diff --git a/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp b/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp index d333c1c679eb..5f08733f45ad 100644 --- a/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp +++ b/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp @@ -768,7 +768,7 @@ void Do(ExpressionPtr e) shared_ptr EvaluateField(ExpressionPtr e, const wstring &id) { - return RecordLookup(e, id, e->location, nullptr /*scope for evaluating 'e'*/, L"$"); // we evaluate the member 'do' + return RecordLookup(e, id, e->location, nullptr /*scope for evaluating 'e'*/, L""); // we evaluate the member 'do' } ConfigValuePtr Evaluate(ExpressionPtr e) diff --git a/Source/CNTK/BrainScript/BrainScriptParser.cpp b/Source/CNTK/BrainScript/BrainScriptParser.cpp index dab31e32b782..42489c0a5f57 100644 --- a/Source/CNTK/BrainScript/BrainScriptParser.cpp +++ b/Source/CNTK/BrainScript/BrainScriptParser.cpp @@ -651,24 +651,13 @@ class Parser : public Lexer : Lexer(move(includePaths)) { infixPrecedence = map{ - {L".", 100}, {L"[", 100}, {L"(", 100}, // also sort-of infix operands... - {L"*", 10}, - {L"/", 10}, - {L".*", 10}, - {L"**", 10}, - {L"%", 10}, - {L"+", 9}, - {L"-", 9}, - {L"with", 9}, - {L"==", 8}, - {L"!=", 8}, - {L"<", 8}, - {L"<=", 8}, - {L">", 8}, - {L">=", 8}, + {L".", 99}, {L"[", 99}, {L"(", 99}, // also sort-of infix operands... 
+ {L"*", 10}, {L"/", 10}, {L".*", 10}, {L"**", 10}, {L"%", 10}, + {L"+", 9}, {L"-", 9}, {L"with", 9}, {L"==", 8}, + {L"!=", 8}, {L"<", 8}, {L"<=", 8}, {L">", 8}, {L">=", 8}, {L"&&", 7}, {L"||", 6}, - {L":", 5}, + {L":", 5}, {L"=>", 0}, }; SetSourceFile(move(sourceFile)); diff --git a/Source/CNTK/BrainScript/BrainScriptParser.h b/Source/CNTK/BrainScript/BrainScriptParser.h index 8e307ecca6fd..89dc12cdce73 100644 --- a/Source/CNTK/BrainScript/BrainScriptParser.h +++ b/Source/CNTK/BrainScript/BrainScriptParser.h @@ -135,6 +135,7 @@ typedef Expression::ExpressionPtr ExpressionPtr; // circumvent some circular def // access the parser through one of these functions ExpressionPtr ParseConfigDictFromString(wstring text, vector&& includePaths); // parses a list of dictionary members, returns a dictionary expression +// TODO: These rvalue references are no longer adding value, change to const<>& //ExpressionPtr ParseConfigDictFromFile(wstring path, vector includePaths); // likewise, but from a file path ExpressionPtr ParseConfigExpression(const wstring& sourceText, vector&& includePaths); // parses a single expression from sourceText, which is meant to contain an include statement, hence includePaths From c153a121ee50eccfadea3f91ec63017c5ca1040c Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Wed, 16 Mar 2016 17:07:29 -0700 Subject: [PATCH 22/26] PastValue now serializes tensor dimensions (required to bump the file-format version to 4) --- Source/ComputationNetworkLib/ComputationNode.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index ba0365d05dfd..69f9cd8215d0 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -32,7 +32,8 @@ #define CNTK_MODEL_VERSION_1 1 #define CNTK_MODEL_VERSION_2 2 #define CNTK_MODEL_VERSION_3 3 -#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_3 +#define CNTK_MODEL_VERSION_4 4 // PastValue +#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_4 extern bool g_shareNodeValueMatrices; From b371c5f0fee6cd161d29e4f2458799fe13dbdaba Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Wed, 16 Mar 2016 18:59:57 -0700 Subject: [PATCH 23/26] bug fix: DelayNode must save its dimensions, as inference does not work reliably otherwise; temporarily rolled back file-format version to test the above fix --- Source/ComputationNetworkLib/ComputationNode.h | 2 +- Source/ComputationNetworkLib/RecurrentNodes.h | 9 +++++---- Source/ComputationNetworkLib/ReshapingNodes.h | 13 +++++++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 69f9cd8215d0..b4a40e65f580 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -33,7 +33,7 @@ #define CNTK_MODEL_VERSION_2 2 #define CNTK_MODEL_VERSION_3 3 #define CNTK_MODEL_VERSION_4 4 // PastValue -#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_4 +#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_3 extern bool g_shareNodeValueMatrices; diff --git a/Source/ComputationNetworkLib/RecurrentNodes.h b/Source/ComputationNetworkLib/RecurrentNodes.h index caa557b570d8..11cd375a5521 100644 --- a/Source/ComputationNetworkLib/RecurrentNodes.h +++ b/Source/ComputationNetworkLib/RecurrentNodes.h @@ -152,7 +152,7 @@ class DelayedValueNodeBase : public ComputationNode, public IRecurrent #if
CURRENT_CNTK_MODEL_VERSION > CNTK_MODEL_VERSION_3 m_sampleLayout.Save(fstream); #else - fstream << (size_t)0 << (size_t)0; // used to be (rows,cols); no need since inferred in Validate(), and wrong for non-matrix tensors + fstream << GetSampleLayout().GetNumElements() << (size_t)0; // used to be (rows,cols); no need since inferred in Validate(), and wrong for non-matrix tensors #endif fstream << m_initialActivationValue; @@ -175,9 +175,10 @@ class DelayedValueNodeBase : public ComputationNode, public IRecurrent { size_t rows, colsDummy; fstream >> rows >> colsDummy; - - if (rows != 0 && GetSampleLayout().GetNumElements() != rows) // legacy format: if #rows matches then assume current tensor shape is up to date - SetDims(TensorShape(rows), HasMBLayout() /*may be true on reload (roll-back)*/); // tensor shape will be overwritten in Validate() --TODO: We should serialize it here. + // legacy format: if #rows matches then assume current tensor shape is up to date + // BUGBUG: This fails for non-column tensors. It should be sufficient to set + // these to 0 and rely on Validate(), but some unknown nodes in the loop don't do that right. + SetDims(TensorShape(rows), HasMBLayout() /*may be true on reload (roll-back)*/); // tensor shape will be overwritten in Validate() } m_delayedValue.Resize(m_sampleLayout.GetNumElements(), 0); // Note: If we try to access history in first minibatch, we shall crash. It would be a consequence of a missing sentence-begin flag diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h index def5932c48a8..055c09df6df9 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.h +++ b/Source/ComputationNetworkLib/ReshapingNodes.h @@ -565,6 +565,19 @@ template class RowRepeatNode; // The result will have a different MBLayout reflecting the shortened result sequences. // ----------------------------------------------------------------------- +/* Notes on Where(), PackedIndex(), and Gather-/ScatterPacked(): +This is one of the few nodes that creates new MBLayouts inside this system. +This node is meant to operate jointly with PackedIndexNode. +The difference between Index and PackedIndex is that Index is in human-readable +form referring to indices WITHIN a sequence (since NDL and BS only talk about individual +sequences and never expose anything cross-sequence, except for aggregates like CE or BN). +PackedIndex maps that to the internal lookup table that has strides resolved etc. +The reason that PackedIndex is separate from Gather/ScatterPacked is that the GPU has no +access to the STL-heavy MBLayout. So PackedIndex packs the relevant information from +the MBLayout into a GPU object that then drives the memory-copy operations in Gather() +and Scatter().
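As a hedged illustration of that packed-gather idea (toy code with made-up names, not the actual CNTK kernels): once PackedIndex has resolved the sequence-relative indices into flat column positions, the copy loop itself needs no MBLayout at all and maps directly onto a GPU memory-copy kernel.

    #include <cstddef>

    // out(:,j) = in(:,packedIdx[j]) for column-major matrices; packedIdx was
    // precomputed on the CPU from the MBLayout, so this loop is layout-free.
    void GatherPackedToy(const float* in, std::size_t rows,
                         const std::size_t* packedIdx, std::size_t outCols, float* out)
    {
        for (std::size_t j = 0; j < outCols; j++)
            for (std::size_t i = 0; i < rows; i++)
                out[i + j * rows] = in[i + packedIdx[j] * rows];
    }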
+*/ + template class WhereNode : public ComputationNodeNonLooping, public NumInputs<1> { From 577b748634299d191a6436c45d5eedf2ecd4dc15 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Wed, 16 Mar 2016 19:03:19 -0700 Subject: [PATCH 24/26] bumped model version back up to 4 --- Source/ComputationNetworkLib/ComputationNode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index b4a40e65f580..69f9cd8215d0 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -33,7 +33,7 @@ #define CNTK_MODEL_VERSION_2 2 #define CNTK_MODEL_VERSION_3 3 #define CNTK_MODEL_VERSION_4 4 // PastValue -#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_3 +#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_4 extern bool g_shareNodeValueMatrices; From 95878a616322a8fdd8e801e2c9a3c1dfebe26304 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 17 Mar 2016 12:55:49 -0700 Subject: [PATCH 25/26] (minor comment edits) --- Source/ComputationNetworkLib/LinearAlgebraNodes.h | 8 +++++--- Source/ComputationNetworkLib/ReshapingNodes.h | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Source/ComputationNetworkLib/LinearAlgebraNodes.h b/Source/ComputationNetworkLib/LinearAlgebraNodes.h index bf4b9f22e461..56a471273b8d 100644 --- a/Source/ComputationNetworkLib/LinearAlgebraNodes.h +++ b/Source/ComputationNetworkLib/LinearAlgebraNodes.h @@ -389,6 +389,9 @@ template class TimesNode; // Right operand and output can have MB layout, while left operand cannot. // This differs from TimesNode in that A is transposed, where A must be a // rank-1 or rank-2 tensor. +// A common use of transposition is trace(X'X) where X is a matrix of samples. +// This can NOT be implemented with this node. Instead, use +// SumColumnElements (ElementTimes (X, X)) // ----------------------------------------------------------------------- template @@ -698,7 +701,7 @@ template class SumColumnElementsNode; template class SumColumnElementsNode; // ----------------------------------------------------------------------- -// TransposeDimensionsNode (input, dim1, dim2) +// TransposeDimensions (input, dim1, dim2) // - swaps index dimensions dim1 and dim2. The values are 1-based; 1 stands for the leading dimension. // - new dimensions can be created; e.g. a column vector can be transposed into a row vector, which is a [1 x N] tensor // - transposing into the time dimension is currently not supported @@ -710,8 +713,7 @@ template class SumColumnElementsNode; template class TransposeDimensionsNode : public ComputationNode /*ComputationNode*/, public NumInputs<1> { - typedef ComputationNode Base; - UsingComputationNodeMembersBoilerplate; + typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; static const std::wstring TypeName() { return L"TransposeDimensions"; } public: diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h index 055c09df6df9..7722a283a4e6 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.h +++ b/Source/ComputationNetworkLib/ReshapingNodes.h @@ -334,7 +334,7 @@ template class RowSliceNode; template class RowSliceNode; // ----------------------------------------------------------------------- -// RowStackNode (input0, input1, ...) +// RowStack (input0, input1, ...) // stacks multiple inputs on top of each other // The inputs will be spliced w.r.t. 
their first tensor dimension (the "row" dimension). // TODO: This is very close to the planned SpliceNode (just make m_spliceDim actually configurable) except for splicing along time. From 66d7aab044f93b607d0ad767b887dc78c46256a3 Mon Sep 17 00:00:00 2001 From: Marko Radmilac Date: Thu, 17 Mar 2016 14:48:33 -0700 Subject: [PATCH 26/26] Fix run-test permissions --- .../ParallelTraining/NoQuantization/DoublePrecision/run-test | 0 .../ParallelTraining/NoQuantization/SinglePrecision/run-test | 0 Tests/EndToEndTests/ModelExport/Model0/run-test | 0 Tests/EndToEndTests/ModelExport/Model1/run-test | 0 Tests/EndToEndTests/Speech/LSTM/Truncated-Kaldi/run-test | 0 5 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 Tests/EndToEndTests/CNTKTextFormatReader/ParallelTraining/NoQuantization/DoublePrecision/run-test mode change 100644 => 100755 Tests/EndToEndTests/CNTKTextFormatReader/ParallelTraining/NoQuantization/SinglePrecision/run-test mode change 100644 => 100755 Tests/EndToEndTests/ModelExport/Model0/run-test mode change 100644 => 100755 Tests/EndToEndTests/ModelExport/Model1/run-test mode change 100644 => 100755 Tests/EndToEndTests/Speech/LSTM/Truncated-Kaldi/run-test diff --git a/Tests/EndToEndTests/CNTKTextFormatReader/ParallelTraining/NoQuantization/DoublePrecision/run-test b/Tests/EndToEndTests/CNTKTextFormatReader/ParallelTraining/NoQuantization/DoublePrecision/run-test old mode 100644 new mode 100755 diff --git a/Tests/EndToEndTests/CNTKTextFormatReader/ParallelTraining/NoQuantization/SinglePrecision/run-test b/Tests/EndToEndTests/CNTKTextFormatReader/ParallelTraining/NoQuantization/SinglePrecision/run-test old mode 100644 new mode 100755 diff --git a/Tests/EndToEndTests/ModelExport/Model0/run-test b/Tests/EndToEndTests/ModelExport/Model0/run-test old mode 100644 new mode 100755 diff --git a/Tests/EndToEndTests/ModelExport/Model1/run-test b/Tests/EndToEndTests/ModelExport/Model1/run-test old mode 100644 new mode 100755 diff --git a/Tests/EndToEndTests/Speech/LSTM/Truncated-Kaldi/run-test b/Tests/EndToEndTests/Speech/LSTM/Truncated-Kaldi/run-test old mode 100644 new mode 100755
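To round off the RowStack description from the preceding patch: a hedged, self-contained sketch of splicing two column-major inputs along the first ("row") dimension. This is toy code under simplified assumptions, not the CNTK implementation.

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // output column j = [a(:,j); b(:,j)], all matrices column-major
    std::vector<float> RowStackToy(const std::vector<float>& a, std::size_t aRows,
                                   const std::vector<float>& b, std::size_t bRows, std::size_t cols)
    {
        const std::size_t outRows = aRows + bRows;
        std::vector<float> out(outRows * cols);
        for (std::size_t j = 0; j < cols; j++)
        {
            for (std::size_t i = 0; i < aRows; i++) out[i + j * outRows] = a[i + j * aRows];
            for (std::size_t i = 0; i < bRows; i++) out[aRows + i + j * outRows] = b[i + j * bRows];
        }
        return out;
    }

    int main()
    {
        std::vector<float> a = {1, 2, 3, 4}; // 2 x 2, columns (1,2) and (3,4)
        std::vector<float> b = {5, 6};       // 1 x 2, columns (5) and (6)
        auto c = RowStackToy(a, 2, b, 1, 2); // 3 x 2, columns (1,2,5) and (3,4,6)
        for (float v : c) std::printf("%g ", v); // prints: 1 2 5 3 4 6
        std::printf("\n");
        return 0;
    }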