From 30f57f2cc843e44476f25617526c9f1c3791bc17 Mon Sep 17 00:00:00 2001 From: Chris Basoglu Date: Fri, 26 Jun 2015 09:52:36 -0700 Subject: [PATCH 1/6] Add dynamic minibatch sizing. --- Common/Include/DataReader.h | 23 +- Common/Include/commandArgUtil.h | 46 +- .../lyx/CNTKBook_CNTK_Chapter.lyx | 96 ++++ MachineLearning/CNTK/SGD.h | 489 ++++++++++++++---- 4 files changed, 526 insertions(+), 128 deletions(-) diff --git a/Common/Include/DataReader.h b/Common/Include/DataReader.h index 79909331a1db..709b49f374b8 100644 --- a/Common/Include/DataReader.h +++ b/Common/Include/DataReader.h @@ -29,9 +29,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { -const size_t randomizeAuto = ((size_t)-1)>>2; // randomize range set automatically, parameter value for Init() -const size_t randomizeNone = 0; // don't randomize, parameter value for Init() -const size_t requestDataSize = randomizeAuto; // StartMinibatchLoop default parameter, sets number of requested frames equal to the number of frames in the dataset +// randomize range set automatically, parameter value for Init() +const size_t randomizeAuto = ((size_t) -1) >> 2; + +// don't randomize, parameter value for Init() +const size_t randomizeNone = 0; + +// StartMinibatchLoop default parameter, sets number of requested +// frames equal to the constant 3fffffffffffffff computed by ((size_t) -1) >> 2 above. +// We use this constant as a stand in for the total number of frames in the dataset. +const size_t requestDataSize = randomizeAuto; enum EndDataType { @@ -52,7 +59,7 @@ class DATAREADER_API IDataReader virtual void Init(const ConfigParameters& config) = 0; virtual void Destroy() = 0; - virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize) = 0; + virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize) = 0; virtual bool GetMinibatch(std::map*>& matrices) = 0; virtual size_t NumberSlicesInEachRecurrentIter() = 0; virtual void SetNbrSlicesEachRecurrentIter(const size_t) = 0; @@ -80,7 +87,7 @@ class DataReader : public IDataReader, protected Plugin typedef typename IDataReader::LabelType LabelType; typedef typename IDataReader::LabelIdType LabelIdType; private: - IDataReader *m_dataReader; // reader + IDataReader* m_dataReader; // reader // Init - Reader Initialize for multiple data sets // config - [in] configuration parameters for the datareader @@ -123,7 +130,7 @@ class DataReader : public IDataReader, protected Plugin // mbSize - [in] size of the minibatch (number of frames, etc.) // epoch - [in] epoch number for this loop // requestedEpochSamples - [in] number of samples to randomize, defaults to requestDataSize which uses the number of samples there are in the dataset - virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize); + virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize); // GetMinibatch - Get the next minibatch (features and labels) // matrices - [in] a map with named matrix types (i.e. 
'features', 'labels') mapped to the corresponing matrix, @@ -152,10 +159,10 @@ class DataReader : public IDataReader, protected Plugin // [out] size of buffer filled with data // recordStart - record to start reading from, defaults to zero (start of data) // returns: true if data remains to be read, false if the end of data was reached - virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0); + virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart = 0); virtual bool DataEnd(EndDataType endDataType); - void SetSentenceEndInBatch(std::vector &sentenceEnd); + void SetSentenceEndInBatch(std::vector& sentenceEnd); }; }}} diff --git a/Common/Include/commandArgUtil.h b/Common/Include/commandArgUtil.h index 507a4cb1240c..c54a65fcd393 100644 --- a/Common/Include/commandArgUtil.h +++ b/Common/Include/commandArgUtil.h @@ -612,6 +612,7 @@ class ConfigParser // pop out of content level contentLevel = false; } + if (quoteFound) { // skip the closing quote @@ -660,7 +661,7 @@ class ConfigParser std::string ReadConfigFiles(const std::string& filePaths); std::string ReadConfigFiles(const std::wstring& filePaths); std::string ResolveIncludeStatements(const std::string& configString, std::vector& resolvedConfigFiles); - void LoadConfigFile(const std::wstring & filePath); + void LoadConfigFile(const std::wstring& filePath); void LoadConfigFileAndResolveVariables(const std::wstring& filePath, const ConfigParameters& config); void LoadConfigFiles(const std::wstring& filePaths, const std::string* configStringToAppend = nullptr); @@ -873,17 +874,17 @@ class ConfigParameters: public ConfigParser, public ConfigDictionary } // Insert - insert an 'name=value' string into the dictionary - void Insert(const std::string &str) + void Insert(const std::string& str) { ParseValue(str, 0, str.length()); } - bool Exists(const std::wstring & name) const + bool Exists(const std::wstring& name) const { return Exists(msra::strfun::utf8(name)); } - bool Exists(const std::string & name) const + bool Exists(const std::string& name) const { if (find(name) != end()) { @@ -899,42 +900,42 @@ class ConfigParameters: public ConfigParser, public ConfigDictionary } // ExistsCurrent - check to see if a key exists in THIS config, don't check parent - bool ExistsCurrent(const std::string & name) const + bool ExistsCurrent(const std::string& name) const { return (find(name) != end()); } // dict(name, default) for strings - ConfigValue operator()(const std::wstring & name, - const wchar_t *defaultvalue) const + ConfigValue operator()(const std::wstring& name, + const wchar_t* defaultvalue) const { return operator()(msra::strfun::utf8(name), defaultvalue); } // dict(name, default) for strings - ConfigValue operator()(const std::string & name, - const wchar_t *defaultvalue) const + ConfigValue operator()(const std::string& name, + const wchar_t* defaultvalue) const { return operator()(name, msra::strfun::utf8(defaultvalue).c_str()); } // dict(name, default) for strings - ConfigValue operator()(const std::wstring & name, - const char *defaultvalue) const + ConfigValue operator()(const std::wstring& name, + const char* defaultvalue) const { return operator()(msra::strfun::utf8(name), defaultvalue); } // dict(name, default) for strings - ConfigValue operator()(const std::string & name, - const char *defaultvalue) const + ConfigValue operator()(const std::string& name, + const char* defaultvalue) const { 
ConfigValue value = Find(name, defaultvalue); return value; } - ConfigValue Find(const std::string & name, - const char *defaultvalue = NULL) const + ConfigValue Find(const std::string& name, + const char* defaultvalue = NULL) const { auto iter = find(name); ConfigValue result; @@ -975,10 +976,11 @@ class ConfigParameters: public ConfigParser, public ConfigDictionary // any whitespace characters. If an opening "$" is found without a closing "$", an exception is thrown. // configString - the string that you would like to resolve variables in. // returns: A copy of 'configString' with all the variables resolved. - std::string ResolveVariablesInSingleLine(const std::string &configLine) const + std::string ResolveVariablesInSingleLine(const std::string& configLine) const { // ensure that this method was called on a single line (eg, no newline characters exist in 'configLine'). - if (configLine.find_first_of("\n") != std::string::npos) { + if (configLine.find_first_of("\n") != std::string::npos) + { throw std::logic_error( "\"ResolveVariablesInSingleLine\" shouldn't be called with a string containing a newline character"); } @@ -1053,7 +1055,7 @@ class ConfigParameters: public ConfigParser, public ConfigDictionary // we shouldn't insert newlines where they didn't already exist. // configString - the string that you would like to resolve variables in. // returns: A copy of 'configString' with all the variables resolved. - std::string ResolveVariables(const std::string &configString) const + std::string ResolveVariables(const std::string& configString) const { std::string newConfigString; if (configString.find_first_of("\n") != std::string::npos) @@ -1347,14 +1349,14 @@ class argvector: public std::vector RuntimeError("argvector: invalid arg value"); } } - static void parse(const std::wstring & in, std::wstring & val) + static void parse(const std::wstring& in, std::wstring& val) { val = in; } public: // constructor --construct empty, then assign a wstring from command-line argument - void operator=(const std::wstring & arg) + void operator=(const std::wstring& arg) { clear(); // separate the arguments @@ -1387,7 +1389,7 @@ class argvector: public std::vector } // constructor --use this for setting default values - argvector(const std::wstring & arg) + argvector(const std::wstring& arg) { *this = arg; } @@ -1438,7 +1440,7 @@ class argvector: public std::vector } // we give full read access to the vector, so we can use it bounded as well - const std::vector & tovector() const + const std::vector& tovector() const { return *this; } diff --git a/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx b/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx index 9779be49f391..41ea99898cdf 100644 --- a/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx +++ b/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx @@ -1725,6 +1725,102 @@ numBestSearchEpoch \end_layout +\begin_layout Standard +Used in the Adaptive Minibatch Sizing mode. +\end_layout + +\begin_layout Itemize + +\emph on +numMiniBatch4LRSearch +\emph default + +\begin_inset Index idx +status open + +\begin_layout Plain Layout +numMiniBatch4LRSearch +\end_layout + +\end_inset + +: the number of minibatches used to search the minibatch size when +in adaptive minibatch size mode. + Default value is 500. + It's typically set to 10-20% of the total minibatches in an epoch +this is shared with the search for learning rate in +SearchBeforeEpoch mode. 
+
+\end_layout
+
+\begin_layout Itemize
+
+\emph on
+autoAdjustMinibatch
+\emph default
+
+\begin_inset Index idx
+status open
+
+\begin_layout Plain Layout
+autoAdjustMinibatch
+\end_layout
+
+\end_inset
+
+: enable or disable whether the minibatch size is adaptively adjusted.
+ Default value is false.
+Adaptive minibatch sizing begins in the epochs
+after the explicitly specified user minibatch sizes
+are exhausted. For example, if the user specified
+minibatchSize=256:1024, then 256 and 1024 are used
+in the first two epochs and adaptive minibatch
+sizing is used afterwards.
+
+\end_layout
+
+\begin_layout Itemize
+
+\emph on
+minibatchSizeTuningFrequency
+\emph default
+
+\begin_inset Index idx
+status open
+
+\begin_layout Plain Layout
+minibatchSizeTuningFrequency
+\end_layout
+
+\end_inset
+
+: the number of epochs to skip, on a periodic basis, before
+dynamically adjusting the minibatch size.
+ Default value is 1.
+
+\end_layout
+
+\begin_layout Itemize
+
+\emph on
+minibatchSizeTuningMax
+\emph default
+
+\begin_inset Index idx
+status open
+
+\begin_layout Plain Layout
+minibatchSizeTuningMax
+\end_layout
+
+\end_inset
+
+: the maximum size allowed for an
+adaptively adjusted minibatch size.
+ Default value is 1048576.
+
+\end_layout
+
 \end_deeper
 \begin_layout Subsubsection
 Gradient control
diff --git a/MachineLearning/CNTK/SGD.h b/MachineLearning/CNTK/SGD.h
index b91352f452d8..266b8331b929 100644
--- a/MachineLearning/CNTK/SGD.h
+++ b/MachineLearning/CNTK/SGD.h
@@ -126,7 +126,7 @@ typedef struct stGradientUpdateInfo
 } GradientUpdateInfo;
 
 template<class ElemType>
-class SGD: ComputationNetworkHelper<ElemType>
+class SGD : ComputationNetworkHelper<ElemType>
 {
 protected:
     typedef ComputationNetworkHelper<ElemType> B;
@@ -157,17 +157,31 @@ class SGD : ComputationNetworkHelper<ElemType>
         ElemType learnRateDecreaseFactor = configAALR("learnRateDecreaseFactor", "0.618");
         ElemType increaseLearnRateIfImproveMoreThan = configAALR("increaseLearnRateIfImproveMoreThan", "1#INF");
         ElemType learnRateIncreaseFactor = configAALR("learnRateIncreaseFactor", "1.382");
+
+        // AutoAdjust: adaptive minibatch sizing parameters
+        bool autoAdjustMinibatch = (bool) configAALR("autoAdjustMinibatch", "false");
+        size_t minibatchSizeTuningFrequency = configAALR("minibatchSizeTuningFrequency", "1");
+        size_t minibatchSizeTuningMax = configAALR("minibatchSizeTuningMax", "1048576");
+
+        // the number of minibatches used to search
+        // the learning rate. It's typically set to 10-20% of
+        // the total minibatches in an epoch.
         ConfigArray minibatch4LRSearch = configAALR("numMiniBatch4LRSearch", "500");
         intargvector numMiniBatch4LRSearch = minibatch4LRSearch;
+
         size_t numPrevLearnRates = configAALR("numPrevLearnRates", "5");
         size_t numBestSearchEpoch = configAALR("numBestSearchEpoch", "1");
         bool loadBestModel = configAALR("loadBestModel", "true");
 
         ConfigArray minibatchSize = configSGD("minibatchSize", "256");
         intargvector mbSize = minibatchSize;
+
+        // the number of samples in each epoch (0 means use all the samples in each epoch).
         size_t epochSize = configSGD("epochSize", "0");
 
+        // the total number of epochs to run.
size_t maxEpochs = configSGD("maxEpochs"); + ConfigArray momentumPerMBStr = configSGD("momentumPerMB", ""); floatargvector momentumPerMB = momentumPerMBStr; @@ -240,7 +254,8 @@ class SGD: ComputationNetworkHelper trainCriterionNodeName, evalCriterionNodeName, doGradientCheck, gradientCheckSigDigit, validateAfterModelReloading, rpi, learnRateAdjustInterval, UsingAllDataForPreComputedNode, - needAveMultiplier, L2RegWeight, L1RegWeight); + needAveMultiplier, L2RegWeight, L1RegWeight, + autoAdjustMinibatch, minibatchSizeTuningFrequency, minibatchSizeTuningMax); } void setMomentum(float momentum) @@ -287,15 +302,27 @@ class SGD: ComputationNetworkHelper const bool UsingAllDataForPreComputed = true, const bool needAveMultiplier = true, const ElemType L2RegWeight = 0, - const ElemType L1RegWeight = 0) + const ElemType L1RegWeight = 0, + const bool autoAdjustMinibatch = false, + const size_t minibatchSizeTuningFrequency = 1, + const size_t minibatchSizeTuningMax = 1048576) { m_numPrevLearnRates = numPrevLearnRates; + m_prevChosenMinibatchSize = 0; + m_autoAdjustMinibatch = autoAdjustMinibatch; + m_minibatchSizeTuningMax = minibatchSizeTuningMax; + m_minibatchSizeTuningFrequency = minibatchSizeTuningFrequency; + m_mbSize = mbSize; + + // the number of samples in each epoch (0 means, use all the samples in each epoch). m_epochSize = epochSize; if (m_epochSize == 0) { m_epochSize = requestDataSize; } + + // the total number of epochs to run. m_maxEpochs = maxEpochs; m_gradientClippingWithTruncation = gradientClippingWithTruncation; @@ -346,7 +373,8 @@ class SGD: ComputationNetworkHelper (learningRatesPerSample.size() == 0 && learningRatesPerMB.size() == 0)) { throw std::invalid_argument( - "If autoLearnRateSearchType is false you must specify the learningRatesPerSample or learningRatesPerMB parameter."); + "If autoLearnRateSearchType is false you must specify the " + "learningRatesPerSample or learningRatesPerMB parameter."); } if (learningRatesPerSample.size() > 0 && learningRatesPerMB.size() > 0) @@ -368,6 +396,7 @@ class SGD: ComputationNetworkHelper } m_needToNormalizeLRByParallUtterance = true; } + m_momentumPerMB = 0.9f; if (momentumPerMB.size() > 0) { @@ -550,8 +579,8 @@ class SGD: ComputationNetworkHelper IDataReader* trainSetDataReader, IDataReader* validationSetDataReader) { - std::vector & FeatureNodes = net.FeatureNodes(); - std::vector & labelNodes = net.LabelNodes(); + std::vector& FeatureNodes = net.FeatureNodes(); + std::vector& labelNodes = net.LabelNodes(); std::vector criterionNodes = GetTrainCriterionNodes(net); std::vector evaluationNodes = GetEvalCriterionNodes(net); @@ -610,6 +639,7 @@ class SGD: ComputationNetworkHelper size_t totalSamplesSeen = 0; ElemType learnRatePerSample = 0.5f / m_mbSize[startEpoch]; + float learningRateAdjustmentFactor = 1.0f; vector prevLearnRates; prevLearnRates.resize(m_numPrevLearnRates); for (int i = 0; i < m_numPrevLearnRates; i++) @@ -640,7 +670,7 @@ class SGD: ComputationNetworkHelper if (startEpoch > 0) { learnRateInitialized = LoadCheckPointInfo(startEpoch - 1, totalSamplesSeen, - learnRatePerSample, smoothedGradients, prevCriterion); + learnRatePerSample, smoothedGradients, prevCriterion, m_prevChosenMinibatchSize); if (learnRateInitialized) { prevLearnRates[startEpoch % m_numPrevLearnRates] = learnRatePerSample; @@ -653,7 +683,8 @@ class SGD: ComputationNetworkHelper !learnRateInitialized && m_learningRatesPerSample.size() <= startEpoch) { throw std::invalid_argument( - "When using \"AdjustAfterEpoch\", there must either exist a checkpoint 
file, or an explicit learning rate must be specified in config for the starting epoch."); + "When using \"AdjustAfterEpoch\", there must either exist a checkpoint file, " + "or an explicit learning rate must be specified in config for the starting epoch."); } unsigned long dropOutSeed = 1; @@ -667,17 +698,18 @@ class SGD: ComputationNetworkHelper SetMaxTempMemSizeForCNN(refNet, refNode, m_maxTempMemSizeInSamplesForCNN); } - for (int i = int(startEpoch); i < int(m_maxEpochs); i++) + for (int i = startEpoch; i < (int) m_maxEpochs; i++) { auto t_start_epoch = Timer::MilliSecondElapsed(); - //set dropout rate + // set dropout rate SetDropoutRate(net, criterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed); setMomentum(m_momentumInputPerMB[i]); - //learning rate adjustment - if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || (m_learningRatesPerSample.size() > 0 && m_learningRatesPerSample.size() > i)) + // learning rate adjustment + if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || + (m_learningRatesPerSample.size() > 0 && m_learningRatesPerSample.size() > i)) { learnRatePerSample = m_learningRatesPerSample[i]; } @@ -689,14 +721,16 @@ class SGD: ComputationNetworkHelper largestPrevLearnRatePerSample = max(largestPrevLearnRatePerSample, prevLearnRates[j]); } - //return a reasonable learning rate based on the initial mbsize - learnRatePerSample = SearchLearnRateBeforeEpoch(net, refNet, refNode, i, learnRatePerSample, - trainSetDataReader, FeatureNodes, labelNodes, - criterionNodes, evaluationNodes, inputMatrices, - learnableNodes, smoothedGradients, learnRateInitialized, - largestPrevLearnRatePerSample); + // return a reasonable learning rate based on the initial minibatchSize + ElemType newLearningRatePerSample = SearchForBestLearnRate(net, refNet, refNode, i, learnRatePerSample, + trainSetDataReader, FeatureNodes, labelNodes, + criterionNodes, evaluationNodes, inputMatrices, + learnableNodes, smoothedGradients, + learnRateInitialized, largestPrevLearnRatePerSample); + learningRateAdjustmentFactor = newLearningRatePerSample / learnRatePerSample; + learnRatePerSample = newLearningRatePerSample; - //save per sample learn rate to support changeable mbsize + // save per sample learn rate to support changeable minibatchSize prevLearnRates[i % m_numPrevLearnRates] = learnRatePerSample; } @@ -704,8 +738,7 @@ class SGD: ComputationNetworkHelper if (learnRatePerSample < m_minLearnRate) { - fprintf(stderr, - "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training stops.\n", + fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training stops.\n", i + 1, learnRatePerSample, m_minLearnRate); if (m_autoLearnRateSearchType != LearningRateSearchAlgorithm::None) { @@ -718,10 +751,40 @@ class SGD: ComputationNetworkHelper INT32 mySamples = (INT32) #endif fprintf(stderr, "Starting Epoch %d: learning rate per sample = %f momentum = %f \n", - i, learnRatePerSample, m_momentumPerMB); + i + 1, learnRatePerSample, m_momentumPerMB); + + size_t chosenMinibatchSize; + + // Through the command line or config file the user can set minibatch sizes on a per epoch + // basis for a set number of epochs. For epochs after that point, m_mbSize.size(), either + // we just keep using + // the last minibatch size, or we use tuning to try and find a better one. 
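+                // As an illustration (restating the example settings from the CNTK_Chapter
+                // docs above, not new behavior): with minibatchSize=256:1024 and
+                // autoAdjustMinibatch=true, epochs 1 and 2 use the explicit sizes 256 and
+                // 1024, and from epoch 3 on (i.e. once i >= m_mbSize.size()) the minibatch
+                // size is tuned adaptively as shown below.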
+ if (m_autoAdjustMinibatch && i >= m_mbSize.size()) + { + size_t numFramesToUseInSearch = m_numMiniBatch4LRSearch[i] * m_mbSize[i]; + if (m_epochSize != requestDataSize) + { + // ensure the numFramesToUseInSearch does not exceed the total number of frames in the epoch + numFramesToUseInSearch = min(numFramesToUseInSearch, m_epochSize); + } + + // Use tuning to try and find a better minibatch size + chosenMinibatchSize = AdaptiveMinibatchSizing(net, refNet, refNode, i, + numFramesToUseInSearch, + trainSetDataReader, learnRatePerSample, + m_mbSize[i], FeatureNodes, labelNodes, + criterionNodes, evaluationNodes, + inputMatrices, learnableNodes, + smoothedGradients, learningRateAdjustmentFactor); + } + else + { + // use the explicitly set minibatch size + chosenMinibatchSize = m_mbSize[i]; + } TrainOneEpoch(net, refNet, refNode, i, m_epochSize, - trainSetDataReader, learnRatePerSample, FeatureNodes, + trainSetDataReader, learnRatePerSample, chosenMinibatchSize, FeatureNodes, labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, epochCriterion, epochEvalErrors, totalSamplesSeen); @@ -730,23 +793,23 @@ class SGD: ComputationNetworkHelper ElemType epochTime = (t_end_epoch - t_start_epoch) / ElemType(MS_PER_SEC); fprintf(stderr, - "Finished Epoch[%d]: [Training Set] Train Loss Per Sample = %.8g ", + "Finished Epoch[%d]: [Training Set] TrainLossPerSample = %.8g; ", i + 1, epochCriterion); if (epochEvalErrors.size() == 1) { fprintf(stderr, - "EvalErr Per Sample = %.8g Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", + "EvalErrPerSample = %.8g; Ave LearnRatePerSample = %.10g; EpochTime=%.8g\n", epochEvalErrors[0], learnRatePerSample, epochTime); } else { - fprintf(stderr, "EvalErr Per Sample "); + fprintf(stderr, "EvalErrPerSample "); for (size_t j = 0; j < epochEvalErrors.size(); j++) { - fprintf(stderr, "[%lu]=%.8g ", j, epochEvalErrors[j]); + fprintf(stderr, "[%lu]=%.8g; ", j, epochEvalErrors[j]); } - fprintf(stderr, "Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", + fprintf(stderr, "Ave LearnRatePerSample = %.10g; Epoch Time=%.8g\n", learnRatePerSample, epochTime); fprintf(stderr, "Finished Epoch[%d]: Criterion Node [%ls] Per Sample = %.8g\n", i + 1, criterionNodes[0]->NodeName().c_str(), epochCriterion); @@ -810,7 +873,7 @@ class SGD: ComputationNetworkHelper cvSetTrainAndEvalNodes.push_back(evaluationNodes[0]->NodeName()); vector vScore = evalforvalidation.Evaluate(*validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i]); - fprintf(stderr, "Finished Epoch[%d]: [Validation Set] Train Loss Per Sample = %.8g EvalErr Per Sample = %.8g\n", + fprintf(stderr, "Finished Epoch[%d]: [Validation Set] TrainLossPerSample = %.8g; EvalErrPerSample = %.8g\n", i + 1, vScore[0], vScore[1]); epochCriterion = vScore[0]; //the first one is the training criterion. 
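For reference, here is a minimal, self-contained C++ sketch (not part of the patch; the bounds 256 and 4096 are assumed purely for illustration) of the trial-size schedule that AdaptiveMinibatchSizing() and SearchForBestMinibatchSize() below walk through: candidate sizes grow by a factor of sqrt(2) per step and are rounded to multiples of 64.

#include <cmath>
#include <cstddef>
#include <cstdio>

// Mirrors RoundToMultipleOf64() from this patch: round to the nearest multiple of 64.
static size_t RoundToMultipleOf64(float val)
{
    return 64 * (size_t) ((val + 32) / 64);
}

int main()
{
    const float minibatchSizeTuningFactor = sqrtf(2.0f); // growth factor used by the search
    const size_t minMinibatchSize = 256;  // assumed lower bound for this illustration
    const size_t maxMinibatchSize = 4096; // assumed upper bound (minibatchSizeTuningMax caps it)

    // Enumerate candidate sizes in the order the search would evaluate them:
    // 256, 384, 512, 704, 1024, ... up to (at most) maxMinibatchSize.
    for (float trialSize = (float) minMinibatchSize;
         trialSize <= maxMinibatchSize;
         trialSize *= minibatchSizeTuningFactor)
    {
        printf("trial minibatchSize = %d\n", (int) RoundToMultipleOf64(trialSize));
    }
    return 0;
}

In the actual code, each trial additionally runs TrainOneMiniEpochAndReloadModel() on numFramesToUseInSearch frames, and the loop stops as soon as a trial's epochCriterion degrades past the baseline criterion measured at the first trial.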
@@ -847,7 +910,7 @@ class SGD : ComputationNetworkHelper<ElemType>
                     net.ResetEvalTimeStamp();
 
                     LoadCheckPointInfo(i - 1, totalSamplesSeen, learnRatePerSample,
-                                       smoothedGradients, prevCriterion);
+                                       smoothedGradients, prevCriterion, m_prevChosenMinibatchSize);
                     fprintf(stderr, "Loaded the previous model which has better training criterion.\n");
                     loadedPrevModel = true;
                 }
@@ -911,7 +974,7 @@ class SGD : ComputationNetworkHelper<ElemType>
             if (mpiRank == 0)
             {
                 net.SaveToFile(GetModelNameForEpoch(i));
-                SaveCheckPointInfo(i, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion);
+                SaveCheckPointInfo(i, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion, chosenMinibatchSize);
                 if (!m_keepCheckPointFiles)
                 {
                     // delete the previous checkpoint file to save space
                 }
             }
 
-            if (learnRatePerSample < 1e-12) {
+            if (learnRatePerSample < 1e-12)
+            {
                 fprintf(stderr, "learnRate per sample is reduced to %.8g, which is below 1e-12. Stopping training.\n",
                         learnRatePerSample);
             }
@@ -938,8 +1002,7 @@ class SGD : ComputationNetworkHelper<ElemType>
     }
 
 protected:
-
-    //return true if precomputation is executed.
+    // return true if precomputation is executed.
     bool PreCompute(ComputationNetwork<ElemType>& net,
                     IDataReader<ElemType>* trainSetDataReader,
                     std::vector<ComputationNodePtr>& FeatureNodes,
@@ -965,7 +1028,8 @@ class SGD : ComputationNetworkHelper<ElemType>
         //trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, requestDataSize);
         //trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, m_epochSize); // only based on one epoch
         // [1/12/2015 erw] to support large datasets, we usually partition the whole dataset
         // into several epochs, so we need to use all the data to do the precomputation
-        if (m_useAllDataForPreComputedNode) {
+        if (m_useAllDataForPreComputedNode)
+        {
             // using all the data
             trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0);
         }
@@ -1000,21 +1064,21 @@ class SGD : ComputationNetworkHelper<ElemType>
         return true;
     }
 
-    //return a reasonable initial learning rate based on the initial mbsize
-    ElemType SearchLearnRateBeforeEpoch(ComputationNetwork<ElemType>& net,
-                                        ComputationNetwork<ElemType>& refNet,
-                                        const ComputationNodePtr refNode, const int epochNumber,
-                                        const ElemType curLearnRate,
-                                        IDataReader<ElemType>* trainSetDataReader,
-                                        const std::vector<ComputationNodePtr>& FeatureNodes,
-                                        const std::vector<ComputationNodePtr>& labelNodes,
-                                        const std::vector<ComputationNodePtr>& criterionNodes,
-                                        const std::vector<ComputationNodePtr>& evaluationNodes,
-                                        std::map<std::wstring, Matrix<ElemType>*>& inputMatrices,
-                                        const std::list<ComputationNodePtr>& learnableNodes,
-                                        std::list<Matrix<ElemType>>& smoothedGradients,
-                                        const bool learnRateInitialized,
-                                        const ElemType largestPrevLearnRatePerSample)
+    // return a reasonable initial learning rate based on the initial mbsize
+    ElemType SearchForBestLearnRate(ComputationNetwork<ElemType>& net,
+                                    ComputationNetwork<ElemType>& refNet,
+                                    const ComputationNodePtr refNode, const int epochNumber,
+                                    const ElemType curLearnRate,
+                                    IDataReader<ElemType>* trainSetDataReader,
+                                    const std::vector<ComputationNodePtr>& FeatureNodes,
+                                    const std::vector<ComputationNodePtr>& labelNodes,
+                                    const std::vector<ComputationNodePtr>& criterionNodes,
+                                    const std::vector<ComputationNodePtr>& evaluationNodes,
+                                    std::map<std::wstring, Matrix<ElemType>*>& inputMatrices,
+                                    const std::list<ComputationNodePtr>& learnableNodes,
+                                    std::list<Matrix<ElemType>>& smoothedGradients,
+                                    const bool learnRateInitialized,
+                                    const ElemType largestPrevLearnRatePerSample)
     {
         ElemType epochCriterion = std::numeric_limits<ElemType>::infinity();
         ElemType prevCriterion = std::numeric_limits<ElemType>::infinity();
@@ -1023,10 +1087,11 @@ class SGD : ComputationNetworkHelper<ElemType>
         size_t totalSamplesSeen = 0;
         ElemType bestLearnRatePerSample = curLearnRate;
 
-        size_t epochSize = m_numMiniBatch4LRSearch[epochNumber] * m_mbSize[epochNumber];
+        size_t numFramesToUseInSearch = m_numMiniBatch4LRSearch[epochNumber] *
m_mbSize[epochNumber]; if (m_epochSize != requestDataSize) { - epochSize = min(epochSize, m_epochSize); //use a small number minibatches to make decision + // ensure the numFramesToUseInSearch does not exceed the total number of frames in the epoch + numFramesToUseInSearch = min(numFramesToUseInSearch, m_epochSize); } ElemType baseCriterion; @@ -1045,17 +1110,18 @@ class SGD: ComputationNetworkHelper net.ResetEvalTimeStamp(); ElemType learnRate = learnRatePerSample; + size_t dummyMinibatchSize = 0; LoadCheckPointInfo(baseModelEpoch, totalSamplesSeen, learnRate, - smoothedGradients, prevCriterion); + smoothedGradients, prevCriterion, dummyMinibatchSize); - //if model is not changed this is what we will get + // if model is not changed this is what we will get TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, - epochSize, trainSetDataReader, 0, + numFramesToUseInSearch, trainSetDataReader, 0, m_mbSize[epochNumber], FeatureNodes, labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, baseCriterion, - epochEvalErrors, totalSamplesSeen); + epochEvalErrors, totalSamplesSeen, "BaseAdaptiveLearnRateSearch:"); if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch) { @@ -1068,7 +1134,7 @@ class SGD: ComputationNetworkHelper if (m_epochSize != requestDataSize) { - ratio = pow(((ElemType) epochSize) / m_epochSize, 1.0f / 2); + ratio = pow(((ElemType) numFramesToUseInSearch) / m_epochSize, 1.0f / 2); } baseCriterion = max(ratio * prevCriterion + (1 - ratio) * baseCriterion, baseCriterion); @@ -1078,13 +1144,13 @@ class SGD: ComputationNetworkHelper { learnRatePerSample *= 0.618f; TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, - epochSize, trainSetDataReader, - learnRatePerSample, FeatureNodes, + numFramesToUseInSearch, trainSetDataReader, + learnRatePerSample, m_mbSize[epochNumber], FeatureNodes, labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, epochCriterion, epochEvalErrors, - totalSamplesSeen); + totalSamplesSeen, "AdaptiveLearnRateSearch:"); } while (std::isnan(epochCriterion) || (epochCriterion > baseCriterion && learnRatePerSample > minLearnRate)); @@ -1099,13 +1165,13 @@ class SGD: ComputationNetworkHelper ElemType leftCriterion, rightCriterion = epochCriterion; TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, - epochSize, trainSetDataReader, - leftLearnRatePerSample, + numFramesToUseInSearch, trainSetDataReader, + leftLearnRatePerSample, m_mbSize[epochNumber], FeatureNodes, labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, leftCriterion, - epochEvalErrors, totalSamplesSeen); + epochEvalErrors, totalSamplesSeen, "DetailBaseAdaptiveLearnRateSearch:"); while (rightLearnRatePerSample > leftLearnRatePerSample * 1.2f) { @@ -1114,9 +1180,9 @@ class SGD: ComputationNetworkHelper rightLearnRatePerSample *= 0.618f; TrainOneMiniEpochAndReloadModel(net, refNet, refNode, - epochNumber, epochSize, + epochNumber, numFramesToUseInSearch, trainSetDataReader, - rightLearnRatePerSample, + rightLearnRatePerSample, m_mbSize[epochNumber], FeatureNodes, labelNodes, criterionNodes, evaluationNodes, @@ -1125,16 +1191,16 @@ class SGD: ComputationNetworkHelper smoothedGradients, rightCriterion, epochEvalErrors, - totalSamplesSeen); + totalSamplesSeen, "DetailRightAdaptiveLearnRateSearch:"); } else { leftLearnRatePerSample /= 0.618f; TrainOneMiniEpochAndReloadModel(net, refNet, refNode, - epochNumber, epochSize, + 
epochNumber, numFramesToUseInSearch,
                                                 trainSetDataReader,
-                                                leftLearnRatePerSample,
+                                                leftLearnRatePerSample, m_mbSize[epochNumber],
                                                 FeatureNodes,
                                                 labelNodes,
                                                 criterionNodes,
                                                 evaluationNodes,
                                                 inputMatrices,
                                                 learnableNodes,
                                                 smoothedGradients,
                                                 leftCriterion,
                                                 epochEvalErrors,
-                                                totalSamplesSeen);
+                                                totalSamplesSeen, "DetailLeftAdaptiveLearnRateSearch:");
             }
         }
@@ -1162,55 +1228,252 @@ class SGD : ComputationNetworkHelper<ElemType>
                                          const ComputationNodePtr refNode, const int epochNumber,
                                          const size_t epochSize, IDataReader<ElemType>* trainSetDataReader,
                                          const ElemType learnRatePerSample,
+                                         const size_t minibatchSize,
                                          const std::vector<ComputationNodePtr>& FeatureNodes,
                                          const std::vector<ComputationNodePtr>& labelNodes,
                                          const std::vector<ComputationNodePtr>& criterionNodes,
                                          const std::vector<ComputationNodePtr>& evaluationNodes,
                                          std::map<std::wstring, Matrix<ElemType>*>& inputMatrices,
                                          const std::list<ComputationNodePtr>& learnableNodes,
-                                         std::list<Matrix<ElemType>>& smoothedGradients,
-                                         ElemType& epochCriterion, std::vector<ElemType>& epochEvalErrors,
-                                         size_t& totalSamplesSeen)
+                                         /*out*/ std::list<Matrix<ElemType>>& smoothedGradients,
+                                         /*out*/ ElemType& epochCriterion, std::vector<ElemType>& epochEvalErrors,
+                                         /*out*/ size_t& totalSamplesSeen,
+                                         std::string prefixMsg = "")
     {
         TrainOneEpoch(net, refNet, refNode, epochNumber, epochSize,
-                      trainSetDataReader, learnRatePerSample, FeatureNodes,
+                      trainSetDataReader, learnRatePerSample, minibatchSize, FeatureNodes,
                       labelNodes, criterionNodes, evaluationNodes,
                       inputMatrices, learnableNodes, smoothedGradients,
-                      epochCriterion, epochEvalErrors, totalSamplesSeen);
-        fprintf(stderr, "Finished Mini-Epoch For LearnRate Selection: Train Loss Per Sample = %.8g ",
+                      /*out*/ epochCriterion, /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen,
+                      prefixMsg);
+
+        fprintf(stderr, "Finished Mini-Epoch For LearnRate Selection: TrainLossPerSample = %.8g; ",
                 epochCriterion);
 
         if (epochEvalErrors.size() == 1)
         {
-            fprintf(stderr, "EvalErr Per Sample = %.8g Ave Learn Rate Per Sample = %.10g\n",
+            fprintf(stderr, "EvalErrPerSample = %.8g; Ave LearnRatePerSample = %.10g\n",
                     epochEvalErrors[0], learnRatePerSample);
         }
         else
        {
-            fprintf(stderr, "EvalErr Per Sample ");
+            fprintf(stderr, "EvalErrPerSample ");
            for (size_t i = 0; i < epochEvalErrors.size(); i++)
            {
-                fprintf(stderr, "[%lu] = %.8g ", i, epochEvalErrors[i]);
+                fprintf(stderr, "[%lu] = %.8g; ", i, epochEvalErrors[i]);
            }
-            fprintf(stderr, "Ave Learn Rate Per Sample = %.10g\n", learnRatePerSample);
+            fprintf(stderr, "Ave LearnRatePerSample = %.10g\n", learnRatePerSample);
        }
 
        int baseModelEpoch = epochNumber - 1;
        net.LoadPersistableParametersFromFile(GetModelNameForEpoch(baseModelEpoch), m_validateAfterModelReloading);
        net.ResetEvalTimeStamp();
 
-        ElemType learnRate;
-        ElemType prevCriterion;
-        LoadCheckPointInfo(baseModelEpoch, totalSamplesSeen, learnRate,
-                           smoothedGradients, prevCriterion);
+        ElemType dummyLearnRate;
+        ElemType dummyPrevCriterion;
+        size_t dummyMinibatchSize = 0;
+        LoadCheckPointInfo(baseModelEpoch, totalSamplesSeen, dummyLearnRate,
+                           smoothedGradients, dummyPrevCriterion, dummyMinibatchSize);
    }
 
+    size_t AdaptiveMinibatchSizing(ComputationNetwork<ElemType>& net,
+                                   ComputationNetwork<ElemType>& refNet,
+                                   const ComputationNodePtr refNode,
+                                   const int epochNumber,
+                                   const size_t numFramesToUseInSearch,
+                                   const IDataReader<ElemType>* trainSetDataReader,
+                                   const ElemType learnRatePerSample,
+                                   const size_t initialMinibatchSize,
+                                   const std::vector<ComputationNodePtr>& FeatureNodes,
+                                   const std::vector<ComputationNodePtr>& labelNodes,
+                                   const std::vector<ComputationNodePtr>& criterionNodes,
+                                   const std::vector<ComputationNodePtr>& evaluationNodes,
+                                   std::map<std::wstring, Matrix<ElemType>*>& inputMatrices,
+                                   const std::list<ComputationNodePtr>& learnableNodes,
+                                   std::list<Matrix<ElemType>>& smoothedGradients,
+                                   const float learningRateAdjustmentFactor)
+    {
+        size_t minMinibatchSize = initialMinibatchSize;
+        size_t chosenMinibatchSize = initialMinibatchSize;
+
+        // do some pre-adjustment based on LR.
+        // Basically we assume that the LR for epoch 1 is safe for the minibatch size.
+        // If LR control led to a smaller LR, then we can safely increase the lower bound of the MB size.
+        float learningRateChangeSoFar = m_learningRatesPerSample[epochNumber] / m_learningRatesPerSample[0];
+        learningRateChangeSoFar *= learningRateAdjustmentFactor;
+
+        // increasing by the full factor is found to be too aggressive; sqrt() seems more robust
+        learningRateChangeSoFar = sqrt(learningRateChangeSoFar);
+
+        // LR was indeed reduced
+        if (learningRateChangeSoFar < 1.0f)
+        {
+            // we can safely increase the MB size (note: this may be bigger than our max)
+            minMinibatchSize = (size_t) (minMinibatchSize / learningRateChangeSoFar);
+        }
+
+        if (epochNumber < 2 && m_prevChosenMinibatchSize != 0)
+        {
+            // newly started training: any previous MB size stored in the model is to be ignored
+            fprintf(stderr, "before epoch 2, previous minibatchSize %d is "
+                    "considered invalid -> resetting\n", (int) m_prevChosenMinibatchSize);
+            m_prevChosenMinibatchSize = 0;
+        }
+
+        // check if we need to skip
+        if (m_prevChosenMinibatchSize != 0 &&
+            (epochNumber + 1) > m_minibatchSizeTuningFrequency &&
+            (epochNumber + 1) % m_minibatchSizeTuningFrequency != 0)
+        {
+            fprintf(stderr, "AdaptiveMinibatchSearch: Search for a better minibatchSize "
+                    "in epoch %d skipped, keeping minibatchSize of %d\n",
+                    epochNumber + 1, (int) m_prevChosenMinibatchSize);
+            chosenMinibatchSize = m_prevChosenMinibatchSize;
+        }
+        else
+        {
+            if (m_prevChosenMinibatchSize != 0)
+            {
+                // but we don't go lower than 0.5 * the previously chosen size
+                fprintf(stderr, "AdaptiveMinibatchSearch: Limiting minMinibatchSize to "
+                        "previous minibatchSize = (%d / 2)\n", (int) m_prevChosenMinibatchSize);
+                minMinibatchSize = max(minMinibatchSize, m_prevChosenMinibatchSize / 2);
+            }
+
+            size_t maxMinibatchSize = m_minibatchSizeTuningMax;
+
+            // only grow at most 2x compared to the previous step
+            if (m_prevChosenMinibatchSize != 0)
+            {
+                if (m_prevChosenMinibatchSize < chosenMinibatchSize)
+                {
+                    m_prevChosenMinibatchSize = chosenMinibatchSize;
+                }
+
+                fprintf(stderr, "AdaptiveMinibatchSearch: Limiting maxMinibatchSize to "
+                        "previous minibatchSize %d*2\n", (int) m_prevChosenMinibatchSize);
+                maxMinibatchSize = min(maxMinibatchSize, m_prevChosenMinibatchSize * 2);
+            }
+
+            chosenMinibatchSize = SearchForBestMinibatchSize(net, refNet, refNode, epochNumber,
+                                                             numFramesToUseInSearch,
+                                                             trainSetDataReader, learnRatePerSample,
+                                                             FeatureNodes, labelNodes,
+                                                             criterionNodes, evaluationNodes,
+                                                             inputMatrices, learnableNodes,
+                                                             smoothedGradients,
+                                                             minMinibatchSize, maxMinibatchSize);
+        }
+
+        return chosenMinibatchSize;
+    }
+
+    size_t RoundToMultipleOf64(float val)
+    {
+        return 64 * (size_t) ((val + 32) / 64);
+    }
+
+    // uses a small percentage of the training data to speculatively
+    // train with various MB sizes; then picks the best
+    size_t SearchForBestMinibatchSize(ComputationNetwork<ElemType>& net,
+                                      ComputationNetwork<ElemType>& refNet,
+                                      const ComputationNodePtr refNode,
+                                      const int epochNumber,
+                                      const size_t numFramesToUseInSearch,
+                                      IDataReader<ElemType>* trainSetDataReader,
+                                      const ElemType learnRatePerSample,
+                                      const std::vector<ComputationNodePtr>& FeatureNodes,
+                                      const std::vector<ComputationNodePtr>& labelNodes,
+                                      const std::vector<ComputationNodePtr>& criterionNodes,
+                                      const std::vector<ComputationNodePtr>& evaluationNodes,
+                                      std::map<std::wstring, Matrix<ElemType>*>& inputMatrices,
+                                      const std::list<ComputationNodePtr>& learnableNodes,
+                                      std::list<Matrix<ElemType>>& smoothedGradients,
+                                      const size_t minMinibatchSize, const size_t
maxMinibatchSize) + { + // may happen for automatically reduced learning rates + if (minMinibatchSize > maxMinibatchSize) + { + return maxMinibatchSize; + } + + size_t trialMinibatchSize = 0; + bool isFirstIteration = true; + ElemType baseCriterion; + + // increase the minibatch size by a factor of sqrt(2) in each step. + const float minibatchSizeTuningFactor = sqrtf(2.0f); + + size_t lastTriedtrialMinibatchSize = -1; + for (float trialMinibatchSizeFloat = (float) minMinibatchSize; + trialMinibatchSizeFloat <= maxMinibatchSize; + trialMinibatchSizeFloat *= minibatchSizeTuningFactor) + { + // round mbsize to something meaningful + trialMinibatchSize = RoundToMultipleOf64(trialMinibatchSizeFloat); + + fprintf(stderr, "\nAdaptiveMinibatchSearch: Evaluating trial minibatchSize=%d out of range %d..%d ...\n\n", + trialMinibatchSize, RoundToMultipleOf64(minMinibatchSize), RoundToMultipleOf64(maxMinibatchSize)); + + size_t totalSamplesSeen; + std::vector epochEvalErrors(evaluationNodes.size(), std::numeric_limits::infinity()); + ElemType epochCriterion = std::numeric_limits::infinity(); + + // Train on a few minibatches and so we can observe the epochCriterion as we try increasing + // minibatches with iteration of this loop. + TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, + numFramesToUseInSearch, trainSetDataReader, + learnRatePerSample, trialMinibatchSize, FeatureNodes, + labelNodes, criterionNodes, + evaluationNodes, inputMatrices, + learnableNodes, smoothedGradients, + /*out*/ epochCriterion, /*out*/ epochEvalErrors, + /*out*/ totalSamplesSeen, + isFirstIteration ? "BaseAdaptiveMinibatchSearch:" : + "AdaptiveMinibatchSearch:"); + + if (isFirstIteration) + { + // for the first iteration of the loop only, set baseCriterion + // to the result we got from TrainOneMiniEpochAndReloadModel(). + baseCriterion = epochCriterion; + lastTriedtrialMinibatchSize = trialMinibatchSize; + isFirstIteration = false; + + fprintf(stderr, "AdaptiveMinibatchSearch: Computed BaseCriterion %.10g\n", baseCriterion); + } + else if (!std::isnan(epochCriterion) && (epochCriterion > baseCriterion)) + { + fprintf(stderr, "AdaptiveMinibatchSearch: Search successful!!! Choose new minibatchSize of %d. " + "EpochCriterion = %.10g vs BaseCriterion = %.10g\n\n", + lastTriedtrialMinibatchSize, epochCriterion, baseCriterion); + + // As soon as we see the Criterion (a measure of error) start to get larger than the + // Criterion we started with, we stop. + // TODO: if this is too sensitive, we can add a margin on the bases of percentage of + // baseCriterion. + break; + } + else + { + lastTriedtrialMinibatchSize = trialMinibatchSize; + fprintf(stderr, "AdaptiveMinibatchSearch: Keep searching... 
" + "EpochCriterion = %.10g vs BaseCriterion = %.10g\n", + epochCriterion, baseCriterion); + } + } + + return lastTriedtrialMinibatchSize; } size_t TrainOneEpoch(ComputationNetwork& net, ComputationNetwork& refNet, - const ComputationNodePtr refNode, const int epochNumber, - const size_t epochSize, IDataReader* trainSetDataReader, + const ComputationNodePtr refNode, + const int epochNumber, + const size_t epochSize, + IDataReader* trainSetDataReader, const ElemType learnRatePerSample, + size_t tunedMBSize, const std::vector& FeatureNodes, const std::vector& labelNodes, const std::vector& criterionNodes, @@ -1218,8 +1481,10 @@ class SGD: ComputationNetworkHelper std::map*>& inputMatrices, const std::list& learnableNodes, std::list>& smoothedGradients, - ElemType& epochCriterion, std::vector& epochEvalErrors, - size_t& totalSamplesSeen) + /*out*/ ElemType& epochCriterion, + /*out*/ std::vector& epochEvalErrors, + /*out*/ size_t& totalSamplesSeen, + std::string prefixMsg = "") { ElemType readTimeInMBs = 0; ElemType ComputeTimeInMBs = 0; @@ -1239,6 +1504,7 @@ class SGD: ComputationNetworkHelper size_t numEvalNodes = epochEvalErrors.size(); //assume only one training criterion node for each epoch + Matrix localEpochCriterion(1, 1, net.GetDeviceID()); Matrix localEpochEvalErrors(1, numEvalNodes, net.GetDeviceID()); @@ -1249,7 +1515,7 @@ class SGD: ComputationNetworkHelper // resetting this, so profiling is performed for one epoch only m_numMBsToCUDAProfile = 0; - trainSetDataReader->StartMinibatchLoop(m_mbSize[epochNumber], epochNumber, m_epochSize); + trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, m_epochSize); startReadMBTime = Timer::MilliSecondElapsed(); while (trainSetDataReader->GetMinibatch(inputMatrices)) @@ -1350,18 +1616,18 @@ class SGD: ComputationNetworkHelper epochEvalErrors[i] = (const ElemType) localEpochEvalErrors(0, i); } - fprintf(stderr, "Epoch[%d]-Minibatch[%d-%d]: Samples Seen = %d Train Loss Per Sample = %.8g ", - epochNumber + 1, numMBsRun - m_numMBsToShowResult + 1, - numMBsRun, numSamplesLastMBs, + fprintf(stderr, "%s Epoch[%d of %d]-Minibatch[%d-%d of %d]: SamplesSeen = %d; TrainLossPerSample = %.8g; ", + prefixMsg.c_str(), epochNumber + 1, m_maxEpochs, numMBsRun - m_numMBsToShowResult + 1, + numMBsRun, epochSize / tunedMBSize, numSamplesLastMBs, (epochCriterion - epochCriterionLastMBs) / numSamplesLastMBs); for (size_t i = 0; i < numEvalNodes; i++) { - fprintf(stderr, "EvalErr[%lu] Per Sample = %.8g ", + fprintf(stderr, "EvalErr[%lu]PerSample = %.8g; ", i, (epochEvalErrors[i] - epochEvalErrorsLastMBs[i]) / numSamplesLastMBs); } - fprintf(stderr, "ReadData Time = %.8g Computing Time=%.8g Total Time Per Sample=%.8g\n", + fprintf(stderr, "ReadDataTime = %.8g; ComputeTime=%.8g; TotalTimePerSample=%.8g\n", readTimeInMBs, ComputeTimeInMBs, (readTimeInMBs + ComputeTimeInMBs) / numSamplesLastMBs); @@ -1482,7 +1748,8 @@ class SGD: ComputationNetworkHelper } // L1 regularizer with proximal gradient descent method - if (L1RegWeight > 0) { + if (L1RegWeight > 0) + { //*actualMBSize so that it's invariant to minibatch size since learning rate is per sample functionValues.InplaceSoftThreshold(learnRatePerSample * L1RegWeight * actualMBSize); } @@ -1536,7 +1803,8 @@ class SGD: ComputationNetworkHelper void SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen, const ElemType learnRatePerSample, const std::list>& smoothedGradients, - const ElemType prevCriterion) + const ElemType prevCriterion, + const size_t minibatchSize) { wstring checkPointFileName 
= GetCheckPointFileNameForEpoch(int(epoch)); @@ -1548,6 +1816,10 @@ class SGD: ComputationNetworkHelper fstream << totalSamplesSeen << learnRatePerSample << prevCriterion; fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ELearnRate"); + fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize"); + fstream << minibatchSize; + fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EMinibatchSize"); + fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BGradient"); for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++) @@ -1561,12 +1833,13 @@ class SGD: ComputationNetworkHelper fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ECKP"); } - bool LoadCheckPointInfo(const size_t epoch, size_t& totalSamplesSeen, + bool LoadCheckPointInfo(const size_t epochNumber, size_t& totalSamplesSeen, ElemType& learnRatePerSample, std::list>& smoothedGradients, - ElemType& prevCriterion) + ElemType& prevCriterion, + size_t& minibatchSize) { - wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epoch)); + wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epochNumber)); if (!fexists(checkPointFileName.c_str())) { fprintf(stderr, @@ -1582,6 +1855,17 @@ class SGD: ComputationNetworkHelper fstream >> totalSamplesSeen >> learnRatePerSample >> prevCriterion; fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ELearnRate"); + if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize")) + { + fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize"); + fstream >> minibatchSize; + fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EMinibatchSize"); + } + else + { + minibatchSize =m_mbSize[epochNumber]; + } + fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BGradient"); for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++) @@ -1850,8 +2134,13 @@ class SGD: ComputationNetworkHelper bool m_needToNormalizeLRByParallUtterance; intargvector m_mbSize; + + // the number of samples in each epoch (0 means, use all the samples in each epoch). size_t m_epochSize; + + // the total number of epochs to run. 
size_t m_maxEpochs; + floatargvector m_momentumInputPerMB; ElemType m_momentumPerMB; bool m_gradientClippingWithTruncation; @@ -1880,6 +2169,10 @@ class SGD: ComputationNetworkHelper ElemType m_increaseLearnRateIfImproveMoreThan; ElemType m_learnRateIncreaseFactor; ElemType m_learnRateDecreaseFactor; + size_t m_prevChosenMinibatchSize; + bool m_autoAdjustMinibatch; + size_t m_minibatchSizeTuningFrequency; + size_t m_minibatchSizeTuningMax; floatargvector m_dropoutRates; size_t m_maxTempMemSizeInSamplesForCNN; From cac315623bdea2dfeab42d43f5bafa6a61a8356c Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Mon, 6 Jul 2015 16:16:54 -0700 Subject: [PATCH 2/6] Fixed Windows build --- MachineLearning/CNTK/SGD.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/MachineLearning/CNTK/SGD.h b/MachineLearning/CNTK/SGD.h index 266b8331b929..fc2eb0cf1c4c 100644 --- a/MachineLearning/CNTK/SGD.h +++ b/MachineLearning/CNTK/SGD.h @@ -639,7 +639,7 @@ class SGD : ComputationNetworkHelper size_t totalSamplesSeen = 0; ElemType learnRatePerSample = 0.5f / m_mbSize[startEpoch]; - float learningRateAdjustmentFactor = 1.0f; + ElemType learningRateAdjustmentFactor = 1.0f; vector prevLearnRates; prevLearnRates.resize(m_numPrevLearnRates); for (int i = 0; i < m_numPrevLearnRates; i++) @@ -1282,7 +1282,7 @@ class SGD : ComputationNetworkHelper const ComputationNodePtr refNode, const int epochNumber, const size_t numFramesToUseInSearch, - const IDataReader* trainSetDataReader, + IDataReader* trainSetDataReader, const ElemType learnRatePerSample, const size_t initialMinibatchSize, const std::vector& FeatureNodes, @@ -1292,7 +1292,7 @@ class SGD : ComputationNetworkHelper std::map*>& inputMatrices, const std::list& learnableNodes, std::list>& smoothedGradients, - const float learningRateAdjustmentFactor) + const ElemType learningRateAdjustmentFactor) { size_t minMinibatchSize = initialMinibatchSize; size_t chosenMinibatchSize = initialMinibatchSize; @@ -1300,7 +1300,7 @@ class SGD : ComputationNetworkHelper // do some pre-adjustment based on LR // Basically we assume that the LR for epoch 1 is safe for mbsize. // If LR control led to a smaller LR, then we can safely increase the lower bound of the MB size. - float learningRateChangeSoFar = m_learningRatesPerSample[epochNumber] / m_learningRatesPerSample[0]; + ElemType learningRateChangeSoFar = m_learningRatesPerSample[epochNumber] / m_learningRatesPerSample[0]; learningRateChangeSoFar *= learningRateAdjustmentFactor; // increasing by the full factor is found to be too aggressive; sqrt() seems more robust @@ -1373,6 +1373,11 @@ class SGD : ComputationNetworkHelper return 64 * (size_t) ((val + 32) / 64); } + size_t RoundToMultipleOf64(size_t val) + { + return 64 * ((val + 32) / 64); + } + // uses a small percentage of training data of minibatch to // speculatively train with various MB sizes; then picks the best size_t SearchForBestMinibatchSize(ComputationNetwork& net, @@ -1399,12 +1404,12 @@ class SGD : ComputationNetworkHelper size_t trialMinibatchSize = 0; bool isFirstIteration = true; - ElemType baseCriterion; + ElemType baseCriterion = 0; // increase the minibatch size by a factor of sqrt(2) in each step. 
const float minibatchSizeTuningFactor = sqrtf(2.0f); - size_t lastTriedtrialMinibatchSize = -1; + size_t lastTriedtrialMinibatchSize = 0; for (float trialMinibatchSizeFloat = (float) minMinibatchSize; trialMinibatchSizeFloat <= maxMinibatchSize; trialMinibatchSizeFloat *= minibatchSizeTuningFactor) From 54c1ac43491dfe2432703b675cec4a881c9de9e8 Mon Sep 17 00:00:00 2001 From: Yinggong ZHAO Date: Mon, 6 Jul 2015 18:30:33 -0700 Subject: [PATCH 3/6] Update softmax GPU NCE training --- MachineLearning/CNTK/SGD.h | 2 ++ Math/Math/CPUMatrix.cpp | 5 +++++ Math/Math/CPUMatrix.h | 2 ++ Math/Math/GPUMatrix.cu | 6 +++++- Math/Math/Matrix.cpp | 12 ++++++++++++ Math/Math/Matrix.h | 2 +- 6 files changed, 27 insertions(+), 2 deletions(-) diff --git a/MachineLearning/CNTK/SGD.h b/MachineLearning/CNTK/SGD.h index b91352f452d8..589e73370900 100644 --- a/MachineLearning/CNTK/SGD.h +++ b/MachineLearning/CNTK/SGD.h @@ -1252,6 +1252,8 @@ class SGD: ComputationNetworkHelper trainSetDataReader->StartMinibatchLoop(m_mbSize[epochNumber], epochNumber, m_epochSize); startReadMBTime = Timer::MilliSecondElapsed(); + int a = 0; + if (a) while (trainSetDataReader->GetMinibatch(inputMatrices)) { #ifdef MPI_SUPPORT diff --git a/Math/Math/CPUMatrix.cpp b/Math/Math/CPUMatrix.cpp index 03e87cf85c81..f36fbe0e3dc6 100644 --- a/Math/Math/CPUMatrix.cpp +++ b/Math/Math/CPUMatrix.cpp @@ -3841,6 +3841,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { { return CPUMatrix::MultiplyAndWeightedAdd(1.0, a, transposeA, b, transposeB, 1.0, c); } + template + void CPUMatrix::AssignSoftmaxSum(const CPUMatrix& a, CPUMatrix& softmax) + { + + } template void CPUMatrix::AssignNCEUnnormalizedEval(const CPUMatrix& a, diff --git a/Math/Math/CPUMatrix.h b/Math/Math/CPUMatrix.h index 3cbdc593eb25..874b8b93f177 100644 --- a/Math/Math/CPUMatrix.h +++ b/Math/Math/CPUMatrix.h @@ -217,6 +217,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { void AssignNoiseContrastiveEstimation(const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& bias, CPUMatrix& tmp, CPUMatrix& c); + + void AssignSoftmaxSum(const CPUMatrix& a, CPUMatrix& softmax); void AssignNCEUnnormalizedEval(const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& bias, CPUMatrix& c); diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu index 8cf6f536d767..a486eddfc278 100755 --- a/Math/Math/GPUMatrix.cu +++ b/Math/Math/GPUMatrix.cu @@ -1929,7 +1929,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); if (do_sync) CUDA_CALL(cudaEventDestroy(done)); } - + template + void GPUMatrix::AssignSoftmaxSum(const GPUMatrix& a, GPUMatrix& softmax) + { + + } template void GPUMatrix::AssignNCEUnnormalizedEval(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) { diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp index c58862175694..fd42fae109d5 100755 --- a/Math/Math/Matrix.cpp +++ b/Math/Math/Matrix.cpp @@ -3623,6 +3623,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { return *this; } + + template + Matrix& Matrix::AssignSoftmaxSum(const Matrix& a, const Matrix& softmax) + { + this->Resize(1, 1); + if (this->GetDeviceId() < 0) + a.m_CPUMatrix->AssignSoftmaxSum(*softmax.m_CPUMatrix, *this->m_CPUMatrix); + else + a.m_GPUMatrix->AssignSoftmaxSum(*softmax.m_GPUMatrix, *this->m_GPUMatrix); + return *this; + } + template Matrix& Matrix::AssignNceUnnormalizedEval(const Matrix& a, const Matrix& b, const Matrix& c, const Matrix& bias) { diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h index 
9476263228f2..8875fa9160f4 100644 --- a/Math/Math/Matrix.h +++ b/Math/Math/Matrix.h @@ -147,7 +147,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix& AssignNoiseContrastiveEstimation(const Matrix& a, const Matrix& b, const Matrix& c, const Matrix& bias, Matrix& tmp); Matrix& AssignNCEDerivative(const Matrix& tmp, const Matrix& a, const Matrix& b, const Matrix& c, size_t inputIndex); - + Matrix& AssignSoftmaxSum(const Matrix& a, const Matrix& softmax); Matrix& AssignNceUnnormalizedEval(const Matrix& a, const Matrix& b, const Matrix& c, const Matrix& bias); Matrix Transpose(); // This method doesn't change state of Matrix. It should be a const function From 204b879dfe6fb178cdee45e516fe0ed7dea3ef95 Mon Sep 17 00:00:00 2001 From: Yinggong ZHAO Date: Mon, 6 Jul 2015 20:56:16 -0700 Subject: [PATCH 4/6] Finish GPU NCE training --- .../LMSequenceReader/SequenceReader.cpp | 2 +- MachineLearning/CNTK/SGD.h | 2 - MachineLearning/CNTK/TrainingCriterionNodes.h | 15 +++--- Math/Math/CPUMatrix.cpp | 12 ++++- Math/Math/GPUMatrix.cu | 18 ++++++- Math/Math/GPUMatrix.h | 2 +- Math/Math/GPUMatrixCUDAKernels.cu | 53 +++++++++++++++++++ Math/Math/Matrix.cpp | 6 ++- Math/Math/NoGPU.cpp | 4 ++ 9 files changed, 97 insertions(+), 17 deletions(-) diff --git a/DataReader/LMSequenceReader/SequenceReader.cpp b/DataReader/LMSequenceReader/SequenceReader.cpp index a4241cb7a5df..bd617a2ca152 100644 --- a/DataReader/LMSequenceReader/SequenceReader.cpp +++ b/DataReader/LMSequenceReader/SequenceReader.cpp @@ -2052,7 +2052,7 @@ void BatchSequenceReader::GetLabelOutput(std::mapTransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); + labels->TransferFromDeviceToDevice(CPUDEVICE, curDevId, false, false, false); } } diff --git a/MachineLearning/CNTK/SGD.h b/MachineLearning/CNTK/SGD.h index 45d51169f89e..fc2eb0cf1c4c 100644 --- a/MachineLearning/CNTK/SGD.h +++ b/MachineLearning/CNTK/SGD.h @@ -1523,8 +1523,6 @@ class SGD : ComputationNetworkHelper trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, m_epochSize); startReadMBTime = Timer::MilliSecondElapsed(); - int a = 0; - if (a) while (trainSetDataReader->GetMinibatch(inputMatrices)) { #ifdef MPI_SUPPORT diff --git a/MachineLearning/CNTK/TrainingCriterionNodes.h b/MachineLearning/CNTK/TrainingCriterionNodes.h index 14013951ca1e..803fcd3ecadb 100644 --- a/MachineLearning/CNTK/TrainingCriterionNodes.h +++ b/MachineLearning/CNTK/TrainingCriterionNodes.h @@ -948,10 +948,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (Inputs(0)->FunctionValues().GetNumRows() == 1) { for (int i = 0; i < Inputs(0)->FunctionValues().GetNumCols(); i++) - if (Inputs(0)->FunctionValues()(0, i) > 0) - positive++; - else if (Inputs(0)->FunctionValues()(0, i) < 0) - negative++; + { + if (Inputs(0)->FunctionValues()(0, i) > 0) + positive++; + else if (Inputs(0)->FunctionValues()(0, i) < 0) + negative++; + } assert(positive * negative == 0); } if (m_evalMode == NCEEvalMode::Softmax || (Inputs(0)->FunctionValues().GetNumRows() == 1 && positive > 0)) @@ -960,10 +962,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_logSoftmax.AssignProductOf(Inputs(1)->FunctionValues(), true, Inputs(2)->FunctionValues(), false); m_logSoftmax += Inputs(3)->FunctionValues(); m_logSoftmax.InplaceLogSoftmax(false); - FunctionValues().Resize(1, 1); - FunctionValues().SetValue(0); - for (int i = 0; i < Inputs(0)->FunctionValues().GetNumCols(); i++) - FunctionValues()(0, 0) -= m_logSoftmax(i, (size_t)Inputs(0)->FunctionValues()(0, i)); + 
FunctionValues().AssignSoftmaxSum(Inputs(0)->FunctionValues(), m_logSoftmax); } else if (m_evalMode == NCEEvalMode::Unnormalized || (Inputs(0)->FunctionValues().GetNumRows() == 1 && negative > 0)) { diff --git a/Math/Math/CPUMatrix.cpp b/Math/Math/CPUMatrix.cpp index f36fbe0e3dc6..03379ebead61 100644 --- a/Math/Math/CPUMatrix.cpp +++ b/Math/Math/CPUMatrix.cpp @@ -3842,9 +3842,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { return CPUMatrix::MultiplyAndWeightedAdd(1.0, a, transposeA, b, transposeB, 1.0, c); } template - void CPUMatrix::AssignSoftmaxSum(const CPUMatrix& a, CPUMatrix& softmax) + void CPUMatrix::AssignSoftmaxSum(const CPUMatrix& softmax, CPUMatrix& c) { - + ElemType log_likelihood = 0.0; + size_t batch_size = this->GetNumCols(); +#pragma omp parallel for reduction(+:log_likelihood) + for (int instance_id = 0; instance_id < batch_size; instance_id++) + { + int sample = (int)(*this)(0, instance_id); + log_likelihood += softmax(instance_id, sample); + } + c(0, 0) = -log_likelihood; } template diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu index a486eddfc278..5f36df1724e6 100755 --- a/Math/Math/GPUMatrix.cu +++ b/Math/Math/GPUMatrix.cu @@ -1930,9 +1930,25 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (do_sync) CUDA_CALL(cudaEventDestroy(done)); } template - void GPUMatrix::AssignSoftmaxSum(const GPUMatrix& a, GPUMatrix& softmax) + void GPUMatrix::AssignSoftmaxSum(const GPUMatrix& a, GPUMatrix& c) { + UNCONST(ElemType, a, my_a); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + int p = 512; + int width = a.GetNumRows(); + while (p / 2 > width) p = p / 2; + _assignSoftmaxSum << <1, p >> >( + my_a.GetArray(), + width, + GetArray(), + c.GetArray() + ); + + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); } template void GPUMatrix::AssignNCEUnnormalizedEval(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) diff --git a/Math/Math/GPUMatrix.h b/Math/Math/GPUMatrix.h index 65792fec02cf..f7314592a4b4 100755 --- a/Math/Math/GPUMatrix.h +++ b/Math/Math/GPUMatrix.h @@ -294,7 +294,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t sampleCount, GPUMatrix& tmp, GPUMatrix& c); void AssignNCEDerivative(GPUMatrix& tmp, const GPUMatrix& a, const GPUMatrix& b, size_t inputIndex, GPUMatrix& c); void AssignNCEUnnormalizedEval(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c); - + void AssignSoftmaxSum(const GPUMatrix& a, GPUMatrix& softmax); void Print(const char* matrixName, size_t rowStart, size_t rowEnd, size_t colStart, size_t colEnd) const; void Print(const char* matrixName = NULL) const; //print whole matrix. can be expensive diff --git a/Math/Math/GPUMatrixCUDAKernels.cu b/Math/Math/GPUMatrixCUDAKernels.cu index 6d37ede64092..c9a6c67bc0f0 100755 --- a/Math/Math/GPUMatrixCUDAKernels.cu +++ b/Math/Math/GPUMatrixCUDAKernels.cu @@ -2868,6 +2868,59 @@ __global__ void _computeNceOutput( } } + +template +__global__ void _assignSoftmaxSum( + const ElemType* softmax, + int sampleCount, + const ElemType* a, + ElemType* c) // run on 512 threads per block +{ + // val and col are in CSR format + // val is an array contains log_Pn(w). 
diff --git a/Math/Math/GPUMatrixCUDAKernels.cu b/Math/Math/GPUMatrixCUDAKernels.cu
index 6d37ede64092..c9a6c67bc0f0 100755
--- a/Math/Math/GPUMatrixCUDAKernels.cu
+++ b/Math/Math/GPUMatrixCUDAKernels.cu
@@ -2868,6 +2868,59 @@ __global__ void _computeNceOutput(
     }
 }

+
+template<class ElemType>
+__global__ void _assignSoftmaxSum(
+    const ElemType* softmax,
+    int sampleCount,
+    const ElemType* a,
+    ElemType* c) // run on 512 threads per block
+{
+    // softmax is the log-softmax matrix in column-major format, one row per sample
+    // a is an array containing the class (word) index observed for each sample
+    // c is the single-element output that receives the summed objective
+
+    __shared__ ElemType partials[512];
+    partials[threadIdx.x] = 0;
+
+    int total = sampleCount;
+    int loadPerThread = (total + blockDim.x - 1) / blockDim.x;
+
+    // find out the items this thread is responsible for
+    int start = loadPerThread * threadIdx.x;
+    int end = min(total, loadPerThread * (threadIdx.x + 1));
+    for (int i = start; i < end; i++)
+    {
+        int wid = (int)a[i];
+        partials[threadIdx.x] += softmax[IDX2C(i, wid, sampleCount)];
+    }
+
+    __syncthreads();
+
+    // now sum up the objective function
+    int nTotalThreads = blockDim.x;
+
+    while (nTotalThreads > 1)
+    {
+        int halfPoint = (nTotalThreads >> 1);
+
+        if (threadIdx.x < halfPoint)
+            partials[threadIdx.x] += partials[threadIdx.x + halfPoint];
+
+        __syncthreads();
+
+        nTotalThreads = (nTotalThreads >> 1);
+    }
+
+    if (threadIdx.x == 0)
+        c[0] = -partials[0];
+}
+
 template<class ElemType>
 __global__ void _assignNoiseContrastiveEstimation(
     const ElemType* val,
diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp
index fd42fae109d5..75cdd2f9b11a 100755
--- a/Math/Math/Matrix.cpp
+++ b/Math/Math/Matrix.cpp
@@ -747,9 +747,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 #define NUM_MATRIXTYPE_CHANGED_WARN 20
             m_numTimesMatrixTypeChanged++;
+
             if (m_numTimesMatrixTypeChanged == NUM_MATRIXTYPE_CHANGED_WARN)
-                fprintf(stderr, "WARNING: The same matrix with dim [%lu, %lu] has been transferred between different devices for %d times.\n", (unsigned long)GetNumRows(), (unsigned long)GetNumCols(), NUM_MATRIXTYPE_CHANGED_WARN);
-
+            {
+                fprintf(stderr, "WARNING: The same matrix with dim [%lu, %lu] has been transferred between different devices for %d times.\n", (unsigned long)GetNumRows(), (unsigned long)GetNumCols(), NUM_MATRIXTYPE_CHANGED_WARN);
+            }
             if (GetDeviceId()<0) //CPU
             {
                 if (newMatrixType==MatrixType::SPARSE)
diff --git a/Math/Math/NoGPU.cpp b/Math/Math/NoGPU.cpp
index 47aba454eb7c..b9a3c09d1d0d 100644
--- a/Math/Math/NoGPU.cpp
+++ b/Math/Math/NoGPU.cpp
@@ -1067,6 +1067,10 @@ namespace Microsoft {

     }

+    template<class ElemType>
+    void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& c)
+    {
+    }
     template<class ElemType>
     void GPUMatrix<ElemType>::AssignNCEUnnormalizedEval(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c)

From 4d14e516df410d3027413165acae76d01a4f6817 Mon Sep 17 00:00:00 2001
From: Yinggong ZHAO
Date: Tue, 7 Jul 2015 00:20:24 -0700
Subject: [PATCH 5/6] Fix a bug in restoring from previous model in SGD

---
 MachineLearning/CNTK/SGD.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MachineLearning/CNTK/SGD.h b/MachineLearning/CNTK/SGD.h
index fc2eb0cf1c4c..cac467763bd5 100644
--- a/MachineLearning/CNTK/SGD.h
+++ b/MachineLearning/CNTK/SGD.h
@@ -1862,7 +1862,7 @@ class SGD : ComputationNetworkHelper<ElemType>

         if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize"))
         {
-            fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize");
+            //fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize");
             fstream >> minibatchSize;
             fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EMinibatchSize");
         }
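The one-line change above removes a double read. Judging from the fix, TryGetMarker already consumes the BMinibatchSize marker when it finds one, so the following GetMarker call advanced the stream a second time and the subsequent minibatch-size read was misaligned when restoring from a previous model. A hedged sketch of the pattern, with the File API reduced to a token cursor (all names here are illustrative, not the CNTK API):

    #include <deque>
    #include <iostream>
    #include <string>

    // Minimal stand-in for the checkpoint stream; the assumption (consistent
    // with the fix above) is that TryGetMarker consumes the marker on success.
    struct TokenStream
    {
        std::deque<std::string> tokens;
        bool TryGetMarker(const std::string& m)
        {
            if (!tokens.empty() && tokens.front() == m) { tokens.pop_front(); return true; }
            return false;
        }
        std::string Next() { std::string t = tokens.front(); tokens.pop_front(); return t; }
    };

    int main()
    {
        TokenStream f{ { "BMinibatchSize", "256", "EMinibatchSize" } };
        if (f.TryGetMarker("BMinibatchSize")) // marker already consumed here
        {
            // A second marker read at this point would not find the marker and
            // would misalign the stream; the next token is already the value.
            std::cout << "minibatchSize = " << f.Next() << "\n"; // prints 256
        }
        return 0;
    }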
From 496dfcb4c2665e562787b402982d022778f2fc15 Mon Sep 17 00:00:00 2001
From: Chris Basoglu
Date: Tue, 7 Jul 2015 10:52:06 -0700
Subject: [PATCH 6/6] Cleanup after Adaptive Minibatch size change

---
 MachineLearning/CNTK/SGD.h | 122 +++++++++++++++++++++----------------
 1 file changed, 71 insertions(+), 51 deletions(-)

diff --git a/MachineLearning/CNTK/SGD.h b/MachineLearning/CNTK/SGD.h
index cac467763bd5..bc734c4e907a 100644
--- a/MachineLearning/CNTK/SGD.h
+++ b/MachineLearning/CNTK/SGD.h
@@ -669,8 +669,12 @@ class SGD : ComputationNetworkHelper<ElemType>
         bool learnRateInitialized = false;
         if (startEpoch > 0)
         {
-            learnRateInitialized = LoadCheckPointInfo(startEpoch - 1, totalSamplesSeen,
-                                                      learnRatePerSample, smoothedGradients, prevCriterion, m_prevChosenMinibatchSize);
+            learnRateInitialized = LoadCheckPointInfo(startEpoch - 1,
+                                                      /*out*/ totalSamplesSeen,
+                                                      /*out*/ learnRatePerSample,
+                                                      smoothedGradients,
+                                                      /*out*/ prevCriterion,
+                                                      /*out*/ m_prevChosenMinibatchSize);
             if (learnRateInitialized)
             {
                 prevLearnRates[startEpoch % m_numPrevLearnRates] = learnRatePerSample;
@@ -908,9 +912,12 @@ class SGD : ComputationNetworkHelper<ElemType>
                     net.LoadPersistableParametersFromFile(GetModelNameForEpoch(i - 1), m_validateAfterModelReloading);
                     net.ResetEvalTimeStamp();

-                    LoadCheckPointInfo(i - 1, totalSamplesSeen,
-                                       learnRatePerSample,
-                                       smoothedGradients, prevCriterion, m_prevChosenMinibatchSize);
+                    LoadCheckPointInfo(i - 1,
+                                       /*out*/ totalSamplesSeen,
+                                       /*out*/ learnRatePerSample,
+                                       smoothedGradients,
+                                       /*out*/ prevCriterion,
+                                       /*out*/ m_prevChosenMinibatchSize);
                     fprintf(stderr, "Loaded the previous model which has better training criterion.\n");
                     loadedPrevModel = true;
                 }
@@ -989,12 +996,12 @@ class SGD : ComputationNetworkHelper<ElemType>
             }
         }

-        //since we linked feature nodes. we need to remove it from the deletion
+        // since we linked feature nodes, we need to remove them from the deletion
        if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr)
        {
            for (size_t i = 0; i < refFeatureNodes.size(); i++)
            {
-                //note we need to handle deletion carefully
+                // note we need to handle deletion carefully
                refNet.ChangeNode(refFeatureNodes[i]->NodeName(), refFeatureNodes[i]);
            }
        }
@@ -1054,7 +1061,7 @@ class SGD : ComputationNetworkHelper<ElemType>
             }
         }

-        //mark done
+        // mark done
         for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
         {
             PreComputedNode<ElemType>* node = static_cast<PreComputedNode<ElemType>*>(*nodeIter);
@@ -1111,8 +1118,12 @@ class SGD : ComputationNetworkHelper<ElemType>
         ElemType learnRate = learnRatePerSample;
         size_t dummyMinibatchSize = 0;
-        LoadCheckPointInfo(baseModelEpoch, totalSamplesSeen, learnRate,
-                           smoothedGradients, prevCriterion, dummyMinibatchSize);
+        LoadCheckPointInfo(baseModelEpoch,
+                           /*out*/ totalSamplesSeen,
+                           /*out*/ learnRate,
+                           smoothedGradients,
+                           /*out*/ prevCriterion,
+                           /*out*/ dummyMinibatchSize);

         // if model is not changed this is what we will get
         TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
@@ -1120,8 +1131,9 @@ class SGD : ComputationNetworkHelper<ElemType>
                                         FeatureNodes, labelNodes,
                                         criterionNodes, evaluationNodes,
                                         inputMatrices, learnableNodes,
-                                        smoothedGradients, baseCriterion,
-                                        epochEvalErrors, totalSamplesSeen, "BaseAdaptiveLearnRateSearch:");
+                                        smoothedGradients, /*out*/ baseCriterion,
+                                        /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen,
+                                        "BaseAdaptiveLearnRateSearch:");

         if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
         {
@@ -1149,8 +1161,8 @@ class SGD : ComputationNetworkHelper<ElemType>
                                                 labelNodes, criterionNodes,
                                                 evaluationNodes, inputMatrices,
                                                 learnableNodes, smoothedGradients,
-                                                epochCriterion, epochEvalErrors,
-                                                totalSamplesSeen, "AdaptiveLearnRateSearch:");
+                                                /*out*/ epochCriterion, /*out*/ epochEvalErrors,
+                                                /*out*/ totalSamplesSeen, "AdaptiveLearnRateSearch:");
             } while (std::isnan(epochCriterion) || (epochCriterion > baseCriterion && learnRatePerSample > minLearnRate));
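The do/while above keeps retrying a mini-epoch at progressively smaller learning rates until the criterion improves on the baseline (or the rate hits the floor). A condensed sketch of that control flow; TryMiniEpoch and the shrink factor are illustrative stand-ins, not the CNTK calls:

    #include <cmath>

    // Condensed sketch of the adaptive learning-rate search loop above.
    double SearchLearnRate(double learnRatePerSample, double baseCriterion,
                           double minLearnRate, double (*TryMiniEpoch)(double))
    {
        double epochCriterion;
        do
        {
            learnRatePerSample *= 0.618; // shrink and retry (assumed factor)
            epochCriterion = TryMiniEpoch(learnRatePerSample);
        } while (std::isnan(epochCriterion) ||
                 (epochCriterion > baseCriterion && learnRatePerSample > minLearnRate));
        return learnRatePerSample;
    }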
"AdaptiveLearnRateSearch:"); + /*out*/ epochCriterion, /*out*/ epochEvalErrors, + /*out*/ totalSamplesSeen, "AdaptiveLearnRateSearch:"); } while (std::isnan(epochCriterion) || (epochCriterion > baseCriterion && learnRatePerSample > minLearnRate)); @@ -1170,8 +1182,9 @@ class SGD : ComputationNetworkHelper FeatureNodes, labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, - smoothedGradients, leftCriterion, - epochEvalErrors, totalSamplesSeen, "DetailBaseAdaptiveLearnRateSearch:"); + smoothedGradients, /*out*/ leftCriterion, + /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen, + "DetailBaseAdaptiveLearnRateSearch:"); while (rightLearnRatePerSample > leftLearnRatePerSample * 1.2f) { @@ -1189,9 +1202,10 @@ class SGD : ComputationNetworkHelper inputMatrices, learnableNodes, smoothedGradients, - rightCriterion, - epochEvalErrors, - totalSamplesSeen, "DetailRightAdaptiveLearnRateSearch:"); + /*out*/ rightCriterion, + /*out*/ epochEvalErrors, + /*out*/ totalSamplesSeen, + "DetailRightAdaptiveLearnRateSearch:"); } else { @@ -1207,9 +1221,10 @@ class SGD : ComputationNetworkHelper inputMatrices, learnableNodes, smoothedGradients, - leftCriterion, - epochEvalErrors, - totalSamplesSeen, "DetailLeftAdaptiveLearnRateSearch:"); + /*out*/ leftCriterion, + /*out*/ epochEvalErrors, + /*out*/ totalSamplesSeen, + "DetailLeftAdaptiveLearnRateSearch:"); } } @@ -1235,8 +1250,9 @@ class SGD : ComputationNetworkHelper const std::vector& evaluationNodes, std::map*>& inputMatrices, const std::list& learnableNodes, - /*out*/ std::list>& smoothedGradients, - /*out*/ ElemType& epochCriterion, std::vector& epochEvalErrors, + std::list>& smoothedGradients, + /*out*/ ElemType& epochCriterion, + /*out*/ std::vector& epochEvalErrors, /*out*/ size_t& totalSamplesSeen, std::string prefixMsg = "") { @@ -1273,8 +1289,12 @@ class SGD : ComputationNetworkHelper ElemType dummyLearnRate; ElemType dummtPrevCriterion; size_t dummyMinibatchSize = 0; - LoadCheckPointInfo(baseModelEpoch, totalSamplesSeen, dummyLearnRate, - smoothedGradients, dummtPrevCriterion, dummyMinibatchSize); + LoadCheckPointInfo(baseModelEpoch, + /*out*/ totalSamplesSeen, + /*out*/ dummyLearnRate, + smoothedGradients, + /*out*/ dummtPrevCriterion, + /*out*/ dummyMinibatchSize); } size_t AdaptiveMinibatchSizing(ComputationNetwork& net, @@ -1501,14 +1521,14 @@ class SGD : ComputationNetworkHelper unsigned long long startReadMBTime = 0, startComputeMBTime = 0; unsigned long long endReadMBTime = 0, endComputeMBTime = 0; - //initialize statistics + // initialize statistics size_t totalEpochSamples = 0; int numMBsRun = 0; size_t numEvalNodes = epochEvalErrors.size(); - //assume only one training criterion node for each epoch + // assume only one training criterion node for each epoch Matrix localEpochCriterion(1, 1, net.GetDeviceID()); Matrix localEpochEvalErrors(1, numEvalNodes, net.GetDeviceID()); @@ -1550,7 +1570,7 @@ class SGD : ComputationNetworkHelper throw std::logic_error("cannot pass gradient checker"); } #endif - //TODO: currently only support one node regularization + // TODO: currently only support one node regularization if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) { refNet.SetActualMiniBatchSize(actualMBSize); @@ -1562,15 +1582,15 @@ class SGD : ComputationNetworkHelper labelNodes[0]->FunctionValues()); } - //only compute gradient when learning rate is large enough + // only compute gradient when learning rate is large enough if (learnRatePerSample > m_minLearnRate * 0.01) { - 
@@ -1656,8 +1676,8 @@ class SGD : ComputationNetworkHelper<ElemType>
                 break;
             }

-            /// call DataEnd function
-            /// DataEnd does reader specific process if sentence ending is reached
+            // call DataEnd function
+            // DataEnd does reader-specific processing if a sentence ending is reached
             trainSetDataReader->DataEnd(endDataSentence);

             profiler.NextSample();
@@ -1715,7 +1735,7 @@ class SGD : ComputationNetworkHelper<ElemType>
         // L2 regularizer
         if (L2RegWeight > 0)
         {
-            //*actualMBSize so that it's invariant to minibatch size since learning rate is per sample
+            // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
             Matrix<ElemType>::ScaleAndAdd(L2RegWeight * actualMBSize, functionValues, gradientValues);
         }
@@ -1723,7 +1743,7 @@ class SGD : ComputationNetworkHelper<ElemType>
         {
             ElemType momentum = sgd->MomentumPerMB();

-            //we use simple linear (instead of log linear) scaling here
+            // we use simple linear (instead of log linear) scaling here
             if (actualMBSize < expectedMBSize && momentum > 0.0000001f)
             {
                 momentum = (ElemType) exp(log(momentum) / expectedMBSize * actualMBSize);
             }
@@ -1755,7 +1775,7 @@ class SGD : ComputationNetworkHelper<ElemType>
         // L1 regularizer with proximal gradient descent method
         if (L1RegWeight > 0)
         {
-            //*actualMBSize so that it's invariant to minibatch size since learning rate is per sample
+            // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
             functionValues.InplaceSoftThreshold(learnRatePerSample * L1RegWeight * actualMBSize);
         }
@@ -1794,7 +1814,7 @@ class SGD : ComputationNetworkHelper<ElemType>
         }
         else
         {
-            //norm2 normalized
+            // norm2 normalized
             ElemType gradientNorm = gradient.FrobeniusNorm();
             if (gradientNorm > maxGradientPerMB)
             {
@@ -1829,7 +1849,7 @@ class SGD : ComputationNetworkHelper<ElemType>

         for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
         {
-            const Matrix<ElemType>& smoothedGradient = (*smoothedGradientIter);
+            const Matrix<ElemType>& smoothedGradient = *smoothedGradientIter;
             fstream << smoothedGradient;
         }

         fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EGradient");

         fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ECKP");
     }

-    bool LoadCheckPointInfo(const size_t epochNumber, size_t& totalSamplesSeen,
-                            ElemType& learnRatePerSample,
+    bool LoadCheckPointInfo(const size_t epochNumber,
+                            /*out*/ size_t& totalSamplesSeen,
+                            /*out*/ ElemType& learnRatePerSample,
                             std::list<Matrix<ElemType>>& smoothedGradients,
-                            ElemType& prevCriterion,
-                            size_t& minibatchSize)
+                            /*out*/ ElemType& prevCriterion,
+                            /*out*/ size_t& minibatchSize)
     {
         wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epochNumber));
         if (!fexists(checkPointFileName.c_str()))
@@ -1862,20 +1883,19 @@ class SGD : ComputationNetworkHelper<ElemType>

         if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize"))
         {
-            //fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize");
             fstream >> minibatchSize;
             fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EMinibatchSize");
         }
         else
         {
-            minibatchSize =m_mbSize[epochNumber];
+            minibatchSize = m_mbSize[epochNumber];
         }

         fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BGradient");

         for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
         {
-            Matrix<ElemType>& smoothedGradient = (*smoothedGradientIter);
+            Matrix<ElemType>& smoothedGradient = *smoothedGradientIter;
             fstream >> smoothedGradient;
         }
         fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EGradient");
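A common thread in the L2, momentum, and L1 hunks above: hyperparameters are specified per sample (or per expected minibatch) and rescaled by the actual minibatch size, so the effective update stays invariant when adaptive minibatch sizing changes the batch. Note that exp(log(m) / expectedMBSize * actualMBSize) is exactly m^(actualMBSize / expectedMBSize), i.e. log-linear scaling despite what the comment says. A small sketch under those assumptions (struct and function names are illustrative):

    #include <cmath>
    #include <cstddef>

    // Sketch of the minibatch-size rescalings above (names are illustrative).
    struct ScaledHyperparams { double momentum, l2Term, l1Threshold; };

    ScaledHyperparams RescaleForMinibatch(double momentumPerMB, double l2RegWeight,
                                          double l1RegWeight, double learnRatePerSample,
                                          std::size_t actualMBSize, std::size_t expectedMBSize)
    {
        ScaledHyperparams h;
        // momentum^(actual/expected): same value as exp(log(m)/expected*actual)
        h.momentum = std::exp(std::log(momentumPerMB) / expectedMBSize * actualMBSize);
        h.l2Term = l2RegWeight * actualMBSize;                            // added to the gradient
        h.l1Threshold = learnRatePerSample * l1RegWeight * actualMBSize;  // soft-threshold radius
        return h;
    }
    // e.g. momentumPerMB = 0.9, expected = 256, actual = 128 -> momentum ~ 0.9487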
@@ -1905,12 +1925,12 @@ class SGD : ComputationNetworkHelper<ElemType>

     }

-    //return -1 if nothing exists
+    // return -1 if nothing exists
     int DetermineStartEpoch(const bool makeMode)
     {
         if (!makeMode)
         {
-            //always start from scratch
+            // always start from scratch
             return -1;
         }
@@ -2048,7 +2068,7 @@ class SGD : ComputationNetworkHelper<ElemType>

                 node->UpdateEvalTimeStamp();

-                //use only the first criterion. Is there any possibility to use more?
+                // use only the first criterion. Is there any possibility to use more?
                 net.ComputeGradient(criterionNodes[npos]);

                 if (node->GradientValues().GetMatrixType() == MatrixType::SPARSE)
@@ -2092,7 +2112,7 @@ class SGD : ComputationNetworkHelper<ElemType>
                 node->UpdateEvalTimeStamp();
                 net.Evaluate(criterionNodes[npos]);
-                //criterionNode should be a scalar
+                // criterionNode should be a scalar
                 ElemType mbEvalCriNeg = criterionNodes[npos]->FunctionValues().Get00Element();

                 // back to its original parameter value
@@ -2168,7 +2188,7 @@ class SGD : ComputationNetworkHelper<ElemType>
         ElemType m_reduceLearnRateIfImproveLessThan;
         bool m_continueReduce;

-        //determine after how many epochs the learning rate should be auto adjusted.
+        // determine after how many epochs the learning rate should be auto adjusted.
         size_t m_learnRateAdjustInterval;

         ElemType m_increaseLearnRateIfImproveMoreThan;
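The two hunks above in the gradient checker follow the usual central-difference scheme: compute the analytic gradient once via backprop, then re-evaluate the criterion with one parameter perturbed in each direction (mbEvalCriNeg is the negative-side evaluation) and compare. A compact sketch of such a check; the epsilon and tolerance values are illustrative, not taken from the source:

    #include <algorithm>
    #include <cmath>
    #include <functional>

    // Compact sketch of a central-difference gradient check in the spirit of
    // the hunks above, for a single weight.
    bool CheckGradient(const std::function<double(double)>& criterion, // criterion as a function of one weight
                       double w,            // current weight value
                       double analyticGrad, // gradient from backprop
                       double eps = 1e-4,
                       double relTol = 1e-3)
    {
        double mbEvalCriPos = criterion(w + eps); // criterion at w + eps
        double mbEvalCriNeg = criterion(w - eps); // criterion at w - eps
        double numericGrad = (mbEvalCriPos - mbEvalCriNeg) / (2 * eps);
        double denom = std::max(std::fabs(analyticGrad), std::fabs(numericGrad));
        return denom == 0 || std::fabs(analyticGrad - numericGrad) / denom <= relTol;
    }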