Added momentumPerSample config option to specify per-sample momentum value

that allows using a fixed momentum setting per sample, which automatically
scales the effective momentum as the minibatch size changes.
amitaga committed Jul 11, 2015
1 parent 1d67bc7 commit dfa6382
Showing 3 changed files with 87 additions and 42 deletions.
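
The idea behind the change: with a fixed per-sample momentum m, the effective momentum applied to a minibatch of N samples is m^N, so the smoothing per sample stays constant no matter how the minibatch size is chosen. A minimal standalone C++ sketch of that scaling (illustrative, not code from the commit):

#include <cmath>
#include <cstddef>
#include <cstdio>

// Effective minibatch momentum implied by a fixed per-sample momentum,
// mirroring the commit's MomentumPerMB(): momentumPerSample ^ minibatchSize.
static double EffectiveMomentumPerMB(double momentumPerSample, size_t minibatchSize)
{
    return std::exp(std::log(momentumPerSample) * static_cast<double>(minibatchSize));
}

int main()
{
    const double momentumPerSample = 0.9996; // value from the documentation example
    for (size_t mbSize : {64, 256, 1024})
        std::printf("minibatch size %4zu -> effective momentum %.4f\n",
                    mbSize, EffectiveMomentumPerMB(momentumPerSample, mbSize));
    return 0;
}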
25 changes: 25 additions & 0 deletions Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx
@@ -1473,6 +1473,31 @@ momentumPerMB
 
+\begin_layout Itemize
+
+\emph on
+momentumPerSample
+\emph default
+
+\begin_inset Index idx
+status open
+
+\begin_layout Plain Layout
+momentumPerSample
+\end_layout
+
+\end_inset
+
+: momentum per sample.
+Useful when you want to keep the per-sample momentum constant, i.e., the
+effective momentum for the minibatch is automatically scaled when the
+minibatch size changes.
+Can use syntax such as 0.9996*10:0.998, which means using the per-sample
+momentum 0.9996 for 10 epochs and then 0.998 for the rest.
+momentumPerSample may be omitted, for example, when momentumPerMB is
+provided.
+\end_layout
+
 \begin_layout Itemize
 
 \emph on
 autoAdjust
 \emph default
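The 0.9996*10:0.998 schedule documented above expands to a per-epoch vector: 0.9996 repeated for 10 epochs, then 0.998 for all remaining epochs (the last value holds). A small illustrative sketch of this repeat-and-hold expansion; the helper below is hypothetical and stands in for CNTK's actual floatargvector/ConfigArray parsing:

#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical expansion of a "value*repeat:value" schedule string into
// per-epoch values; epochs past the end keep the final value.
static std::vector<float> ExpandSchedule(const std::string& spec)
{
    std::vector<float> values;
    std::stringstream ss(spec);
    std::string token;
    while (std::getline(ss, token, ':'))  // e.g. "0.9996*10", then "0.998"
    {
        const size_t star = token.find('*');
        const float value = std::strtof(token.substr(0, star).c_str(), nullptr);
        const size_t repeat = (star == std::string::npos)
            ? 1
            : std::strtoul(token.c_str() + star + 1, nullptr, 10);
        values.insert(values.end(), repeat, value);
    }
    return values;  // ExpandSchedule("0.9996*10:0.998") -> ten 0.9996s, then 0.998
}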
6 changes: 2 additions & 4 deletions MachineLearning/CNTK/MultiNetworksSGD.h
@@ -46,7 +46,7 @@ namespace Microsoft {
             using SGD::m_doUnitTest;
             using SGD::m_learnRateAdjustInterval;
             using SGD::m_mbSize;
-            using SGD::m_momentumInputPerMB;
+            using SGD::m_momentumPerSample;
             using SGD::m_learningRatesPerSample;
             using SGD::m_dropoutRates;
             using SGD::m_autoLearnRateSearchType;
@@ -308,7 +308,6 @@ namespace Microsoft {
             if (startEpoch > 0)
             {
                 learnRateInitialized = this->LoadCheckPointInfo(startEpoch - 1, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion, m_prevChosenMinibatchSize);
-                setMomentum(m_momentumInputPerMB[m_momentumInputPerMB.size() - 1]);
             }
 
             if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && !learnRateInitialized && m_learningRatesPerSample.size() <= startEpoch)
@@ -331,7 +330,6 @@
                 if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || (m_learningRatesPerSample.size() > 0 && m_learningRatesPerSample.size() > i))
                 {
                     learnRatePerSample = m_learningRatesPerSample[i];
-                    setMomentum(m_momentumInputPerMB[i]);
                 }
                 else if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
                 {
@@ -619,7 +617,7 @@ namespace Microsoft {
                     ComputationNodePtr node = (*nodeIter);
                     Matrix<ElemType>& smoothedGradient = (*smoothedGradientIter);
 
-                    UpdateWeights(node, smoothedGradient, learnRatePerSample, actualMBSize, m_mbSize[epochNumber], m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier);
+                    UpdateWeights(node, smoothedGradient, learnRatePerSample, m_momentumPerSample[epochNumber], actualMBSize, m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier);
                 }
             }
 
98 changes: 60 additions & 38 deletions MachineLearning/CNTK/SGD.h
@@ -185,6 +185,9 @@ class SGD : ComputationNetworkHelper<ElemType>
         ConfigArray momentumPerMBStr = configSGD("momentumPerMB", "");
         floatargvector momentumPerMB = momentumPerMBStr;
 
+        ConfigArray momentumPerSampleStr = configSGD("momentumPerSample", "");
+        floatargvector momentumPerSample = momentumPerSampleStr;
+
         wstring modelPath = configSGD("modelPath");
         wstring trainCriterionNodeName = configSGD("trainCriterionNodeName", "");
         wstring evalCriterionNodeName = configSGD("evalCriterionNodeName", "");
@@ -247,7 +250,7 @@ class SGD : ComputationNetworkHelper<ElemType>
         bool UsingAllDataForPreComputedNode = configSGD("UseAllDataForPreComputedNode", "true");
 
         Init(learningRatesPerMB, learningRatesPerSample, mbSize, epochSize,
-             maxEpochs, modelPath, momentumPerMB,
+             maxEpochs, modelPath, momentumPerMB, momentumPerSample,
              gradientClippingWithTruncation, clippingThresholdPerSample,
              autoAdjustLRType, increaseLearnRateIfImproveMoreThan,
              learnRateIncreaseFactor, reduceLearnRateIfImproveLessThan,
@@ -263,11 +266,6 @@
              autoAdjustMinibatch, minibatchSizeTuningFrequency, minibatchSizeTuningMax);
     }
 
-    void setMomentum(float momentum)
-    {
-        m_momentumPerMB = (ElemType) momentum;
-    }
-
     //autoLearnRateSearchType is applied only if the learning rate for the epoch is not specified in learningRatesPerMB and learningRatesPerSample
     void Init(const floatargvector& learningRatesPerMB,
               const floatargvector& learningRatesPerSample,
@@ -276,6 +274,7 @@ class SGD : ComputationNetworkHelper<ElemType>
               const size_t maxEpochs,
               const wstring& modelPath,
               const floatargvector& momentumPerMB,
+              const floatargvector& momentumPerSample,
               const bool gradientClippingWithTruncation = true,
               const ElemType clippingThresholdPerSample = std::numeric_limits<ElemType>::infinity(),
               const LearningRateSearchAlgorithm autoLearnRateSearchType = LearningRateSearchAlgorithm::None,
@@ -402,13 +401,44 @@ class SGD : ComputationNetworkHelper<ElemType>
             m_needToNormalizeLRByParallUtterance = true;
         }
 
-        m_momentumPerMB = 0.9f;
-        if (momentumPerMB.size() > 0)
+        if (momentumPerSample.size() > 0 && momentumPerMB.size() > 0)
         {
+            throw std::invalid_argument(
+                "You specified both momentumPerSample and momentumPerMB. Please comment out one of them.");
+        }
+        else if (momentumPerSample.size() > 0)
+        {
+            m_momentumPerSample = momentumPerSample;
+            int momentumVectorSize = m_momentumPerSample.size();
+            for (int i = 0; i < momentumVectorSize; i++)
+            {
+                if ((m_momentumPerSample[i] >= 1) || (m_momentumPerSample[i] < 0))
+                {
+                    throw std::invalid_argument("momentumPerSample must be in [0, 1).");
+                }
+            }
+        }
+        else if (momentumPerMB.size() > 0)
+        {
+            int momentumVectorSize = (int)max(momentumPerMB.size(), m_mbSize.size());
+            m_momentumPerSample.resize(momentumVectorSize);
+            for (int i = 0; i < momentumVectorSize; i++)
+            {
+                if ((momentumPerMB[i] >= 1) || (momentumPerMB[i] < 0))
+                {
+                    throw std::invalid_argument("momentumPerMB must be in [0, 1).");
+                }
+
+                m_momentumPerSample[i] = exp(log(momentumPerMB[i]) / m_mbSize[i]);
+            }
+        }
+        else
+        {
-            m_momentumInputPerMB = momentumPerMB;
-            if (m_momentumInputPerMB[0] >= 1 || m_momentumInputPerMB[0] < 0)
+            int momentumVectorSize = m_mbSize.size();
+            m_momentumPerSample.resize(momentumVectorSize);
+            for (int i = 0; i < momentumVectorSize; i++)
             {
-                throw std::invalid_argument("momentumPerMB must be in [0, 1).");
+                m_momentumPerSample[i] = exp(log(0.9f) / m_mbSize[i]);
             }
         }
 
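Net effect of the branch above: every momentum specification is normalized to a per-sample value. momentumPerSample is used directly after range checking, momentumPerMB is converted via momentumPerSample = momentumPerMB^(1/mbSize), and when neither is given the old default of 0.9 per minibatch becomes 0.9^(1/mbSize). A condensed sketch of that conversion (illustrative; the helper name is hypothetical):

#include <cmath>
#include <cstddef>
#include <stdexcept>

// Hypothetical helper condensing the momentumPerMB branch above: the
// per-sample momentum m satisfying m^mbSize == momentumPerMB.
static float PerSampleFromPerMB(float momentumPerMB, size_t mbSize)
{
    if (momentumPerMB >= 1.0f || momentumPerMB < 0.0f)
        throw std::invalid_argument("momentumPerMB must be in [0, 1).");
    return std::exp(std::log(momentumPerMB) / static_cast<float>(mbSize));
}

// e.g. PerSampleFromPerMB(0.9f, 256) ~= 0.999589, since 0.999589^256 ~= 0.9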
@@ -767,8 +797,6 @@ class SGD : ComputationNetworkHelper<ElemType>
             {
                 prevLearnRates[startEpoch % m_numPrevLearnRates] = learnRatePerSample;
             }
-
-            setMomentum(m_momentumInputPerMB[startEpoch]);
         }
 
         if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch &&
@@ -797,8 +825,6 @@
             // set dropout rate
             SetDropoutRate(net, criterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
 
-            setMomentum(m_momentumInputPerMB[i]);
-
             // learning rate adjustment
             if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None ||
                 (m_learningRatesPerSample.size() > 0 && m_learningRatesPerSample.size() > i))
@@ -842,9 +868,6 @@
 #ifdef MPI_SUPPORT
             INT32 mySamples = (INT32)
 #endif
-            fprintf(stderr, "Starting Epoch %d: learning rate per sample = %f momentum = %f \n",
-                    i + 1, learnRatePerSample, m_momentumPerMB);
-
             size_t chosenMinibatchSize;
 
             // Through the command line or config file the user can set minibatch sizes on a per epoch
@@ -875,6 +898,9 @@
                 chosenMinibatchSize = m_mbSize[i];
             }
 
+            fprintf(stderr, "Starting Epoch %d: learning rate per sample = %f momentum = %f \n",
+                    i + 1, learnRatePerSample, MomentumPerMB(m_momentumPerSample[i], chosenMinibatchSize));
+
             TrainOneEpoch(net, refNet, refNode, i, m_epochSize,
                           trainSetDataReader, learnRatePerSample, chosenMinibatchSize, FeatureNodes,
                           labelNodes, criterionNodes, evaluationNodes,
@@ -1738,7 +1764,7 @@ class SGD : ComputationNetworkHelper<ElemType>
                 Matrix<ElemType>& smoothedGradient = *smoothedGradientIter;
 
                 UpdateWeights(node, smoothedGradient, learnRatePerSample,
-                              actualMBSize, m_mbSize[epochNumber],
+                              m_momentumPerSample[epochNumber], actualMBSize,
                               m_L2RegWeight, m_L1RegWeight,
                               m_needAveMultiplier);
             }
@@ -1826,16 +1852,19 @@ class SGD : ComputationNetworkHelper<ElemType>
                                Matrix<ElemType>& gradientValues,
                                Matrix<ElemType>& smoothedGradient,
                                const ElemType learnRatePerSample,
-                               size_t actualMBSize, const size_t expectedMBSize,
+                               const ElemType momentumPerSample,
+                               size_t actualMBSize,
                                const ElemType L2RegWeight,
                                const ElemType L1RegWeight,
                                const bool needAveMultiplier)
     {
+        // we use simple linear (instead of log linear) scaling here
+        const ElemType momentum = MomentumPerMB(momentumPerSample, actualMBSize);
 #if DUMPOUTPUT
-        fprintf(stderr, "learnRatePerSample=%0.8f, actualMBSize=%ld, expectedMBSize=%ld\n",
-                learnRatePerSample, actualMBSize, expectedMBSize);
-        fprintf(stderr, "sgd->GradUpdateType()=%d, sgd->GradientUpdateNoiseStd()=%0.8f, sgd->MomentumPerMB()=%0.8f\n",
-                sgd->GradUpdateType(), sgd->GradientUpdateNoiseStd(), sgd->MomentumPerMB());
+        fprintf(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
+                learnRatePerSample, momentum, actualMBSize);
+        fprintf(stderr, "sgd->GradUpdateType()=%d, sgd->GradientUpdateNoiseStd()=%0.8f\n",
+                sgd->GradUpdateType(), sgd->GradientUpdateNoiseStd());
         gradientValues.Print("Gradient Input");
         smoothedGradient.Print("Smoothed Gradient Input");
 #endif
@@ -1867,13 +1896,6 @@ class SGD : ComputationNetworkHelper<ElemType>
 
         if (adpType == GradientsUpdateType::None)
         {
-            ElemType momentum = sgd->MomentumPerMB();
-
-            // we use simple linear (instead of log linear) scaling here
-            if (actualMBSize < expectedMBSize && momentum > 0.0000001f)
-            {
-                momentum = (ElemType) exp(log(momentum) / expectedMBSize * actualMBSize);
-            }
             smoothedGradient.NormalGrad(gradientValues, functionValues,
                                         learnRatePerSample, momentum);
         }
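Worth noting what this hunk deletes: the old code took the per-MB momentum as configured and rescaled it only when the actual minibatch came up smaller than the expected one, whereas the new code (at the top of UpdateWeightsS above) always derives the momentum from the per-sample value and the actual minibatch size. A side-by-side sketch of the two policies (illustrative only):

#include <cmath>
#include <cstddef>

// Old policy (sketch): per-MB momentum, rescaled only for short minibatches.
static float OldMomentum(float momentumPerMB, size_t actualMBSize, size_t expectedMBSize)
{
    float momentum = momentumPerMB;
    if (actualMBSize < expectedMBSize && momentum > 0.0000001f)
        momentum = std::exp(std::log(momentum) / expectedMBSize * actualMBSize);
    return momentum;
}

// New policy (sketch): always scale the per-sample momentum up to the
// actual minibatch size, as MomentumPerMB(momentumPerSample, actualMBSize) does.
static float NewMomentum(float momentumPerSample, size_t actualMBSize)
{
    return std::exp(std::log(momentumPerSample) * actualMBSize);
}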
@@ -1915,16 +1937,17 @@ class SGD : ComputationNetworkHelper<ElemType>
     void UpdateWeights(const ComputationNodePtr node,
                        Matrix<ElemType>& smoothedGradient,
                        const ElemType learnRatePerSample,
-                       const size_t actualMBSize, const size_t expectedMBSize,
+                       const ElemType momentumPerSample,
+                       const size_t actualMBSize,
                        const ElemType L2RegWeight, const ElemType L1RegWeight,
                        const bool needAveMultiplier) const
     {
 #if DUMPOUTPUT
         fprintf(stderr, "Update_%ls\n",node->NodeName().c_str());
 #endif
         UpdateWeightsS(this, node->FunctionValues(), node->GradientValues(),
-                       smoothedGradient, learnRatePerSample, actualMBSize,
-                       expectedMBSize, L2RegWeight, L1RegWeight,
+                       smoothedGradient, learnRatePerSample, momentumPerSample,
+                       actualMBSize, L2RegWeight, L1RegWeight,
                        needAveMultiplier);
         node->UpdateEvalTimeStamp();
     }
@@ -2155,9 +2178,9 @@ class SGD : ComputationNetworkHelper<ElemType>
         return m_gradType.mGaussianNoiseInjectStd;
     }
 
-    ElemType MomentumPerMB() const
+    static ElemType MomentumPerMB(ElemType momentumPerSample, size_t minibatchSize)
     {
-        return m_momentumPerMB;
+        return exp(log(momentumPerSample) * minibatchSize);
     }
 
 public:
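MomentumPerMB() is now a pure static function of the per-sample momentum and the minibatch size, the exact inverse of the per-MB-to-per-sample conversion done in Init(). A standalone round-trip check (illustrative, not code from the commit):

#include <cassert>
#include <cmath>
#include <cstddef>

// Same formula as the static helper above: momentumPerSample ^ minibatchSize.
static double MomentumPerMB(double momentumPerSample, size_t minibatchSize)
{
    return std::exp(std::log(momentumPerSample) * minibatchSize);
}

int main()
{
    const size_t mbSize = 256;
    const double perMB = 0.9;
    const double perSample = std::exp(std::log(perMB) / mbSize); // Init()'s conversion
    assert(std::abs(MomentumPerMB(perSample, mbSize) - perMB) < 1e-9);
    return 0;
}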
@@ -2292,8 +2315,7 @@ class SGD : ComputationNetworkHelper<ElemType>
     // the total number of epochs to run.
     size_t m_maxEpochs;
 
-    floatargvector m_momentumInputPerMB;
-    ElemType m_momentumPerMB;
+    floatargvector m_momentumPerSample;
     bool m_gradientClippingWithTruncation;
     ElemType m_clippingThresholdPerSample;
 
