Added momentumPerSample config option to specify per-sample momentum value

that allows using a fixed momentum setting per sample, which automatically
scales the effective momentum as the minibatch size changes.
amitaga committed Jul 11, 2015
1 parent 1d67bc7 commit dfa6382
Showing 3 changed files with 87 additions and 42 deletions.
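
The idea behind the change: with a fixed per-sample momentum m, the effective momentum applied to a minibatch of N samples is m^N, so the smoothing per sample stays constant no matter how the minibatch size is chosen. A minimal standalone C++ sketch of that scaling (illustrative, not code from the commit):

#include <cmath>
#include <cstddef>
#include <cstdio>

// Effective minibatch momentum implied by a fixed per-sample momentum,
// mirroring the commit's MomentumPerMB(): momentumPerSample ^ minibatchSize.
static double EffectiveMomentumPerMB(double momentumPerSample, size_t minibatchSize)
{
    return std::exp(std::log(momentumPerSample) * static_cast<double>(minibatchSize));
}

int main()
{
    const double momentumPerSample = 0.9996; // value from the documentation example
    for (size_t mbSize : {64, 256, 1024})
        std::printf("minibatch size %4zu -> effective momentum %.4f\n",
                    mbSize, EffectiveMomentumPerMB(momentumPerSample, mbSize));
    return 0;
}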
25 changes: 25 additions & 0 deletions Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx
@@ -1473,6 +1473,31 @@ momentumPerMB
 
+\begin_layout Itemize
+
+\emph on
+momentumPerSample
+\emph default
+
+\begin_inset Index idx
+status open
+
+\begin_layout Plain Layout
+momentumPerSample
+\end_layout
+
+\end_inset
+
+: momentum per sample.
+Useful when you want to keep the per-sample momentum constant, i.e., the
+effective momentum for the minibatch is automatically scaled when the
+minibatch size changes.
+Can use syntax such as 0.9996*10:0.998, which means using the per-sample
+momentum 0.9996 for 10 epochs and then 0.998 for the rest.
+momentumPerSample may be omitted, for example, when momentumPerMB is
+provided.
+\end_layout
+
 \begin_layout Itemize
 
 \emph on
 autoAdjust
 \emph default
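The 0.9996*10:0.998 schedule documented above expands to a per-epoch vector: 0.9996 repeated for 10 epochs, then 0.998 for all remaining epochs (the last value holds). A small illustrative sketch of this repeat-and-hold expansion; the helper below is hypothetical and stands in for CNTK's actual floatargvector/ConfigArray parsing:

#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical expansion of a "value*repeat:value" schedule string into
// per-epoch values; epochs past the end keep the final value.
static std::vector<float> ExpandSchedule(const std::string& spec)
{
    std::vector<float> values;
    std::stringstream ss(spec);
    std::string token;
    while (std::getline(ss, token, ':'))  // e.g. "0.9996*10", then "0.998"
    {
        const size_t star = token.find('*');
        const float value = std::strtof(token.substr(0, star).c_str(), nullptr);
        const size_t repeat = (star == std::string::npos)
            ? 1
            : std::strtoul(token.c_str() + star + 1, nullptr, 10);
        values.insert(values.end(), repeat, value);
    }
    return values;  // ExpandSchedule("0.9996*10:0.998") -> ten 0.9996s, then 0.998
}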
6 changes: 2 additions & 4 deletions MachineLearning/CNTK/MultiNetworksSGD.h
@@ -46,7 +46,7 @@ namespace Microsoft {
             using SGD::m_doUnitTest;
             using SGD::m_learnRateAdjustInterval;
             using SGD::m_mbSize;
-            using SGD::m_momentumInputPerMB;
+            using SGD::m_momentumPerSample;
             using SGD::m_learningRatesPerSample;
             using SGD::m_dropoutRates;
             using SGD::m_autoLearnRateSearchType;
@@ -308,7 +308,6 @@ namespace Microsoft {
             if (startEpoch > 0)
             {
                 learnRateInitialized = this->LoadCheckPointInfo(startEpoch - 1, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion, m_prevChosenMinibatchSize);
-                setMomentum(m_momentumInputPerMB[m_momentumInputPerMB.size() - 1]);
             }
 
             if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && !learnRateInitialized && m_learningRatesPerSample.size() <= startEpoch)
@@ -331,7 +330,6 @@
                 if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || (m_learningRatesPerSample.size() > 0 && m_learningRatesPerSample.size() > i))
                 {
                     learnRatePerSample = m_learningRatesPerSample[i];
-                    setMomentum(m_momentumInputPerMB[i]);
                 }
                 else if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
                 {
@@ -619,7 +617,7 @@ namespace Microsoft {
                     ComputationNodePtr node = (*nodeIter);
                     Matrix<ElemType>& smoothedGradient = (*smoothedGradientIter);
 
-                    UpdateWeights(node, smoothedGradient, learnRatePerSample, actualMBSize, m_mbSize[epochNumber], m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier);
+                    UpdateWeights(node, smoothedGradient, learnRatePerSample, m_momentumPerSample[epochNumber], actualMBSize, m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier);
                 }
             }
 
98 changes: 60 additions & 38 deletions MachineLearning/CNTK/SGD.h
@@ -185,6 +185,9 @@ class SGD : ComputationNetworkHelper<ElemType>
         ConfigArray momentumPerMBStr = configSGD("momentumPerMB", "");
         floatargvector momentumPerMB = momentumPerMBStr;
 
+        ConfigArray momentumPerSampleStr = configSGD("momentumPerSample", "");
+        floatargvector momentumPerSample = momentumPerSampleStr;
+
         wstring modelPath = configSGD("modelPath");
         wstring trainCriterionNodeName = configSGD("trainCriterionNodeName", "");
         wstring evalCriterionNodeName = configSGD("evalCriterionNodeName", "");
@@ -247,7 +250,7 @@ class SGD : ComputationNetworkHelper<ElemType>
         bool UsingAllDataForPreComputedNode = configSGD("UseAllDataForPreComputedNode", "true");
 
         Init(learningRatesPerMB, learningRatesPerSample, mbSize, epochSize,
-             maxEpochs, modelPath, momentumPerMB,
+             maxEpochs, modelPath, momentumPerMB, momentumPerSample,
              gradientClippingWithTruncation, clippingThresholdPerSample,
              autoAdjustLRType, increaseLearnRateIfImproveMoreThan,
              learnRateIncreaseFactor, reduceLearnRateIfImproveLessThan,
@@ -263,11 +266,6 @@
              autoAdjustMinibatch, minibatchSizeTuningFrequency, minibatchSizeTuningMax);
     }
 
-    void setMomentum(float momentum)
-    {
-        m_momentumPerMB = (ElemType) momentum;
-    }
-
     //autoLearnRateSearchType is applied only if the learning rate for the epoch is not specified in learningRatesPerMB and learningRatesPerSample
     void Init(const floatargvector& learningRatesPerMB,
               const floatargvector& learningRatesPerSample,
@@ -276,6 +274,7 @@ class SGD : ComputationNetworkHelper<ElemType>
               const size_t maxEpochs,
               const wstring& modelPath,
               const floatargvector& momentumPerMB,
+              const floatargvector& momentumPerSample,
               const bool gradientClippingWithTruncation = true,
               const ElemType clippingThresholdPerSample = std::numeric_limits<ElemType>::infinity(),
               const LearningRateSearchAlgorithm autoLearnRateSearchType = LearningRateSearchAlgorithm::None,
@@ -402,13 +401,44 @@ class SGD : ComputationNetworkHelper<ElemType>
             m_needToNormalizeLRByParallUtterance = true;
         }
 
-        m_momentumPerMB = 0.9f;
-        if (momentumPerMB.size() > 0)
+        if (momentumPerSample.size() > 0 && momentumPerMB.size() > 0)
         {
+            throw std::invalid_argument(
+                "You specified both momentumPerSample and momentumPerMB. Please comment out one of them.");
+        }
+        else if (momentumPerSample.size() > 0)
+        {
+            m_momentumPerSample = momentumPerSample;
+            int momentumVectorSize = m_momentumPerSample.size();
+            for (int i = 0; i < momentumVectorSize; i++)
+            {
+                if ((m_momentumPerSample[i] >= 1) || (m_momentumPerSample[i] < 0))
+                {
+                    throw std::invalid_argument("momentumPerSample must be in [0, 1).");
+                }
+            }
+        }
+        else if (momentumPerMB.size() > 0)
+        {
+            int momentumVectorSize = (int)max(momentumPerMB.size(), m_mbSize.size());
+            m_momentumPerSample.resize(momentumVectorSize);
+            for (int i = 0; i < momentumVectorSize; i++)
+            {
+                if ((momentumPerMB[i] >= 1) || (momentumPerMB[i] < 0))
+                {
+                    throw std::invalid_argument("momentumPerMB must be in [0, 1).");
+                }
+
+                m_momentumPerSample[i] = exp(log(momentumPerMB[i]) / m_mbSize[i]);
+            }
+        }
+        else
+        {
-            m_momentumInputPerMB = momentumPerMB;
-            if (m_momentumInputPerMB[0] >= 1 || m_momentumInputPerMB[0] < 0)
+            int momentumVectorSize = m_mbSize.size();
+            m_momentumPerSample.resize(momentumVectorSize);
+            for (int i = 0; i < momentumVectorSize; i++)
             {
-                throw std::invalid_argument("momentumPerMB must be in [0, 1).");
+                m_momentumPerSample[i] = exp(log(0.9f) / m_mbSize[i]);
             }
         }
 
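Net effect of the branch above: every momentum specification is normalized to a per-sample value. momentumPerSample is used directly after range checking, momentumPerMB is converted via momentumPerSample = momentumPerMB^(1/mbSize), and when neither is given the old default of 0.9 per minibatch becomes 0.9^(1/mbSize). A condensed sketch of that conversion (illustrative; the helper name is hypothetical):

#include <cmath>
#include <cstddef>
#include <stdexcept>

// Hypothetical helper condensing the momentumPerMB branch above: the
// per-sample momentum m satisfying m^mbSize == momentumPerMB.
static float PerSampleFromPerMB(float momentumPerMB, size_t mbSize)
{
    if (momentumPerMB >= 1.0f || momentumPerMB < 0.0f)
        throw std::invalid_argument("momentumPerMB must be in [0, 1).");
    return std::exp(std::log(momentumPerMB) / static_cast<float>(mbSize));
}

// e.g. PerSampleFromPerMB(0.9f, 256) ~= 0.999589, since 0.999589^256 ~= 0.9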
@@ -767,8 +797,6 @@ class SGD : ComputationNetworkHelper<ElemType>
             {
                 prevLearnRates[startEpoch % m_numPrevLearnRates] = learnRatePerSample;
             }
-
-            setMomentum(m_momentumInputPerMB[startEpoch]);
         }
 
         if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch &&
@@ -797,8 +825,6 @@
             // set dropout rate
             SetDropoutRate(net, criterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
 
-            setMomentum(m_momentumInputPerMB[i]);
-
             // learning rate adjustment
             if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None ||
                 (m_learningRatesPerSample.size() > 0 && m_learningRatesPerSample.size() > i))
@@ -842,9 +868,6 @@
 #ifdef MPI_SUPPORT
             INT32 mySamples = (INT32)
 #endif
-            fprintf(stderr, "Starting Epoch %d: learning rate per sample = %f momentum = %f \n",
-                    i + 1, learnRatePerSample, m_momentumPerMB);
-
             size_t chosenMinibatchSize;
 
             // Through the command line or config file the user can set minibatch sizes on a per epoch
@@ -875,6 +898,9 @@
                 chosenMinibatchSize = m_mbSize[i];
             }
 
+            fprintf(stderr, "Starting Epoch %d: learning rate per sample = %f momentum = %f \n",
+                    i + 1, learnRatePerSample, MomentumPerMB(m_momentumPerSample[i], chosenMinibatchSize));
+
             TrainOneEpoch(net, refNet, refNode, i, m_epochSize,
                           trainSetDataReader, learnRatePerSample, chosenMinibatchSize, FeatureNodes,
                           labelNodes, criterionNodes, evaluationNodes,
@@ -1738,7 +1764,7 @@ class SGD : ComputationNetworkHelper<ElemType>
                 Matrix<ElemType>& smoothedGradient = *smoothedGradientIter;
 
                 UpdateWeights(node, smoothedGradient, learnRatePerSample,
-                              actualMBSize, m_mbSize[epochNumber],
+                              m_momentumPerSample[epochNumber], actualMBSize,
                               m_L2RegWeight, m_L1RegWeight,
                               m_needAveMultiplier);
             }
@@ -1826,16 +1852,19 @@ class SGD : ComputationNetworkHelper<ElemType>
                                Matrix<ElemType>& gradientValues,
                                Matrix<ElemType>& smoothedGradient,
                                const ElemType learnRatePerSample,
-                               size_t actualMBSize, const size_t expectedMBSize,
+                               const ElemType momentumPerSample,
+                               size_t actualMBSize,
                                const ElemType L2RegWeight,
                                const ElemType L1RegWeight,
                                const bool needAveMultiplier)
     {
+        // we use simple linear (instead of log linear) scaling here
+        const ElemType momentum = MomentumPerMB(momentumPerSample, actualMBSize);
 #if DUMPOUTPUT
-        fprintf(stderr, "learnRatePerSample=%0.8f, actualMBSize=%ld, expectedMBSize=%ld\n",
-                learnRatePerSample, actualMBSize, expectedMBSize);
-        fprintf(stderr, "sgd->GradUpdateType()=%d, sgd->GradientUpdateNoiseStd()=%0.8f, sgd->MomentumPerMB()=%0.8f\n",
-                sgd->GradUpdateType(), sgd->GradientUpdateNoiseStd(), sgd->MomentumPerMB());
+        fprintf(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
+                learnRatePerSample, momentum, actualMBSize);
+        fprintf(stderr, "sgd->GradUpdateType()=%d, sgd->GradientUpdateNoiseStd()=%0.8f\n",
+                sgd->GradUpdateType(), sgd->GradientUpdateNoiseStd());
         gradientValues.Print("Gradient Input");
         smoothedGradient.Print("Smoothed Gradient Input");
 #endif
@@ -1867,13 +1896,6 @@ class SGD : ComputationNetworkHelper<ElemType>
 
         if (adpType == GradientsUpdateType::None)
         {
-            ElemType momentum = sgd->MomentumPerMB();
-
-            // we use simple linear (instead of log linear) scaling here
-            if (actualMBSize < expectedMBSize && momentum > 0.0000001f)
-            {
-                momentum = (ElemType) exp(log(momentum) / expectedMBSize * actualMBSize);
-            }
             smoothedGradient.NormalGrad(gradientValues, functionValues,
                                         learnRatePerSample, momentum);
         }
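Worth noting what this hunk deletes: the old code took the per-MB momentum as configured and rescaled it only when the actual minibatch came up smaller than the expected one, whereas the new code (at the top of UpdateWeightsS above) always derives the momentum from the per-sample value and the actual minibatch size. A side-by-side sketch of the two policies (illustrative only):

#include <cmath>
#include <cstddef>

// Old policy (sketch): per-MB momentum, rescaled only for short minibatches.
static float OldMomentum(float momentumPerMB, size_t actualMBSize, size_t expectedMBSize)
{
    float momentum = momentumPerMB;
    if (actualMBSize < expectedMBSize && momentum > 0.0000001f)
        momentum = std::exp(std::log(momentum) / expectedMBSize * actualMBSize);
    return momentum;
}

// New policy (sketch): always scale the per-sample momentum up to the
// actual minibatch size, as MomentumPerMB(momentumPerSample, actualMBSize) does.
static float NewMomentum(float momentumPerSample, size_t actualMBSize)
{
    return std::exp(std::log(momentumPerSample) * actualMBSize);
}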
@@ -1915,16 +1937,17 @@ class SGD : ComputationNetworkHelper<ElemType>
     void UpdateWeights(const ComputationNodePtr node,
                        Matrix<ElemType>& smoothedGradient,
                        const ElemType learnRatePerSample,
-                       const size_t actualMBSize, const size_t expectedMBSize,
+                       const ElemType momentumPerSample,
+                       const size_t actualMBSize,
                        const ElemType L2RegWeight, const ElemType L1RegWeight,
                        const bool needAveMultiplier) const
     {
 #if DUMPOUTPUT
         fprintf(stderr, "Update_%ls\n",node->NodeName().c_str());
 #endif
         UpdateWeightsS(this, node->FunctionValues(), node->GradientValues(),
-                       smoothedGradient, learnRatePerSample, actualMBSize,
-                       expectedMBSize, L2RegWeight, L1RegWeight,
+                       smoothedGradient, learnRatePerSample, momentumPerSample,
+                       actualMBSize, L2RegWeight, L1RegWeight,
                        needAveMultiplier);
         node->UpdateEvalTimeStamp();
     }
@@ -2155,9 +2178,9 @@ class SGD : ComputationNetworkHelper<ElemType>
         return m_gradType.mGaussianNoiseInjectStd;
     }
 
-    ElemType MomentumPerMB() const
+    static ElemType MomentumPerMB(ElemType momentumPerSample, size_t minibatchSize)
     {
-        return m_momentumPerMB;
+        return exp(log(momentumPerSample) * minibatchSize);
     }
 
 public:
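MomentumPerMB() is now a pure static function of the per-sample momentum and the minibatch size, the exact inverse of the per-MB-to-per-sample conversion done in Init(). A standalone round-trip check (illustrative, not code from the commit):

#include <cassert>
#include <cmath>
#include <cstddef>

// Same formula as the static helper above: momentumPerSample ^ minibatchSize.
static double MomentumPerMB(double momentumPerSample, size_t minibatchSize)
{
    return std::exp(std::log(momentumPerSample) * minibatchSize);
}

int main()
{
    const size_t mbSize = 256;
    const double perMB = 0.9;
    const double perSample = std::exp(std::log(perMB) / mbSize); // Init()'s conversion
    assert(std::abs(MomentumPerMB(perSample, mbSize) - perMB) < 1e-9);
    return 0;
}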
@@ -2292,8 +2315,7 @@ class SGD : ComputationNetworkHelper<ElemType>
     // the total number of epochs to run.
     size_t m_maxEpochs;
 
-    floatargvector m_momentumInputPerMB;
-    ElemType m_momentumPerMB;
+    floatargvector m_momentumPerSample;
     bool m_gradientClippingWithTruncation;
     ElemType m_clippingThresholdPerSample;
 
