diff --git a/Common/Include/DataReader.h b/Common/Include/DataReader.h index ffe11532829a..966ae4151c47 100644 --- a/Common/Include/DataReader.h +++ b/Common/Include/DataReader.h @@ -29,9 +29,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { -const size_t randomizeAuto = ((size_t)-1)>>2; // randomize range set automatically, parameter value for Init() -const size_t randomizeNone = 0; // don't randomize, parameter value for Init() -const size_t requestDataSize = randomizeAuto; // StartMinibatchLoop default parameter, sets number of requested frames equal to the number of frames in the dataset +// randomize range set automatically, parameter value for Init() +const size_t randomizeAuto = ((size_t) -1) >> 2; + +// don't randomize, parameter value for Init() +const size_t randomizeNone = 0; + +// StartMinibatchLoop default parameter, sets number of requested +// frames equal to the constant 3fffffffffffffff computed by ((size_t) -1) >> 2 above. +// We use this constant as a stand in for the total number of frames in the dataset. +const size_t requestDataSize = randomizeAuto; enum EndDataType { @@ -48,7 +55,7 @@ class DATAREADER_API IDataReader { public: typedef std::string LabelType; - typedef unsigned LabelIdType; + typedef unsigned int LabelIdType; unsigned m_seed; size_t mBlgSize; /// number of utterances per minibatch bool mDoRandomize; diff --git a/Common/Include/commandArgUtil.h b/Common/Include/commandArgUtil.h index 507a4cb1240c..c54a65fcd393 100644 --- a/Common/Include/commandArgUtil.h +++ b/Common/Include/commandArgUtil.h @@ -612,6 +612,7 @@ class ConfigParser // pop out of content level contentLevel = false; } + if (quoteFound) { // skip the closing quote @@ -660,7 +661,7 @@ class ConfigParser std::string ReadConfigFiles(const std::string& filePaths); std::string ReadConfigFiles(const std::wstring& filePaths); std::string ResolveIncludeStatements(const std::string& configString, std::vector& resolvedConfigFiles); - void LoadConfigFile(const std::wstring & filePath); + void LoadConfigFile(const std::wstring& filePath); void LoadConfigFileAndResolveVariables(const std::wstring& filePath, const ConfigParameters& config); void LoadConfigFiles(const std::wstring& filePaths, const std::string* configStringToAppend = nullptr); @@ -873,17 +874,17 @@ class ConfigParameters: public ConfigParser, public ConfigDictionary } // Insert - insert an 'name=value' string into the dictionary - void Insert(const std::string &str) + void Insert(const std::string& str) { ParseValue(str, 0, str.length()); } - bool Exists(const std::wstring & name) const + bool Exists(const std::wstring& name) const { return Exists(msra::strfun::utf8(name)); } - bool Exists(const std::string & name) const + bool Exists(const std::string& name) const { if (find(name) != end()) { @@ -899,42 +900,42 @@ class ConfigParameters: public ConfigParser, public ConfigDictionary } // ExistsCurrent - check to see if a key exists in THIS config, don't check parent - bool ExistsCurrent(const std::string & name) const + bool ExistsCurrent(const std::string& name) const { return (find(name) != end()); } // dict(name, default) for strings - ConfigValue operator()(const std::wstring & name, - const wchar_t *defaultvalue) const + ConfigValue operator()(const std::wstring& name, + const wchar_t* defaultvalue) const { return operator()(msra::strfun::utf8(name), defaultvalue); } // dict(name, default) for strings - ConfigValue operator()(const std::string & name, - const wchar_t *defaultvalue) const + ConfigValue operator()(const 
std::string& name, + const wchar_t* defaultvalue) const { return operator()(name, msra::strfun::utf8(defaultvalue).c_str()); } // dict(name, default) for strings - ConfigValue operator()(const std::wstring & name, - const char *defaultvalue) const + ConfigValue operator()(const std::wstring& name, + const char* defaultvalue) const { return operator()(msra::strfun::utf8(name), defaultvalue); } // dict(name, default) for strings - ConfigValue operator()(const std::string & name, - const char *defaultvalue) const + ConfigValue operator()(const std::string& name, + const char* defaultvalue) const { ConfigValue value = Find(name, defaultvalue); return value; } - ConfigValue Find(const std::string & name, - const char *defaultvalue = NULL) const + ConfigValue Find(const std::string& name, + const char* defaultvalue = NULL) const { auto iter = find(name); ConfigValue result; @@ -975,10 +976,11 @@ class ConfigParameters: public ConfigParser, public ConfigDictionary // any whitespace characters. If an opening "$" is found without a closing "$", an exception is thrown. // configString - the string that you would like to resolve variables in. // returns: A copy of 'configString' with all the variables resolved. - std::string ResolveVariablesInSingleLine(const std::string &configLine) const + std::string ResolveVariablesInSingleLine(const std::string& configLine) const { // ensure that this method was called on a single line (eg, no newline characters exist in 'configLine'). - if (configLine.find_first_of("\n") != std::string::npos) { + if (configLine.find_first_of("\n") != std::string::npos) + { throw std::logic_error( "\"ResolveVariablesInSingleLine\" shouldn't be called with a string containing a newline character"); } @@ -1053,7 +1055,7 @@ class ConfigParameters: public ConfigParser, public ConfigDictionary // we shouldn't insert newlines where they didn't already exist. // configString - the string that you would like to resolve variables in. // returns: A copy of 'configString' with all the variables resolved. 
- std::string ResolveVariables(const std::string &configString) const + std::string ResolveVariables(const std::string& configString) const { std::string newConfigString; if (configString.find_first_of("\n") != std::string::npos) @@ -1347,14 +1349,14 @@ class argvector: public std::vector RuntimeError("argvector: invalid arg value"); } } - static void parse(const std::wstring & in, std::wstring & val) + static void parse(const std::wstring& in, std::wstring& val) { val = in; } public: // constructor --construct empty, then assign a wstring from command-line argument - void operator=(const std::wstring & arg) + void operator=(const std::wstring& arg) { clear(); // separate the arguments @@ -1387,7 +1389,7 @@ class argvector: public std::vector } // constructor --use this for setting default values - argvector(const std::wstring & arg) + argvector(const std::wstring& arg) { *this = arg; } @@ -1438,7 +1440,7 @@ class argvector: public std::vector } // we give full read access to the vector, so we can use it bounded as well - const std::vector & tovector() const + const std::vector& tovector() const { return *this; } diff --git a/DataReader/LMSequenceReader/SequenceReader.cpp b/DataReader/LMSequenceReader/SequenceReader.cpp index 7f121009b01b..09c2b17866c2 100644 --- a/DataReader/LMSequenceReader/SequenceReader.cpp +++ b/DataReader/LMSequenceReader/SequenceReader.cpp @@ -2049,7 +2049,7 @@ void BatchSequenceReader::GetLabelOutput(std::mapTransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); + labels->TransferFromDeviceToDevice(CPUDEVICE, curDevId, false, false, false); } } diff --git a/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx b/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx index a387d6d9dba9..ebb01885eed5 100644 --- a/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx +++ b/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx @@ -1725,6 +1725,102 @@ numBestSearchEpoch \end_layout +\begin_layout Standard +Used in the Adaptive Minibatch Sizing mode. +\end_layout + +\begin_layout Itemize + +\emph on +numMiniBatch4LRSearch +\emph default + +\begin_inset Index idx +status open + +\begin_layout Plain Layout +numMiniBatch4LRSearch +\end_layout + +\end_inset + +: the number of minibatches used to search the minibatch size when +in adaptive minibatch size mode. + Default value is 500. + It's typically set to 10-20% of the total minibatches in an epoch. +This is shared with the search for learning rate in +SearchBeforeEpoch mode. + +\end_layout + +\begin_layout Itemize + +\emph on +autoAdjustMinibatch +\emph default + +\begin_inset Index idx +status open + +\begin_layout Plain Layout +autoAdjustMinibatch +\end_layout + +\end_inset + +: enable or disable whether the minibatch size is adaptively adjusted. + Default value is false. +Adaptive minibatch sizing will begin on +the epochs after those for which the user explicitly +specified minibatch sizes. For example, if the user +specified minibatchSize=256:1024, then 256 and 1024 +are used in the first 2 epochs and adaptive minibatch +sizing is used afterwards. + +\end_layout + +\begin_layout Itemize + +\emph on +minibatchSizeTuningFrequency +\emph default + +\begin_inset Index idx +status open + +\begin_layout Plain Layout +minibatchSizeTuningFrequency +\end_layout + +\end_inset + +: The number of epochs to skip, on a periodic basis, before +dynamically adjusting the minibatch size. + Default value is 1.
+ +\end_layout + +\begin_layout Itemize + +\emph on +minibatchSizeTuningMax +\emph default + +\begin_inset Index idx +status open + +\begin_layout Plain Layout +minibatchSizeTuningMax +\end_layout + +\end_inset + +: The maximum size allowed for an +adaptively adjusted minibatch size. + Default value is 1048576. + +\end_layout + \end_deeper \begin_layout Subsubsection Gradient control diff --git a/MachineLearning/CNTK/SGD.h b/MachineLearning/CNTK/SGD.h index 7b74cbd00440..e332b8447a44 100644 --- a/MachineLearning/CNTK/SGD.h +++ b/MachineLearning/CNTK/SGD.h @@ -126,7 +126,7 @@ typedef struct stGradientUpdateInfo } GradientUpdateInfo; template -class SGD: ComputationNetworkHelper +class SGD : ComputationNetworkHelper { protected: typedef ComputationNetworkHelper B; @@ -157,17 +157,31 @@ class SGD: ComputationNetworkHelper ElemType learnRateDecreaseFactor = configAALR("learnRateDecreaseFactor", "0.618"); ElemType increaseLearnRateIfImproveMoreThan = configAALR("increaseLearnRateIfImproveMoreThan", "1#INF"); ElemType learnRateIncreaseFactor = configAALR("learnRateIncreaseFactor", "1.382"); + + // AutoAdjust Auto Adjust Minibatch Parameters + bool autoAdjustMinibatch = (bool) configAALR("autoAdjustMinibatch", "false"); + size_t minibatchSizeTuningFrequency = configAALR("minibatchSizeTuningFrequency", "1"); + size_t minibatchSizeTuningMax = configAALR("minibatchSizeTuningMax", "1048576"); + + // the number of minibatches used to search + // the learning rate. It’s typically set to 10-20% of + // the total minibatches in an epoch. ConfigArray minibatch4LRSearch = configAALR("numMiniBatch4LRSearch", "500"); intargvector numMiniBatch4LRSearch = minibatch4LRSearch; + size_t numPrevLearnRates = configAALR("numPrevLearnRates", "5"); size_t numBestSearchEpoch = configAALR("numBestSearchEpoch", "1"); bool loadBestModel = configAALR("loadBestModel", "true"); ConfigArray minibatchSize = configSGD("minibatchSize", "256"); intargvector mbSize = minibatchSize; + + // the number of samples in each epoch (0 means, use all the samples in each epoch). size_t epochSize = configSGD("epochSize", "0"); + // the total number of epochs to run. size_t maxEpochs = configSGD("maxEpochs"); + ConfigArray momentumPerMBStr = configSGD("momentumPerMB", ""); floatargvector momentumPerMB = momentumPerMBStr; @@ -245,7 +259,8 @@ class SGD: ComputationNetworkHelper trainCriterionNodeName, evalCriterionNodeName, doGradientCheck, gradientCheckSigDigit, validateAfterModelReloading, rpi, learnRateAdjustInterval, UsingAllDataForPreComputedNode, - needAveMultiplier, L2RegWeight, L1RegWeight); + needAveMultiplier, L2RegWeight, L1RegWeight, + autoAdjustMinibatch, minibatchSizeTuningFrequency, minibatchSizeTuningMax); } void setMomentum(float momentum) @@ -292,15 +307,27 @@ class SGD: ComputationNetworkHelper const bool UsingAllDataForPreComputed = true, const bool needAveMultiplier = true, const ElemType L2RegWeight = 0, - const ElemType L1RegWeight = 0) + const ElemType L1RegWeight = 0, + const bool autoAdjustMinibatch = false, + const size_t minibatchSizeTuningFrequency = 1, + const size_t minibatchSizeTuningMax = 1048576) { m_numPrevLearnRates = numPrevLearnRates; + m_prevChosenMinibatchSize = 0; + m_autoAdjustMinibatch = autoAdjustMinibatch; + m_minibatchSizeTuningMax = minibatchSizeTuningMax; + m_minibatchSizeTuningFrequency = minibatchSizeTuningFrequency; + m_mbSize = mbSize; + + // the number of samples in each epoch (0 means, use all the samples in each epoch). 
m_epochSize = epochSize; if (m_epochSize == 0) { m_epochSize = requestDataSize; } + + // the total number of epochs to run. m_maxEpochs = maxEpochs; m_gradientClippingWithTruncation = gradientClippingWithTruncation; @@ -351,7 +378,8 @@ class SGD: ComputationNetworkHelper (learningRatesPerSample.size() == 0 && learningRatesPerMB.size() == 0)) { throw std::invalid_argument( - "If autoLearnRateSearchType is false you must specify the learningRatesPerSample or learningRatesPerMB parameter."); + "If autoLearnRateSearchType is false you must specify the " + "learningRatesPerSample or learningRatesPerMB parameter."); } if (learningRatesPerSample.size() > 0 && learningRatesPerMB.size() > 0) @@ -373,6 +401,7 @@ class SGD: ComputationNetworkHelper } m_needToNormalizeLRByParallUtterance = true; } + m_momentumPerMB = 0.9f; if (momentumPerMB.size() > 0) { @@ -638,8 +667,8 @@ class SGD: ComputationNetworkHelper IDataReader* trainSetDataReader, IDataReader* validationSetDataReader) { - std::vector & FeatureNodes = net.FeatureNodes(); - std::vector & labelNodes = net.LabelNodes(); + std::vector& FeatureNodes = net.FeatureNodes(); + std::vector& labelNodes = net.LabelNodes(); std::vector criterionNodes = GetTrainCriterionNodes(net); std::vector evaluationNodes = GetEvalCriterionNodes(net); @@ -698,6 +727,7 @@ class SGD: ComputationNetworkHelper size_t totalSamplesSeen = 0; ElemType learnRatePerSample = 0.5f / m_mbSize[startEpoch]; + ElemType learningRateAdjustmentFactor = 1.0f; vector prevLearnRates; prevLearnRates.resize(m_numPrevLearnRates); for (int i = 0; i < m_numPrevLearnRates; i++) @@ -727,8 +757,12 @@ class SGD: ComputationNetworkHelper bool learnRateInitialized = false; if (startEpoch > 0) { - learnRateInitialized = LoadCheckPointInfo(startEpoch - 1, totalSamplesSeen, - learnRatePerSample, smoothedGradients, prevCriterion); + learnRateInitialized = LoadCheckPointInfo(startEpoch - 1, + /*out*/ totalSamplesSeen, + /*out*/ learnRatePerSample, + smoothedGradients, + /*out*/ prevCriterion, + /*out*/ m_prevChosenMinibatchSize); if (learnRateInitialized) { prevLearnRates[startEpoch % m_numPrevLearnRates] = learnRatePerSample; @@ -741,7 +775,8 @@ class SGD: ComputationNetworkHelper !learnRateInitialized && m_learningRatesPerSample.size() <= startEpoch) { throw std::invalid_argument( - "When using \"AdjustAfterEpoch\", there must either exist a checkpoint file, or an explicit learning rate must be specified in config for the starting epoch."); + "When using \"AdjustAfterEpoch\", there must either exist a checkpoint file, " + "or an explicit learning rate must be specified in config for the starting epoch."); } unsigned long dropOutSeed = 1; @@ -755,17 +790,18 @@ class SGD: ComputationNetworkHelper SetMaxTempMemSizeForCNN(refNet, refNode, m_maxTempMemSizeInSamplesForCNN); } - for (int i = int(startEpoch); i < int(m_maxEpochs); i++) + for (int i = startEpoch; i < (int) m_maxEpochs; i++) { auto t_start_epoch = Timer::MilliSecondElapsed(); - //set dropout rate + // set dropout rate SetDropoutRate(net, criterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed); setMomentum(m_momentumInputPerMB[i]); - //learning rate adjustment - if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || (m_learningRatesPerSample.size() > 0 && m_learningRatesPerSample.size() > i)) + // learning rate adjustment + if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || + (m_learningRatesPerSample.size() > 0 && m_learningRatesPerSample.size() > i)) { learnRatePerSample = 
m_learningRatesPerSample[i]; } @@ -777,14 +813,16 @@ class SGD: ComputationNetworkHelper largestPrevLearnRatePerSample = max(largestPrevLearnRatePerSample, prevLearnRates[j]); } - //return a reasonable learning rate based on the initial mbsize - learnRatePerSample = SearchLearnRateBeforeEpoch(net, refNet, refNode, i, learnRatePerSample, - trainSetDataReader, FeatureNodes, labelNodes, - criterionNodes, evaluationNodes, inputMatrices, - learnableNodes, smoothedGradients, learnRateInitialized, - largestPrevLearnRatePerSample); + // return a reasonable learning rate based on the initial minibatchSize + ElemType newLearningRatePerSample = SearchForBestLearnRate(net, refNet, refNode, i, learnRatePerSample, + trainSetDataReader, FeatureNodes, labelNodes, + criterionNodes, evaluationNodes, inputMatrices, + learnableNodes, smoothedGradients, + learnRateInitialized, largestPrevLearnRatePerSample); + learningRateAdjustmentFactor = newLearningRatePerSample / learnRatePerSample; + learnRatePerSample = newLearningRatePerSample; - //save per sample learn rate to support changeable mbsize + // save per sample learn rate to support changeable minibatchSize prevLearnRates[i % m_numPrevLearnRates] = learnRatePerSample; } @@ -792,8 +830,7 @@ class SGD: ComputationNetworkHelper if (learnRatePerSample < m_minLearnRate) { - fprintf(stderr, - "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training stops.\n", + fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training stops.\n", i + 1, learnRatePerSample, m_minLearnRate); if (m_autoLearnRateSearchType != LearningRateSearchAlgorithm::None) { @@ -806,10 +843,40 @@ class SGD: ComputationNetworkHelper INT32 mySamples = (INT32) #endif fprintf(stderr, "Starting Epoch %d: learning rate per sample = %f momentum = %f \n", - i, learnRatePerSample, m_momentumPerMB); + i + 1, learnRatePerSample, m_momentumPerMB); + + size_t chosenMinibatchSize; + + // Through the command line or config file the user can set minibatch sizes on a per epoch + // basis for a set number of epochs. For epochs after that point, m_mbSize.size(), either + // we just keep using + // the last minibatch size, or we use tuning to try and find a better one. 
+ if (m_autoAdjustMinibatch && i >= m_mbSize.size()) + { + size_t numFramesToUseInSearch = m_numMiniBatch4LRSearch[i] * m_mbSize[i]; + if (m_epochSize != requestDataSize) + { + // ensure the numFramesToUseInSearch does not exceed the total number of frames in the epoch + numFramesToUseInSearch = min(numFramesToUseInSearch, m_epochSize); + } + + // Use tuning to try and find a better minibatch size + chosenMinibatchSize = AdaptiveMinibatchSizing(net, refNet, refNode, i, + numFramesToUseInSearch, + trainSetDataReader, learnRatePerSample, + m_mbSize[i], FeatureNodes, labelNodes, + criterionNodes, evaluationNodes, + inputMatrices, learnableNodes, + smoothedGradients, learningRateAdjustmentFactor); + } + else + { + // use the explicitly set minibatch size + chosenMinibatchSize = m_mbSize[i]; + } TrainOneEpoch(net, refNet, refNode, i, m_epochSize, - trainSetDataReader, learnRatePerSample, FeatureNodes, + trainSetDataReader, learnRatePerSample, chosenMinibatchSize, FeatureNodes, labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, epochCriterion, epochEvalErrors, totalSamplesSeen); @@ -818,23 +885,23 @@ class SGD: ComputationNetworkHelper ElemType epochTime = (t_end_epoch - t_start_epoch) / ElemType(MS_PER_SEC); fprintf(stderr, - "Finished Epoch[%d]: [Training Set] Train Loss Per Sample = %.8g ", + "Finished Epoch[%d]: [Training Set] TrainLossPerSample = %.8g; ", i + 1, epochCriterion); if (epochEvalErrors.size() == 1) { fprintf(stderr, - "EvalErr Per Sample = %.8g Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", + "EvalErrPerSample = %.8g; Ave LearnRatePerSample = %.10g; EpochTime=%.8g\n", epochEvalErrors[0], learnRatePerSample, epochTime); } else { - fprintf(stderr, "EvalErr Per Sample "); + fprintf(stderr, "EvalErrPerSample "); for (size_t j = 0; j < epochEvalErrors.size(); j++) { - fprintf(stderr, "[%lu]=%.8g ", j, epochEvalErrors[j]); + fprintf(stderr, "[%lu]=%.8g; ", j, epochEvalErrors[j]); } - fprintf(stderr, "Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", + fprintf(stderr, "Ave LearnRatePerSample = %.10g; Epoch Time=%.8g\n", learnRatePerSample, epochTime); fprintf(stderr, "Finished Epoch[%d]: Criterion Node [%ls] Per Sample = %.8g\n", i + 1, criterionNodes[0]->NodeName().c_str(), epochCriterion); @@ -898,7 +965,7 @@ class SGD: ComputationNetworkHelper cvSetTrainAndEvalNodes.push_back(evaluationNodes[0]->NodeName()); vector vScore = evalforvalidation.Evaluate(*validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i]); - fprintf(stderr, "Finished Epoch[%d]: [Validation Set] Train Loss Per Sample = %.8g EvalErr Per Sample = %.8g\n", + fprintf(stderr, "Finished Epoch[%d]: [Validation Set] TrainLossPerSample = %.8g; EvalErrPerSample = %.8g\n", i + 1, vScore[0], vScore[1]); epochCriterion = vScore[0]; //the first one is the training criterion. 
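// A minimal stand-alone sketch (not taken from the patch) of the search-budget arithmetic used by the
// minibatch-size selection added above: AdaptiveMinibatchSizing() looks at numMiniBatch4LRSearch * mbSize
// frames, capped at the epoch size. The helper name and parameter names below are illustrative assumptions
// only. With the defaults shown in this patch (numMiniBatch4LRSearch = 500, minibatchSize = 256), the
// search would read 500 * 256 = 128000 frames, unless the epoch contains fewer frames than that.
#include <algorithm>
#include <cstddef>

inline std::size_t ComputeSearchBudgetFrames(std::size_t numMiniBatch4LRSearch,
                                             std::size_t minibatchSize,
                                             std::size_t epochSize)
{
    // frames examined while trying candidate minibatch sizes
    std::size_t numFramesToUseInSearch = numMiniBatch4LRSearch * minibatchSize;

    // never use more frames than the epoch actually contains
    return std::min(numFramesToUseInSearch, epochSize);
}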
@@ -933,9 +1000,12 @@ class SGD: ComputationNetworkHelper net.LoadPersistableParametersFromFile(GetModelNameForEpoch(i - 1), m_validateAfterModelReloading); net.ResetEvalTimeStamp(); - LoadCheckPointInfo(i - 1, totalSamplesSeen, - learnRatePerSample, - smoothedGradients, prevCriterion); + LoadCheckPointInfo(i - 1, + /*out*/ totalSamplesSeen, + /*out*/ learnRatePerSample, + smoothedGradients, + /*out*/ prevCriterion, + /*out*/ m_prevChosenMinibatchSize); fprintf(stderr, "Loaded the previous model which has better training criterion.\n"); loadedPrevModel = true; } @@ -999,7 +1069,7 @@ class SGD: ComputationNetworkHelper if (mpiRank == 0) { net.SaveToFile(GetModelNameForEpoch(i)); - SaveCheckPointInfo(i, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion); + SaveCheckPointInfo(i, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion, chosenMinibatchSize); if (!m_keepCheckPointFiles) { //delete previous checkpiont file to save space @@ -1007,18 +1077,19 @@ class SGD: ComputationNetworkHelper } } - if (learnRatePerSample < 1e-12) { + if (learnRatePerSample < 1e-12) + { fprintf(stderr, "learnRate per sample is reduced to %.8g which is below 1e-12. stop training.\n", learnRatePerSample); } } - //since we linked feature nodes. we need to remove it from the deletion + // since we linked feature nodes. we need to remove it from the deletion if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) { for (size_t i = 0; i < refFeatureNodes.size(); i++) { - //note we need to handle deletion carefully + // note we need to handle deletion carefully refNet.ChangeNode(refFeatureNodes[i]->NodeName(), refFeatureNodes[i]); } } @@ -1026,8 +1097,7 @@ class SGD: ComputationNetworkHelper } protected: - - //return true if precomputation is executed. + // return true if precomputation is executed. 
bool PreCompute(ComputationNetwork& net, IDataReader* trainSetDataReader, std::vector& FeatureNodes, @@ -1053,7 +1123,8 @@ class SGD: ComputationNetworkHelper //trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , requestDataSize); // trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , m_epochSize); // only based on one epoch // [1/12/2015 erw] to support large dataset, we usually paritition whole dataset into several epoches, so we need to use all the data to do precomputing - if (m_useAllDataForPreComputedNode) { + if (m_useAllDataForPreComputedNode) + { // using all the data trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0); } @@ -1078,7 +1149,7 @@ class SGD: ComputationNetworkHelper } } - //mark done + // mark done for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++) { PreComputedNode* node = static_cast*>(*nodeIter); @@ -1088,21 +1159,21 @@ class SGD: ComputationNetworkHelper return true; } - //return a reasonable initial learning rate based on the initial mbsize - ElemType SearchLearnRateBeforeEpoch(ComputationNetwork& net, - ComputationNetwork& refNet, - const ComputationNodePtr refNode, const int epochNumber, - const ElemType curLearnRate, - IDataReader* trainSetDataReader, - const std::vector& FeatureNodes, - const std::vector& labelNodes, - const std::vector& criterionNodes, - const std::vector& evaluationNodes, - std::map*>& inputMatrices, - const std::list& learnableNodes, - std::list>& smoothedGradients, - const bool learnRateInitialized, - const ElemType largestPrevLearnRatePerSample) + // return a reasonable initial learning rate based on the initial mbsize + ElemType SearchForBestLearnRate(ComputationNetwork& net, + ComputationNetwork& refNet, + const ComputationNodePtr refNode, const int epochNumber, + const ElemType curLearnRate, + IDataReader* trainSetDataReader, + const std::vector& FeatureNodes, + const std::vector& labelNodes, + const std::vector& criterionNodes, + const std::vector& evaluationNodes, + std::map*>& inputMatrices, + const std::list& learnableNodes, + std::list>& smoothedGradients, + const bool learnRateInitialized, + const ElemType largestPrevLearnRatePerSample) { ElemType epochCriterion = std::numeric_limits::infinity(); ElemType prevCriterion = std::numeric_limits::infinity(); @@ -1111,10 +1182,11 @@ class SGD: ComputationNetworkHelper size_t totalSamplesSeen = 0; ElemType bestLearnRatePerSample = curLearnRate; - size_t epochSize = m_numMiniBatch4LRSearch[epochNumber] * m_mbSize[epochNumber]; + size_t numFramesToUseInSearch = m_numMiniBatch4LRSearch[epochNumber] * m_mbSize[epochNumber]; if (m_epochSize != requestDataSize) { - epochSize = min(epochSize, m_epochSize); //use a small number minibatches to make decision + // ensure the numFramesToUseInSearch does not exceed the total number of frames in the epoch + numFramesToUseInSearch = min(numFramesToUseInSearch, m_epochSize); } ElemType baseCriterion; @@ -1133,17 +1205,23 @@ class SGD: ComputationNetworkHelper net.ResetEvalTimeStamp(); ElemType learnRate = learnRatePerSample; - LoadCheckPointInfo(baseModelEpoch, totalSamplesSeen, learnRate, - smoothedGradients, prevCriterion); - - //if model is not changed this is what we will get + size_t dummyMinibatchSize = 0; + LoadCheckPointInfo(baseModelEpoch, + /*out*/ totalSamplesSeen, + /*out*/ learnRate, + smoothedGradients, + /*out*/ prevCriterion, + /*out*/ dummyMinibatchSize); + + // if model is not changed this is what we will get TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, - epochSize, 
trainSetDataReader, 0, + numFramesToUseInSearch, trainSetDataReader, 0, m_mbSize[epochNumber], FeatureNodes, labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, - smoothedGradients, baseCriterion, - epochEvalErrors, totalSamplesSeen); + smoothedGradients, /*out*/ baseCriterion, + /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen, + "BaseAdaptiveLearnRateSearch:"); if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch) { @@ -1156,7 +1234,7 @@ class SGD: ComputationNetworkHelper if (m_epochSize != requestDataSize) { - ratio = pow(((ElemType) epochSize) / m_epochSize, 1.0f / 2); + ratio = pow(((ElemType) numFramesToUseInSearch) / m_epochSize, 1.0f / 2); } baseCriterion = max(ratio * prevCriterion + (1 - ratio) * baseCriterion, baseCriterion); @@ -1166,13 +1244,13 @@ class SGD: ComputationNetworkHelper { learnRatePerSample *= 0.618f; TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, - epochSize, trainSetDataReader, - learnRatePerSample, FeatureNodes, + numFramesToUseInSearch, trainSetDataReader, + learnRatePerSample, m_mbSize[epochNumber], FeatureNodes, labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, - epochCriterion, epochEvalErrors, - totalSamplesSeen); + /*out*/ epochCriterion, /*out*/ epochEvalErrors, + /*out*/ totalSamplesSeen, "AdaptiveLearnRateSearch:"); } while (std::isnan(epochCriterion) || (epochCriterion > baseCriterion && learnRatePerSample > minLearnRate)); @@ -1187,13 +1265,14 @@ class SGD: ComputationNetworkHelper ElemType leftCriterion, rightCriterion = epochCriterion; TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, - epochSize, trainSetDataReader, - leftLearnRatePerSample, + numFramesToUseInSearch, trainSetDataReader, + leftLearnRatePerSample, m_mbSize[epochNumber], FeatureNodes, labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, - smoothedGradients, leftCriterion, - epochEvalErrors, totalSamplesSeen); + smoothedGradients, /*out*/ leftCriterion, + /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen, + "DetailBaseAdaptiveLearnRateSearch:"); while (rightLearnRatePerSample > leftLearnRatePerSample * 1.2f) { @@ -1202,36 +1281,38 @@ class SGD: ComputationNetworkHelper rightLearnRatePerSample *= 0.618f; TrainOneMiniEpochAndReloadModel(net, refNet, refNode, - epochNumber, epochSize, + epochNumber, numFramesToUseInSearch, trainSetDataReader, - rightLearnRatePerSample, + rightLearnRatePerSample, m_mbSize[epochNumber], FeatureNodes, labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, - rightCriterion, - epochEvalErrors, - totalSamplesSeen); + /*out*/ rightCriterion, + /*out*/ epochEvalErrors, + /*out*/ totalSamplesSeen, + "DetailRightAdaptiveLearnRateSearch:"); } else { leftLearnRatePerSample /= 0.618f; TrainOneMiniEpochAndReloadModel(net, refNet, refNode, - epochNumber, epochSize, + epochNumber, numFramesToUseInSearch, trainSetDataReader, - leftLearnRatePerSample, + leftLearnRatePerSample, m_mbSize[epochNumber], FeatureNodes, labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, - leftCriterion, - epochEvalErrors, - totalSamplesSeen); + /*out*/ leftCriterion, + /*out*/ epochEvalErrors, + /*out*/ totalSamplesSeen, + "DetailLeftAdaptiveLearnRateSearch:"); } } @@ -1250,6 +1331,7 @@ class SGD: ComputationNetworkHelper const ComputationNodePtr refNode, const int epochNumber, const size_t epochSize, IDataReader* trainSetDataReader, const ElemType 
learnRatePerSample, + const size_t minibatchSize, const std::vector& FeatureNodes, const std::vector& labelNodes, const std::vector& criterionNodes, @@ -1257,41 +1339,244 @@ class SGD: ComputationNetworkHelper std::map*>& inputMatrices, const std::list& learnableNodes, std::list>& smoothedGradients, - ElemType& epochCriterion, std::vector& epochEvalErrors, - size_t& totalSamplesSeen) + /*out*/ ElemType& epochCriterion, + /*out*/ std::vector& epochEvalErrors, + /*out*/ size_t& totalSamplesSeen, + std::string prefixMsg = "") { TrainOneEpoch(net, refNet, refNode, epochNumber, epochSize, - trainSetDataReader, learnRatePerSample, FeatureNodes, + trainSetDataReader, learnRatePerSample, minibatchSize, FeatureNodes, labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, - epochCriterion, epochEvalErrors, totalSamplesSeen); - fprintf(stderr, "Finished Mini-Epoch For LearnRate Selection: Train Loss Per Sample = %.8g ", + /*out*/ epochCriterion, /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen, + prefixMsg); + + fprintf(stderr, "Finished Mini-Epoch For LearnRate Selection: TrainLossPerSample = %.8g;", epochCriterion); if (epochEvalErrors.size() == 1) { - fprintf(stderr, "EvalErr Per Sample = %.8g Ave Learn Rate Per Sample = %.10g\n", + fprintf(stderr, "EvalErrPerSample = %.8g; Ave LearnRatePerSample = %.10g\n", epochEvalErrors[0], learnRatePerSample); } else { - fprintf(stderr, "EvalErr Per Sample "); + fprintf(stderr, "EvalErrPerSample "); for (size_t i = 0; i < epochEvalErrors.size(); i++) { - fprintf(stderr, "[%lu] = %.8g ", i, epochEvalErrors[i]); + fprintf(stderr, "[%lu] = %.8g; ", i, epochEvalErrors[i]); } - fprintf(stderr, "Ave Learn Rate Per Sample = %.10g\n", learnRatePerSample); + fprintf(stderr, "Ave LearnRatePerSample = %.10g\n", learnRatePerSample); } int baseModelEpoch = epochNumber - 1; net.LoadPersistableParametersFromFile(GetModelNameForEpoch(baseModelEpoch), m_validateAfterModelReloading); net.ResetEvalTimeStamp(); - ElemType learnRate; - ElemType prevCriterion; - LoadCheckPointInfo(baseModelEpoch, totalSamplesSeen, learnRate, - smoothedGradients, prevCriterion); + ElemType dummyLearnRate; + ElemType dummtPrevCriterion; + size_t dummyMinibatchSize = 0; + LoadCheckPointInfo(baseModelEpoch, + /*out*/ totalSamplesSeen, + /*out*/ dummyLearnRate, + smoothedGradients, + /*out*/ dummtPrevCriterion, + /*out*/ dummyMinibatchSize); + } + + size_t AdaptiveMinibatchSizing(ComputationNetwork& net, + ComputationNetwork& refNet, + const ComputationNodePtr refNode, + const int epochNumber, + const size_t numFramesToUseInSearch, + IDataReader* trainSetDataReader, + const ElemType learnRatePerSample, + const size_t initialMinibatchSize, + const std::vector& FeatureNodes, + const std::vector& labelNodes, + const std::vector& criterionNodes, + const std::vector& evaluationNodes, + std::map*>& inputMatrices, + const std::list& learnableNodes, + std::list>& smoothedGradients, + const ElemType learningRateAdjustmentFactor) + { + size_t minMinibatchSize = initialMinibatchSize; + size_t chosenMinibatchSize = initialMinibatchSize; + + // do some pre-adjustment based on LR + // Basically we assume that the LR for epoch 1 is safe for mbsize. + // If LR control led to a smaller LR, then we can safely increase the lower bound of the MB size. 
+ ElemType learningRateChangeSoFar = m_learningRatesPerSample[epochNumber] / m_learningRatesPerSample[0]; + learningRateChangeSoFar *= learningRateAdjustmentFactor; + + // increasing by the full factor is found to be too aggressive; sqrt() seems more robust + learningRateChangeSoFar = sqrt(learningRateChangeSoFar); + + // LR was indeed reduced + if (learningRateChangeSoFar < 1.0f) + { + // we can safely increase MB size (note: this may be bigger than our max) + minMinibatchSize = (size_t) (minMinibatchSize / learningRateChangeSoFar); + } + + if (epochNumber < 2 && m_prevChosenMinibatchSize != 0) + { + // newly started training: any previous MB size stored in the model is to be ignored + fprintf (stderr, "before epoch .2, previous minibatchSize %d is " + "considered invalid -> resetting\n", m_prevChosenMinibatchSize); + m_prevChosenMinibatchSize = 0; + } + + // check if we need to skip + if (m_prevChosenMinibatchSize != 0 && + (epochNumber + 1) > m_minibatchSizeTuningFrequency && + (epochNumber + 1) % m_minibatchSizeTuningFrequency != 0) + { + fprintf(stderr, "AdaptiveMinibatchSearch: Search for a better minibatchSize " + "in epoch %d skipped, keeping minibatchSize of %d\n", + epochNumber + 1, m_prevChosenMinibatchSize); + chosenMinibatchSize = m_prevChosenMinibatchSize; + } + else + { + if (m_prevChosenMinibatchSize != 0) + { + // but we don't go lower than 0.5 * the chosen previous one + fprintf(stderr, "AdaptiveMinibatchSearch: Limiting minMinibatchSize to " + "previous minibatchSize = (%d / 2)\n", m_prevChosenMinibatchSize); + minMinibatchSize = max(minMinibatchSize, m_prevChosenMinibatchSize / 2); + } + + size_t maxMinibatchSize = m_minibatchSizeTuningMax; + + // only grow at most 2 x compared to previous step + if (m_prevChosenMinibatchSize != 0.0f) + { + if (m_prevChosenMinibatchSize < chosenMinibatchSize) + { + m_prevChosenMinibatchSize = chosenMinibatchSize; + } + + fprintf(stderr, "AdaptiveMinibatchSearch: Limiting maxMinibatchSize to " + "previous minibatchSize %d*2\n", m_prevChosenMinibatchSize); + maxMinibatchSize = min(maxMinibatchSize, m_prevChosenMinibatchSize * 2); + } + + chosenMinibatchSize = SearchForBestMinibatchSize(net, refNet, refNode, epochNumber, + numFramesToUseInSearch, trainSetDataReader, + learnRatePerSample, FeatureNodes, + labelNodes, criterionNodes, + evaluationNodes, inputMatrices, + learnableNodes, smoothedGradients, + minMinibatchSize, maxMinibatchSize); + } + + return chosenMinibatchSize; + } + + size_t RoundToMultipleOf64(float val) + { + return 64 * (size_t) ((val + 32) / 64); + } + + size_t RoundToMultipleOf64(size_t val) + { + return 64 * ((val + 32) / 64); + } + + // uses a small percentage of training data of minibatch to + // speculatively train with various MB sizes; then picks the best + size_t SearchForBestMinibatchSize(ComputationNetwork& net, + ComputationNetwork& refNet, + const ComputationNodePtr refNode, + const int epochNumber, + const size_t numFramesToUseInSearch, + IDataReader* trainSetDataReader, + const ElemType learnRatePerSample, + const std::vector& FeatureNodes, + const std::vector& labelNodes, + const std::vector& criterionNodes, + const std::vector& evaluationNodes, + std::map*>& inputMatrices, + const std::list& learnableNodes, + std::list>& smoothedGradients, + const size_t minMinibatchSize, const size_t maxMinibatchSize) + { + // may happen for automatically reduced learning rates + if (minMinibatchSize > maxMinibatchSize) + { + return maxMinibatchSize; + } + + size_t trialMinibatchSize = 0; + bool isFirstIteration = true; + 
ElemType baseCriterion = 0; + + // increase the minibatch size by a factor of sqrt(2) in each step. + const float minibatchSizeTuningFactor = sqrtf(2.0f); + + size_t lastTriedtrialMinibatchSize = 0; + for (float trialMinibatchSizeFloat = (float) minMinibatchSize; + trialMinibatchSizeFloat <= maxMinibatchSize; + trialMinibatchSizeFloat *= minibatchSizeTuningFactor) + { + // round mbsize to something meaningful + trialMinibatchSize = RoundToMultipleOf64(trialMinibatchSizeFloat); + + fprintf(stderr, "\nAdaptiveMinibatchSearch: Evaluating trial minibatchSize=%d out of range %d..%d ...\n\n", + trialMinibatchSize, RoundToMultipleOf64(minMinibatchSize), RoundToMultipleOf64(maxMinibatchSize)); + + size_t totalSamplesSeen; + std::vector epochEvalErrors(evaluationNodes.size(), std::numeric_limits::infinity()); + ElemType epochCriterion = std::numeric_limits::infinity(); + + // Train on a few minibatches and so we can observe the epochCriterion as we try increasing + // minibatches with iteration of this loop. + TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, + numFramesToUseInSearch, trainSetDataReader, + learnRatePerSample, trialMinibatchSize, FeatureNodes, + labelNodes, criterionNodes, + evaluationNodes, inputMatrices, + learnableNodes, smoothedGradients, + /*out*/ epochCriterion, /*out*/ epochEvalErrors, + /*out*/ totalSamplesSeen, + isFirstIteration ? "BaseAdaptiveMinibatchSearch:" : + "AdaptiveMinibatchSearch:"); + + if (isFirstIteration) + { + // for the first iteration of the loop only, set baseCriterion + // to the result we got from TrainOneMiniEpochAndReloadModel(). + baseCriterion = epochCriterion; + lastTriedtrialMinibatchSize = trialMinibatchSize; + isFirstIteration = false; + + fprintf(stderr, "AdaptiveMinibatchSearch: Computed BaseCriterion %.10g\n", baseCriterion); + } + else if (!std::isnan(epochCriterion) && (epochCriterion > baseCriterion)) + { + fprintf(stderr, "AdaptiveMinibatchSearch: Search successful!!! Choose new minibatchSize of %d. " + "EpochCriterion = %.10g vs BaseCriterion = %.10g\n\n", + lastTriedtrialMinibatchSize, epochCriterion, baseCriterion); + + // As soon as we see the Criterion (a measure of error) start to get larger than the + // Criterion we started with, we stop. + // TODO: if this is too sensitive, we can add a margin on the bases of percentage of + // baseCriterion. + break; + } + else + { + lastTriedtrialMinibatchSize = trialMinibatchSize; + fprintf(stderr, "AdaptiveMinibatchSearch: Keep searching... 
" + "EpochCriterion = %.10g vs BaseCriterion = %.10g\n", + epochCriterion, baseCriterion); + } + } + + return lastTriedtrialMinibatchSize; } // Tries to compute derivatives for the whole utterances, which will be @@ -1325,9 +1610,12 @@ class SGD: ComputationNetworkHelper size_t TrainOneEpoch(ComputationNetwork& net, ComputationNetwork& refNet, - const ComputationNodePtr refNode, const int epochNumber, - const size_t epochSize, IDataReader* trainSetDataReader, + const ComputationNodePtr refNode, + const int epochNumber, + const size_t epochSize, + IDataReader* trainSetDataReader, const ElemType learnRatePerSample, + size_t tunedMBSize, const std::vector& FeatureNodes, const std::vector& labelNodes, const std::vector& criterionNodes, @@ -1335,8 +1623,10 @@ class SGD: ComputationNetworkHelper std::map*>& inputMatrices, const std::list& learnableNodes, std::list>& smoothedGradients, - ElemType& epochCriterion, std::vector& epochEvalErrors, - size_t& totalSamplesSeen) + /*out*/ ElemType& epochCriterion, + /*out*/ std::vector& epochEvalErrors, + /*out*/ size_t& totalSamplesSeen, + std::string prefixMsg = "") { ElemType readTimeInMBs = 0; ElemType ComputeTimeInMBs = 0; @@ -1348,14 +1638,15 @@ class SGD: ComputationNetworkHelper unsigned long long startReadMBTime = 0, startComputeMBTime = 0; unsigned long long endReadMBTime = 0, endComputeMBTime = 0; - //initialize statistics + // initialize statistics size_t totalEpochSamples = 0; int numMBsRun = 0; size_t numEvalNodes = epochEvalErrors.size(); - //assume only one training criterion node for each epoch + // assume only one training criterion node for each epoch + Matrix localEpochCriterion(1, 1, net.GetDeviceID()); Matrix localEpochEvalErrors(1, numEvalNodes, net.GetDeviceID()); @@ -1366,7 +1657,7 @@ class SGD: ComputationNetworkHelper // resetting this, so profiling is performed for one epoch only m_numMBsToCUDAProfile = 0; - trainSetDataReader->StartMinibatchLoop(m_mbSize[epochNumber], epochNumber, m_epochSize); + trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, m_epochSize); AttemptUtteranceDerivativeFeatures(net, trainSetDataReader, FeatureNodes, inputMatrices); startReadMBTime = Timer::MilliSecondElapsed(); @@ -1397,7 +1688,7 @@ class SGD: ComputationNetworkHelper throw std::logic_error("cannot pass gradient checker"); } #endif - //TODO: currently only support one node regularization + // TODO: currently only support one node regularization if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) { refNet.SetActualMiniBatchSize(actualMBSize); @@ -1409,15 +1700,15 @@ class SGD: ComputationNetworkHelper labelNodes[0]->FunctionValues()); } - //only compute gradient when learning rate is large enough + // only compute gradient when learning rate is large enough if (learnRatePerSample > m_minLearnRate * 0.01) { - //use only the first criterion. Is there any possibility to use more? + // use only the first criterion. Is there any possibility to use more? net.ComputeGradient(criterionNodes[0]); } else { - //use only the first criterion. Is there any possibility to use more? + // use only the first criterion. Is there any possibility to use more? 
net.Evaluate(criterionNodes[0]); } @@ -1471,18 +1762,18 @@ class SGD: ComputationNetworkHelper epochEvalErrors[i] = (const ElemType) localEpochEvalErrors(0, i); } - fprintf(stderr, "Epoch[%d]-Minibatch[%d-%d]: Samples Seen = %d Train Loss Per Sample = %.8g ", - epochNumber + 1, numMBsRun - m_numMBsToShowResult + 1, - numMBsRun, numSamplesLastMBs, + fprintf(stderr, "%s Epoch[%d of %d]-Minibatch[%d-%d of %d]: SamplesSeen = %d; TrainLossPerSample = %.8g; ", + prefixMsg.c_str(), epochNumber + 1, m_maxEpochs, numMBsRun - m_numMBsToShowResult + 1, + numMBsRun, epochSize / tunedMBSize, numSamplesLastMBs, (epochCriterion - epochCriterionLastMBs) / numSamplesLastMBs); for (size_t i = 0; i < numEvalNodes; i++) { - fprintf(stderr, "EvalErr[%lu] Per Sample = %.8g ", + fprintf(stderr, "EvalErr[%lu]PerSample = %.8g; ", i, (epochEvalErrors[i] - epochEvalErrorsLastMBs[i]) / numSamplesLastMBs); } - fprintf(stderr, "ReadData Time = %.8g Computing Time=%.8g Total Time Per Sample=%.8g\n", + fprintf(stderr, "ReadDataTime = %.8g; ComputeTime=%.8g; TotalTimePerSample=%.8g\n", readTimeInMBs, ComputeTimeInMBs, (readTimeInMBs + ComputeTimeInMBs) / numSamplesLastMBs); @@ -1506,8 +1797,8 @@ class SGD: ComputationNetworkHelper break; } - /// call DataEnd function - /// DataEnd does reader specific process if sentence ending is reached + // call DataEnd function + // DataEnd does reader specific process if sentence ending is reached trainSetDataReader->DataEnd(endDataSentence); profiler.NextSample(); @@ -1565,7 +1856,7 @@ class SGD: ComputationNetworkHelper // L2 regularizer if (L2RegWeight > 0) { - //*actualMBSize so that it's invariant to minibatch size since learning rate is per sample + // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample Matrix::ScaleAndAdd(L2RegWeight * actualMBSize, functionValues, gradientValues); } @@ -1573,7 +1864,7 @@ class SGD: ComputationNetworkHelper { ElemType momentum = sgd->MomentumPerMB(); - //we use simple linear (instead of log linear) scaling here + // we use simple linear (instead of log linear) scaling here if (actualMBSize < expectedMBSize && momentum > 0.0000001f) { momentum = (ElemType) exp(log(momentum) / expectedMBSize * actualMBSize); @@ -1603,8 +1894,9 @@ class SGD: ComputationNetworkHelper } // L1 regularizer with proximal gradient descent method - if (L1RegWeight > 0) { - //*actualMBSize so that it's invariant to minibatch size since learning rate is per sample + if (L1RegWeight > 0) + { + // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample functionValues.InplaceSoftThreshold(learnRatePerSample * L1RegWeight * actualMBSize); } @@ -1643,7 +1935,7 @@ class SGD: ComputationNetworkHelper } else { - //norm2 normalized + // norm2 normalized ElemType gradientNorm = gradient.FrobeniusNorm(); if (gradientNorm > maxGradientPerMB) { @@ -1657,7 +1949,8 @@ class SGD: ComputationNetworkHelper void SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen, const ElemType learnRatePerSample, const std::list>& smoothedGradients, - const ElemType prevCriterion) + const ElemType prevCriterion, + const size_t minibatchSize) { wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epoch)); @@ -1669,11 +1962,15 @@ class SGD: ComputationNetworkHelper fstream << totalSamplesSeen << learnRatePerSample << prevCriterion; fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ELearnRate"); + fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize"); + fstream << 
minibatchSize; + fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EMinibatchSize"); + fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BGradient"); for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++) { - const Matrix& smoothedGradient = (*smoothedGradientIter); + const Matrix& smoothedGradient = *smoothedGradientIter; fstream << smoothedGradient; } @@ -1682,12 +1979,14 @@ class SGD: ComputationNetworkHelper fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ECKP"); } - bool LoadCheckPointInfo(const size_t epoch, size_t& totalSamplesSeen, - ElemType& learnRatePerSample, + bool LoadCheckPointInfo(const size_t epochNumber, + /*out*/ size_t& totalSamplesSeen, + /*out*/ ElemType& learnRatePerSample, std::list>& smoothedGradients, - ElemType& prevCriterion) + /*out*/ ElemType& prevCriterion, + /*out*/ size_t& minibatchSize) { - wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epoch)); + wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epochNumber)); if (!fexists(checkPointFileName.c_str())) { fprintf(stderr, @@ -1703,11 +2002,21 @@ class SGD: ComputationNetworkHelper fstream >> totalSamplesSeen >> learnRatePerSample >> prevCriterion; fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ELearnRate"); + if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize")) + { + fstream >> minibatchSize; + fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EMinibatchSize"); + } + else + { + minibatchSize = m_mbSize[epochNumber]; + } + fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BGradient"); for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++) { - Matrix& smoothedGradient = (*smoothedGradientIter); + Matrix& smoothedGradient = *smoothedGradientIter; fstream >> smoothedGradient; } fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EGradient"); @@ -1737,12 +2046,12 @@ class SGD: ComputationNetworkHelper } - //return -1 if nothing exists + // return -1 if nothing exists int DetermineStartEpoch(const bool makeMode) { if (!makeMode) { - //always start from scratch + // always start from scratch return -1; } @@ -1880,7 +2189,7 @@ class SGD: ComputationNetworkHelper node->UpdateEvalTimeStamp(); - //use only the first criterion. Is + // use only the first criterion. Is net.ComputeGradient(criterionNodes[npos]); if (node->GradientValues().GetMatrixType() == MatrixType::SPARSE) @@ -1924,7 +2233,7 @@ class SGD: ComputationNetworkHelper node->UpdateEvalTimeStamp(); net.Evaluate(criterionNodes[npos]); - //criterionNode should be a scalar + // criterionNode should be a scalar ElemType mbEvalCriNeg = criterionNodes[npos]->FunctionValues().Get00Element(); // back to its orginal parameter value @@ -1971,8 +2280,13 @@ class SGD: ComputationNetworkHelper bool m_needToNormalizeLRByParallUtterance; intargvector m_mbSize; + + // the number of samples in each epoch (0 means, use all the samples in each epoch). size_t m_epochSize; + + // the total number of epochs to run. size_t m_maxEpochs; + floatargvector m_momentumInputPerMB; ElemType m_momentumPerMB; bool m_gradientClippingWithTruncation; @@ -1995,12 +2309,16 @@ class SGD: ComputationNetworkHelper ElemType m_reduceLearnRateIfImproveLessThan; bool m_continueReduce; - //determine after how many epochs the learning rate should be auto adjusted. + // determine after how many epochs the learning rate should be auto adjusted. 
size_t m_learnRateAdjustInterval; ElemType m_increaseLearnRateIfImproveMoreThan; ElemType m_learnRateIncreaseFactor; ElemType m_learnRateDecreaseFactor; + size_t m_prevChosenMinibatchSize; + bool m_autoAdjustMinibatch; + size_t m_minibatchSizeTuningFrequency; + size_t m_minibatchSizeTuningMax; floatargvector m_dropoutRates; size_t m_maxTempMemSizeInSamplesForCNN; diff --git a/MachineLearning/CNTK/TrainingCriterionNodes.h b/MachineLearning/CNTK/TrainingCriterionNodes.h index 65cb9ccb5f81..8ccdf51e08cb 100644 --- a/MachineLearning/CNTK/TrainingCriterionNodes.h +++ b/MachineLearning/CNTK/TrainingCriterionNodes.h @@ -978,18 +978,27 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void EvaluateThisNode() //-sum(left_i * log(softmax_i(right))) { - if (m_evalMode == NCEEvalMode::Softmax || Inputs(0)->FunctionValues().GetNumRows() == 1) + int positive = 0, negative = 0; + if (Inputs(0)->FunctionValues().GetNumRows() == 1) + { + for (int i = 0; i < Inputs(0)->FunctionValues().GetNumCols(); i++) + { + if (Inputs(0)->FunctionValues()(0, i) > 0) + positive++; + else if (Inputs(0)->FunctionValues()(0, i) < 0) + negative++; + } + assert(positive * negative == 0); + } + if (m_evalMode == NCEEvalMode::Softmax || (Inputs(0)->FunctionValues().GetNumRows() == 1 && positive > 0)) { // evaluation uses softmax m_logSoftmax.AssignProductOf(Inputs(1)->FunctionValues(), true, Inputs(2)->FunctionValues(), false); m_logSoftmax += Inputs(3)->FunctionValues(); m_logSoftmax.InplaceLogSoftmax(false); - FunctionValues().Resize(1, 1); - FunctionValues().SetValue(0); - for (int i = 0; i < Inputs(0)->FunctionValues().GetNumCols(); i++) - FunctionValues()(0, 0) -= m_logSoftmax(i, (size_t)Inputs(0)->FunctionValues()(0, i)); + FunctionValues().AssignSoftmaxSum(Inputs(0)->FunctionValues(), m_logSoftmax); } - else if (m_evalMode == NCEEvalMode::Unnormalized) + else if (m_evalMode == NCEEvalMode::Unnormalized || (Inputs(0)->FunctionValues().GetNumRows() == 1 && negative > 0)) { FunctionValues().AssignNceUnnormalizedEval(Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues(), Inputs(3)->FunctionValues()); } diff --git a/Math/Math/CPUMatrix.cpp b/Math/Math/CPUMatrix.cpp index 0f6ac7a76b20..3f055ee8e9cf 100644 --- a/Math/Math/CPUMatrix.cpp +++ b/Math/Math/CPUMatrix.cpp @@ -3881,6 +3881,19 @@ namespace Microsoft { namespace MSR { namespace CNTK { { return CPUMatrix::MultiplyAndWeightedAdd(1.0, a, transposeA, b, transposeB, 1.0, c); } + template + void CPUMatrix::AssignSoftmaxSum(const CPUMatrix& softmax, CPUMatrix& c) + { + ElemType log_likelihood = 0.0; + size_t batch_size = this->GetNumCols(); +#pragma omp parallel for reduction(+:log_likelihood) + for (int instance_id = 0; instance_id < batch_size; instance_id++) + { + int sample = (int)(*this)(0, instance_id); + log_likelihood += softmax(instance_id, sample); + } + c(0, 0) = -log_likelihood; + } template void CPUMatrix::AssignNCEUnnormalizedEval(const CPUMatrix& a, diff --git a/Math/Math/CPUMatrix.h b/Math/Math/CPUMatrix.h index 48cf20cb0a30..5e5bbad27931 100644 --- a/Math/Math/CPUMatrix.h +++ b/Math/Math/CPUMatrix.h @@ -217,6 +217,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { void AssignNoiseContrastiveEstimation(const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& bias, CPUMatrix& tmp, CPUMatrix& c); + + void AssignSoftmaxSum(const CPUMatrix& a, CPUMatrix& softmax); void AssignNCEUnnormalizedEval(const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& bias, CPUMatrix& c); diff --git a/Math/Math/GPUMatrix.cu 
b/Math/Math/GPUMatrix.cu index 5baaba711d68..10a995153256 100644 --- a/Math/Math/GPUMatrix.cu +++ b/Math/Math/GPUMatrix.cu @@ -1957,7 +1957,27 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); if (do_sync) CUDA_CALL(cudaEventDestroy(done)); } - + template + void GPUMatrix::AssignSoftmaxSum(const GPUMatrix& a, GPUMatrix& c) + { + UNCONST(ElemType, a, my_a); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + int p = 512; + int width = a.GetNumRows(); + while (p / 2 > width) p = p / 2; + + _assignSoftmaxSum << <1, p >> >( + my_a.GetArray(), + width, + GetArray(), + c.GetArray() + ); + + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + } template void GPUMatrix::AssignNCEUnnormalizedEval(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) { diff --git a/Math/Math/GPUMatrix.h b/Math/Math/GPUMatrix.h index f307d9c1cf3a..bd5f9976d423 100644 --- a/Math/Math/GPUMatrix.h +++ b/Math/Math/GPUMatrix.h @@ -295,7 +295,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t sampleCount, GPUMatrix& tmp, GPUMatrix& c); void AssignNCEDerivative(GPUMatrix& tmp, const GPUMatrix& a, const GPUMatrix& b, size_t inputIndex, GPUMatrix& c); void AssignNCEUnnormalizedEval(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c); - + void AssignSoftmaxSum(const GPUMatrix& a, GPUMatrix& softmax); void Print(const char* matrixName, size_t rowStart, size_t rowEnd, size_t colStart, size_t colEnd) const; void Print(const char* matrixName = NULL) const; //print whole matrix. can be expensive diff --git a/Math/Math/GPUMatrixCUDAKernels.cu b/Math/Math/GPUMatrixCUDAKernels.cu index ec1a2e36ab11..b4c5eced6b30 100644 --- a/Math/Math/GPUMatrixCUDAKernels.cu +++ b/Math/Math/GPUMatrixCUDAKernels.cu @@ -2932,6 +2932,59 @@ __global__ void _computeNceOutput( } } + +template +__global__ void _assignSoftmaxSum( + const ElemType* softmax, + int sampleCount, + const ElemType* a, + ElemType* c) // run on 512 threads per block +{ + // softmax is the log-softmax matrix in column-major format, with one row per instance + // and one column per class/word + // sampleCount is the number of instances (the number of columns in the label matrix) + // a is an array of label indices, one entry per instance + // c is the 1x1 matrix that receives the objective, i.e. the negated sum of the + // log-softmax entries selected by the labels + + __shared__ ElemType partials[512]; + partials[threadIdx.x] = 0; + + int total = sampleCount; + int loadPerThread = (total + blockDim.x - 1) / blockDim.x; + + // find out the items this thread is responsible for + int start = loadPerThread * threadIdx.x; + int end = min(total, loadPerThread * (threadIdx.x + 1)); + for (int i = start; i < end; i++) + { + int wid = (int)a[i]; + partials[threadIdx.x] += softmax[IDX2C(i, wid, sampleCount)]; + } + + __syncthreads(); + + // now sum up the objective function + int nTotalThreads = blockDim.x; + + while (nTotalThreads > 1) + { + int halfPoint = (nTotalThreads >> 1); + + if (threadIdx.x < halfPoint) + partials[threadIdx.x] += partials[threadIdx.x + halfPoint]; + + __syncthreads(); + + nTotalThreads = (nTotalThreads >> 1); + } + + if (threadIdx.x == 0) + c[0] = -partials[0]; +} + template __global__ void _assignNoiseContrastiveEstimation( const ElemType* val, diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp index 3c98d74aaa78..73365ccf6099 100644 --- a/Math/Math/Matrix.cpp +++ b/Math/Math/Matrix.cpp @@ -747,9 +747,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { #define NUM_MATRIXTYPE_CHANGED_WARN 20 m_numTimesMatrixTypeChanged++; + if (m_numTimesMatrixTypeChanged == NUM_MATRIXTYPE_CHANGED_WARN) - fprintf(stderr, "WARNING: The same matrix with dim [%d, %d] has been transferred between different devices for %d times.\n", GetNumRows(), GetNumCols(), NUM_MATRIXTYPE_CHANGED_WARN); - + { + fprintf(stderr, "WARNING: The same matrix with dim [%lu, %lu] has been transferred between different devices %d times.\n", (unsigned long)GetNumRows(), (unsigned long)GetNumCols(), NUM_MATRIXTYPE_CHANGED_WARN); + } if (GetDeviceId()<0) //CPU { if (newMatrixType==MatrixType::SPARSE) @@ -1241,14 +1243,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (numRows != GetNumRows() || numCols != GetNumCols()) { - DISPATCH_MATRIX_ON_FLAG(this, - this, - m_CPUMatrix->Reshape(numRows, numCols), - m_GPUMatrix->Reshape(numRows, numCols), - NOT_IMPLEMENTED, - NOT_IMPLEMENTED - ); - } + DISPATCH_MATRIX_ON_FLAG(this, + this, + m_CPUMatrix->Reshape(numRows, numCols), + m_GPUMatrix->Reshape(numRows, numCols), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED + ); + } } template @@ -3667,6 +3669,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { return *this; } + + template + Matrix& Matrix::AssignSoftmaxSum(const Matrix& a, const Matrix& softmax) + { + this->Resize(1, 1); + if (this->GetDeviceId() < 0) + a.m_CPUMatrix->AssignSoftmaxSum(*softmax.m_CPUMatrix, *this->m_CPUMatrix); + else + a.m_GPUMatrix->AssignSoftmaxSum(*softmax.m_GPUMatrix, *this->m_GPUMatrix); + return *this; + } + template Matrix& Matrix::AssignNceUnnormalizedEval(const Matrix& a, const Matrix& b, const Matrix& c, const Matrix& bias) { @@ -4454,7 +4468,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } - // Matrix& Matrix::Shift(const Matrix& a, size_t shift) + //Matrix& Matrix::Shift(const Matrix& a, size_t shift) //[this]= (a right shift by n), padded with zeros // shift left, shift needs to be negative value // shift right, shift needs to
be positive value diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h index 8a6663e56fc8..1f6cc2ed65dc 100644 --- a/Math/Math/Matrix.h +++ b/Math/Math/Matrix.h @@ -150,7 +150,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix& AssignNoiseContrastiveEstimation(const Matrix& a, const Matrix& b, const Matrix& c, const Matrix& bias, Matrix& tmp); Matrix& AssignNCEDerivative(const Matrix& tmp, const Matrix& a, const Matrix& b, const Matrix& c, size_t inputIndex); - + Matrix& AssignSoftmaxSum(const Matrix& a, const Matrix& softmax); Matrix& AssignNceUnnormalizedEval(const Matrix& a, const Matrix& b, const Matrix& c, const Matrix& bias); Matrix Transpose(); // This method doesn't change state of Matrix. It should be a const function diff --git a/Math/Math/NoGPU.cpp b/Math/Math/NoGPU.cpp index 888312599133..b24d33f20a70 100644 --- a/Math/Math/NoGPU.cpp +++ b/Math/Math/NoGPU.cpp @@ -1070,6 +1070,10 @@ namespace Microsoft { } + template + void GPUMatrix::AssignSoftmaxSum(const GPUMatrix& a, GPUMatrix& c) + { + } template void GPUMatrix::AssignNCEUnnormalizedEval(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c)
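// A minimal CPU-side sketch of the reduction that the new AssignSoftmaxSum code above performs,
// for readers who want the semantics without the CUDA details. The function and variable names
// here are illustrative assumptions, not part of the patch: 'labels[i]' plays the role of the
// label matrix entry a(0, i), and 'logSoftmax[i][k]' plays the role of softmax(i, k) in the
// CPU implementation shown earlier in this diff.
#include <cstddef>
#include <vector>

double SoftmaxSumSketch(const std::vector<int>& labels,
                        const std::vector<std::vector<double>>& logSoftmax)
{
    double logLikelihood = 0.0;
    for (std::size_t i = 0; i < labels.size(); i++)
    {
        // pick the log-softmax score of the correct class for instance i
        logLikelihood += logSoftmax[i][static_cast<std::size_t>(labels[i])];
    }

    // the criterion stored in c(0, 0) is the negated sum
    return -logLikelihood;
}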