diff --git a/Source/ActionsLib/ActionsLib.vcxproj.filters b/Source/ActionsLib/ActionsLib.vcxproj.filters
index 38cc00b65315..030da8855a28 100644
--- a/Source/ActionsLib/ActionsLib.vcxproj.filters
+++ b/Source/ActionsLib/ActionsLib.vcxproj.filters
@@ -13,15 +13,15 @@
Actions
-
- Actions
-
Actions
Actions
+
+ Actions
+
diff --git a/Source/CNTK/CNTK.vcxproj b/Source/CNTK/CNTK.vcxproj
index cf2ca3d3b392..df1ad3f7ce93 100644
--- a/Source/CNTK/CNTK.vcxproj
+++ b/Source/CNTK/CNTK.vcxproj
@@ -174,8 +174,6 @@
-
-
diff --git a/Source/CNTK/CNTK.vcxproj.filters b/Source/CNTK/CNTK.vcxproj.filters
index e00025801910..e794c5e10eb1 100644
--- a/Source/CNTK/CNTK.vcxproj.filters
+++ b/Source/CNTK/CNTK.vcxproj.filters
@@ -175,9 +175,6 @@
from ComputationNetworkLib\Network
-
- from ComputationNetworkLib\Network
-
from ComputationNetworkLib\Nodes
@@ -187,9 +184,6 @@
from ComputationNetworkLib\Nodes
-
- from ComputationNetworkLib\Network
-
from ComputationNetworkLib\Nodes
diff --git a/Source/SGDLib/IComputationNetBuilder.h b/Source/SGDLib/IComputationNetBuilder.h
deleted file mode 100644
index 2dabf55e3a3f..000000000000
--- a/Source/SGDLib/IComputationNetBuilder.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#if 1 // only needed for some unused code in MultiNetworksSGD.h
-//
-// <copyright file="IComputationNetBuilder.h" company="Microsoft">
-//     Copyright (c) Microsoft Corporation. All rights reserved.
-// </copyright>
-//
-#pragma once
-
-#include "ComputationNetwork.h"
-#include <string>
-
-namespace Microsoft { namespace MSR { namespace CNTK {
-
-// This interface provides only one method: BuildNetworkFromDescription().
-// There are two variants currently:
-// - SimpleNetworkBuilder: standard networks built from a few parameters
-// - NDLNetworkBuilder: networks built using the old CNTK NDL
-// The use of this interface is very local (eventually it will be local to DoTrain() only), so there will no longer be a need to even have this interface.
-// Models created through BrainScript (or Python) do not go through this interface.
-
-template <class ElemType>
-/*interface*/ struct IComputationNetBuilder
-{
- virtual ComputationNetworkPtr BuildNetworkFromDescription(ComputationNetwork* = nullptr) = 0;
- virtual ~IComputationNetBuilder(){};
-};
-}
-}
-}
-#endif
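
[editor's note] The interface deleted above existed solely so that DoTrain()-era driver code could obtain a network polymorphically from either SimpleNetworkBuilder or NDLNetworkBuilder. A minimal sketch of that dispatch pattern, assuming the CNTK types named in the deleted header; the free function BuildFromBuilder is a hypothetical illustration, not CNTK API:

```cpp
// Sketch only: how trainer code consumed the deleted IComputationNetBuilder.
// Assumes ComputationNetworkPtr and the interface exactly as deleted above.
template <class ElemType>
ComputationNetworkPtr BuildFromBuilder(IComputationNetBuilder<ElemType>& builder,
                                       ComputationNetwork* priorNet = nullptr)
{
    // Single virtual entry point; a non-null priorNet lets a decoder builder
    // attach to an already-built encoder network (see EncoderDecoder() in
    // MultiNetworksSGD.h below).
    return builder.BuildNetworkFromDescription(priorNet);
}
```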
diff --git a/Source/SGDLib/MultiNetworksEvaluator.h b/Source/SGDLib/MultiNetworksEvaluator.h
deleted file mode 100644
index 2b9604d295c5..000000000000
--- a/Source/SGDLib/MultiNetworksEvaluator.h
+++ /dev/null
@@ -1,1085 +0,0 @@
-//
-// Copyright (c) Microsoft. All rights reserved.
-// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
-//
-// MultiNetworksEvaluator/SGD -- This represents earlier efforts to use CNTK for sequence-to-sequence modeling. This is no longer the intended design.
-//
-#pragma once
-
-#include "Basics.h"
-#include "Helpers.h" // for foreach_column() macro
-#include "fileutil.h"
-#include "DataReader.h"
-#include "DataWriter.h"
-#include "ComputationNetwork.h"
-#include "DataReaderHelpers.h"
-#include "SimpleEvaluator.h"
-#include "TrainingCriterionNodes.h" // TODO: we should move the functions that depend on these to the .cpp
-#include "CompositeComputationNodes.h"
-#include <vector>
-#include <string>
-#include <stdexcept>
-#include <fstream>
-#include <queue>
-
-using namespace std;
-
-namespace Microsoft { namespace MSR { namespace CNTK {
-
-template <class ElemType>
-struct NN_state
-{
- map<wstring, Matrix<ElemType>> hidden_activity;
-};
-
-template <class ElemType>
-struct Token
-{
- Token(const double score, const std::vector<size_t>& sequence, const NN_state<ElemType>& state)
- : score(score), sequence(sequence), state(state)
- {
- }
- bool operator<(const Token<ElemType>& t) const
- {
- return score < t.score;
- }
- double score;
- vector<size_t> sequence;
- NN_state<ElemType> state;
-};
-
-template <class ElemType>
-class MultiNetworksEvaluator : public SimpleEvaluator<ElemType>
-{
- typedef SimpleEvaluator<ElemType> Base;
- using Base::m_net;
- using Base::m_numMBsToShowResult;
- using Base::m_traceLevel;
- using Base::DisplayEvalStatistics;
- typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;
- typedef ClassBasedCrossEntropyWithSoftmaxNode<ElemType>* ClassBasedCrossEntropyWithSoftmaxNodePtr;
-
-public:
- MultiNetworksEvaluator(ComputationNetworkPtr net, const size_t numMBsToShowResult = 100, const int traceLevel = 0)
- : Base(net, numMBsToShowResult, traceLevel)
- {
- }
-
- //returns error rate
- // This was a special early implementation of RNNs by emulating them as a DNN.
- // The code is very restricted to simple RNNs.
- // The idea could be used for more complicated networks, but one would need to know which nodes are stateful or time-dependent, so that unrolling is done correctly to represent recurrent networks.
- // TODO: can probably be removed.
- double EvaluateUnroll(IDataReader<ElemType>* dataReader, const size_t mbSize, double& evalSetCrossEntropy, const wchar_t* output = nullptr, const size_t testSize = requestDataSize)
- {
- std::vector<ComputationNodeBasePtr>& featureNodes = m_net->FeatureNodes();
- std::vector<ComputationNodeBasePtr>& labelNodes = m_net->LabelNodes();
- std::vector<ComputationNodeBasePtr>& criterionNodes = m_net->FinalCriterionNodes();
- std::vector<ComputationNodeBasePtr>& evaluationNodes = m_net->EvaluationNodes();
-
- if (criterionNodes.size() == 0)
- RuntimeError("No CrossEntropyWithSoftmax node found\n");
- if (evaluationNodes.size() == 0)
- RuntimeError("No Evaluation node found\n");
-
- std::map<std::wstring, Matrix<ElemType>*> inputMatrices;
- for (size_t i = 0; i < featureNodes.size(); i++)
- inputMatrices[featureNodes[i]->NodeName()] = &dynamic_pointer_cast<ComputationNode<ElemType>>(featureNodes[i])->Value();
- for (size_t i = 0; i < labelNodes.size(); i++)
- inputMatrices[labelNodes[i]->NodeName()] = &dynamic_pointer_cast<ComputationNode<ElemType>>(labelNodes[i])->Value();
- inputMatrices[L"numberobs"] = new Matrix<ElemType>(1, 1, m_net->GetDeviceId());
-
- dataReader->StartMinibatchLoop(mbSize, 0, testSize);
- m_net->StartEvaluateMinibatchLoop(criterionNodes, evaluationNodes);
-
- double epochEvalError = 0;
- double epochCrossEntropy = 0;
- size_t totalEpochSamples = 0;
- double prevEpochEvalError = 0;
- double prevEpochCrossEntropy = 0;
- size_t prevTotalEpochSamples = 0;
- size_t prevStart = 1;
- size_t numSamples = 0;
- double crossEntropy = 0;
- double evalError = 0;
-
- ofstream outputStream;
- if (output)
- {
-#ifdef _MSC_VER
- outputStream.open(output);
-#else
- outputStream.open(wtocharpath(output).c_str()); // GCC does not implement wide-char pathnames here
-#endif
- }
-
- size_t numMBsRun = 0;
- size_t actualMBSize = 0;
- while (dataReader->GetMinibatch(inputMatrices))
- {
- // TODO: we should use GetMinibatchIntoNetwork(), but it seems tricky. What is this for?
- size_t nbrSamples = (size_t) (*inputMatrices[L"numberobs"])(0, 0);
- actualMBSize = nbrSamples;
-
- for (int npos = 0; npos < nbrSamples; npos++)
- {
- featureNodes[npos]->BumpEvalTimeStamp();
- labelNodes[npos]->BumpEvalTimeStamp();
-
- m_net->ForwardProp(criterionNodes[npos]); //use only the first criterion. Is there any possibility to use more?
-
- m_net->ForwardProp(evaluationNodes[npos]);
-
- double mbCrossEntropy = (double) criterionNodes[npos]->Get00Element(); // criterionNode should be a scalar
- epochCrossEntropy += mbCrossEntropy;
-
- double mbEvalError = (double) evaluationNodes[npos]->Get00Element(); //criterionNode should be a scalar
-
- epochEvalError += mbEvalError;
- }
-
- totalEpochSamples += actualMBSize;
-
- if (outputStream.is_open())
- {
- //TODO: add support to dump multiple outputs
- ComputationNodePtr outputNode = dynamic_pointer_cast<ComputationNode<ElemType>>(m_net->OutputNodes()[0]);
- foreach_column (j, outputNode->Value())
- {
- foreach_row (i, outputNode->Value())
- outputStream << outputNode->Value()(i, j) << " ";
- outputStream << endl;
- }
- }
-
- numMBsRun++;
- if (numMBsRun % m_numMBsToShowResult == 0)
- {
- numSamples = (totalEpochSamples - prevTotalEpochSamples);
- crossEntropy = epochCrossEntropy - prevEpochCrossEntropy;
- evalError = epochEvalError - prevEpochEvalError;
-
- fprintf(stderr, "Minibatch[%lu-%lu]: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n",
- prevStart, numMBsRun, numSamples, evalError / numSamples, crossEntropy / numSamples);
-
- prevTotalEpochSamples = totalEpochSamples;
- prevEpochCrossEntropy = epochCrossEntropy;
- prevEpochEvalError = epochEvalError;
- prevStart = numMBsRun + 1;
- }
- }
-
- // show final grouping of output
- numSamples = totalEpochSamples - prevTotalEpochSamples;
- if (numSamples > 0)
- {
- crossEntropy = epochCrossEntropy - prevEpochCrossEntropy;
- evalError = epochEvalError - prevEpochEvalError;
- fprintf(stderr, "Minibatch[%lu-%lu]: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n",
- prevStart, numMBsRun, numSamples, evalError / numSamples, crossEntropy / numSamples);
- }
-
- //final statistics
- epochEvalError /= (double) totalEpochSamples;
- epochCrossEntropy /= (double) totalEpochSamples;
- fprintf(stderr, "Overall: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n", totalEpochSamples, epochEvalError, epochCrossEntropy);
- if (outputStream.is_open())
- {
- outputStream.close();
- }
- evalSetCrossEntropy = epochCrossEntropy;
- return epochEvalError;
- }
-
-public:
- /// for encoder-decoder RNN
- list<pair<wstring, wstring>> m_lst_pair_encoder_decode_node_names;
- list<pair<ComputationNodeBasePtr, ComputationNodeBasePtr>> m_lst_pair_encoder_decoder_nodes;
-
- void SetEncoderDecoderNodePairs(std::list<pair<ComputationNodeBasePtr, ComputationNodeBasePtr>>& lst_pair_encoder_decoder_nodes)
- {
- m_lst_pair_encoder_decoder_nodes.clear();
- for (typename std::list<pair<ComputationNodeBasePtr, ComputationNodeBasePtr>>::iterator iter = lst_pair_encoder_decoder_nodes.begin(); iter != lst_pair_encoder_decoder_nodes.end(); iter++)
- m_lst_pair_encoder_decoder_nodes.push_back(*iter);
- }
-
- /**
- this evaluates the encoder networks and the decoder network;
- only beam search decoding is applied to the last network
- */
- double EvaluateEncoderDecoderWithHiddenStates(
- vector<ComputationNetworkPtr> nets,
- vector<IDataReader<ElemType>*> dataReaders,
- const size_t mbSize,
- const size_t testSize = requestDataSize)
- {
- size_t iNumNets = nets.size();
-
- ComputationNetworkPtr decoderNet = nullptr;
- IDataReader<ElemType>* decoderDataReader = dataReaders[iNumNets - 1];
- decoderNet = nets[iNumNets - 1];
-
- const auto& decoderEvaluationNodes = decoderNet->EvaluationNodes();
-
- double evalResults = 0;
-
- vector<map<wstring, Matrix<ElemType>*>*> inputMatrices;
- for (auto ptr = nets.begin(); ptr != nets.end(); ptr++)
- {
- const auto& featNodes = (*ptr)->FeatureNodes();
- const auto& lablPtr = (*ptr)->LabelNodes();
- map<wstring, Matrix<ElemType>*>* pMap = new map<wstring, Matrix<ElemType>*>();
- for (auto pf = featNodes.begin(); pf != featNodes.end(); pf++)
- {
- (*pMap)[(*pf)->NodeName()] = &dynamic_pointer_cast<ComputationNode<ElemType>>(*pf)->Value();
- }
- for (auto pl = lablPtr.begin(); pl != lablPtr.end(); pl++)
- {
- (*pMap)[(*pl)->NodeName()] = &(dynamic_pointer_cast<ComputationNode<ElemType>>(*pl)->Value());
- }
- inputMatrices.push_back(pMap);
- }
-
- //evaluate through minibatches
- size_t totalEpochSamples = 0;
- size_t numMBsRun = 0;
- size_t actualMBSize = 0;
- size_t numSamplesLastMBs = 0;
- size_t lastMBsRun = 0; //MBs run before this display
-
- double evalResultsLastMBs = (double) 0;
-
- for (auto ptr = dataReaders.begin(); ptr != dataReaders.end(); ptr++)
- (*ptr)->StartMinibatchLoop(mbSize, 0, testSize);
- // BUGBUG: Code below will fail because we now must call StartMinibatchLoop(), but I can't tell from below which nodes to call it for.
- //for (auto & ptr : nets)
- // ptr->StartMinibatchLoop(xxx);
-
- bool bContinueDecoding = true;
- while (bContinueDecoding)
- {
-
- /// load data
- auto pmat = inputMatrices.begin();
- bool bNoMoreData = false;
- for (auto ptr = dataReaders.begin(); ptr != dataReaders.end(); ptr++, pmat++)
- {
- if ((*ptr)->GetMinibatch(*(*pmat)) == false)
- {
- bNoMoreData = true;
- break;
- }
- }
- if (bNoMoreData)
- break;
-
- for (auto ptr = nets.begin(); ptr != nets.end(); ptr++)
- {
- const auto& featNodes = (*ptr)->FeatureNodes();
- ComputationNetwork::BumpEvalTimeStamp(featNodes);
- }
-
- auto preader = dataReaders.begin();
- for (auto ptr = nets.begin(); ptr != nets.end(); ptr++, preader++)
- {
- actualMBSize = (*ptr)->DetermineActualMBSizeFromFeatures();
- if (actualMBSize == 0)
- LogicError("decoderTrainSetDataReader read data but encoderNet reports no data read");
- (*preader)->CopyMBLayoutTo((*ptr)->GetMBLayoutPtr());
- (*ptr)->VerifyActualNumParallelSequences((*preader)->GetNumParallelSequences());
-
- const auto& pairs = (*ptr)->PairNodes();
- for (auto ptr2 = pairs.begin(); ptr2 != pairs.end(); ptr2++)
- (*ptr)->ForwardProp(*ptr2);
- }
-
- decoderNet = nets[iNumNets - 1];
- /// not the sentence beginning, because the initial hidden layer activity is from the encoder network
- actualMBSize = decoderNet->DetermineActualMBSizeFromFeatures();
- if (actualMBSize == 0)
- LogicError("decoderTrainSetDataReader read data but decoderNet reports no data read");
- decoderDataReader->CopyMBLayoutTo(decoderNet->GetMBLayoutPtr());
- decoderNet->VerifyActualNumParallelSequences(decoderDataReader->GetNumParallelSequences());
-
- size_t i = 0;
- assert(decoderEvaluationNodes.size() == 1);
- if (decoderEvaluationNodes.size() != 1)
- {
- LogicError("Decoder should have only one evaluation node");
- }
-
- for (auto ptr = decoderEvaluationNodes.begin(); ptr != decoderEvaluationNodes.end(); ptr++, i++)
- {
- decoderNet->ForwardProp(*ptr);
- if ((*ptr)->GetSampleLayout().GetNumElements() != 1)
- LogicError("EvaluateEncoderDecoderWithHiddenStates: decoder evaluation should return a scalar value");
-
- evalResults += (double) (*ptr)->Get00Element();
- }
-
- totalEpochSamples += actualMBSize;
- numMBsRun++;
-
- if (m_traceLevel > 0)
- {
- numSamplesLastMBs += actualMBSize;
-
- if (numMBsRun % m_numMBsToShowResult == 0)
- {
- DisplayEvalStatistics(lastMBsRun + 1, numMBsRun, numSamplesLastMBs, decoderEvaluationNodes, evalResults, evalResultsLastMBs);
-
- evalResultsLastMBs = evalResults;
-
- numSamplesLastMBs = 0;
- lastMBsRun = numMBsRun;
- }
- }
-
- /// call DataEnd to check if end of sentence is reached
- /// datareader will do its necessary/specific process for sentence ending
- for (auto ptr = dataReaders.begin(); ptr != dataReaders.end(); ptr++)
- {
- (*ptr)->DataEnd(endDataSentence);
- }
- }
-
- // show last batch of results
- if (m_traceLevel > 0 && numSamplesLastMBs > 0)
- {
- DisplayEvalStatistics(lastMBsRun + 1, numMBsRun, numSamplesLastMBs, decoderEvaluationNodes, evalResults, evalResultsLastMBs);
- }
-
- //final statistics
- evalResultsLastMBs = 0;
-
- fprintf(stderr, "Final Results: ");
- DisplayEvalStatistics(1, numMBsRun, totalEpochSamples, decoderEvaluationNodes, evalResults, evalResultsLastMBs, true);
-
- evalResults /= totalEpochSamples;
-
- for (auto ptr = inputMatrices.begin(); ptr != inputMatrices.end(); ptr++)
- {
- delete *ptr;
- }
-
- return evalResults;
- }
-
- // TODO: This stuff must all be removed from SimpleEvaluator, as this is not simple at all!!
- void InitTrainEncoderDecoderWithHiddenStates(const ConfigParameters& readerConfig)
- {
- ConfigArray arrEncoderNodeNames = readerConfig(L"encoderNodes", "");
- vector<wstring> encoderNodeNames;
-
- m_lst_pair_encoder_decode_node_names.clear();
- ;
-
- if (arrEncoderNodeNames.size() > 0)
- {
- /// newer code that explicitly places multiple streams for inputs
- foreach_index (i, arrEncoderNodeNames) // inputNames should map to node names
- {
- wstring nodeName = arrEncoderNodeNames[i];
- encoderNodeNames.push_back(nodeName);
- }
- }
-
- ConfigArray arrDecoderNodeNames = readerConfig(L"decoderNodes", "");
- vector<wstring> decoderNodeNames;
- if (arrDecoderNodeNames.size() > 0)
- {
- /// newer code that explicitly places multiple streams for inputs
- foreach_index (i, arrDecoderNodeNames) // inputNames should map to node names
- {
- wstring nodeName = arrDecoderNodeNames[i];
- decoderNodeNames.push_back(nodeName);
- }
- }
-
- assert(encoderNodeNames.size() == decoderNodeNames.size());
-
- for (size_t i = 0; i < encoderNodeNames.size(); i++)
- {
- m_lst_pair_encoder_decode_node_names.push_back(make_pair(encoderNodeNames[i], decoderNodeNames[i]));
- }
- }
-
- void EncodingEvaluateDecodingBeamSearch(
- vector<ComputationNetworkPtr> nets,
- vector<IDataReader<ElemType>*> readers,
- IDataWriter<ElemType>& dataWriter,
- const vector<wstring>& evalNodeNames,
- const vector<wstring>& writeNodeNames,
- const size_t mbSize, const double beam, const size_t testSize)
- {
- size_t iNumNets = nets.size();
- if (iNumNets < 2)
- {
- LogicError("Has to have at least two networks");
- }
-
- ComputationNetworkPtr decoderNet = nets[iNumNets - 1];
- IDataReader<ElemType>* encoderDataReader = readers[iNumNets - 2];
- IDataReader<ElemType>* decoderDataReader = readers[iNumNets - 1];
- vector<ComputationNodeBasePtr>& decoderFeatureNodes = decoderNet->FeatureNodes();
-
- //specify output nodes and files
- std::vector<ComputationNodeBasePtr> outputNodes;
- for (auto ptr = evalNodeNames.begin(); ptr != evalNodeNames.end(); ptr++)
- outputNodes.push_back(decoderNet->GetNodeFromName(*ptr));
-
- //specify nodes to write to file
- std::vector<ComputationNodeBasePtr> writeNodes;
- for (int i = 0; i < writeNodeNames.size(); i++)
- writeNodes.push_back(m_net->GetNodeFromName(writeNodeNames[i]));
-
- //prepare features and labels
- std::map<std::wstring, Matrix<ElemType>*> inputMatrices;
- std::map<std::wstring, Matrix<ElemType>*> decoderInputMatrices;
- for (auto ptr = nets.begin(); ptr != nets.end() - 1; ptr++)
- {
- const auto& featNodes = (*ptr)->FeatureNodes();
- for (auto ptr2 = featNodes.begin(); ptr2 != featNodes.end(); ptr2++)
- inputMatrices[(*ptr2)->NodeName()] = &dynamic_pointer_cast<ComputationNode<ElemType>>(*ptr2)->Value();
-
- const auto& lablNodes = (*ptr)->LabelNodes();
- for (auto ptr2 = lablNodes.begin(); ptr2 != lablNodes.end(); ptr2++)
- inputMatrices[(*ptr2)->NodeName()] = &dynamic_pointer_cast<ComputationNode<ElemType>>(*ptr2)->Value();
- }
-
- /// for the last network
- auto ptr = nets.end() - 1;
- const auto& featNodes = (*ptr)->FeatureNodes();
- for (auto ptr2 = featNodes.begin(); ptr2 != featNodes.end(); ptr2++)
- decoderInputMatrices[(*ptr2)->NodeName()] = &dynamic_pointer_cast<ComputationNode<ElemType>>(*ptr2)->Value();
-
- const auto& lablNodes = (*ptr)->LabelNodes();
- for (auto ptr2 = lablNodes.begin(); ptr2 != lablNodes.end(); ptr2++)
- decoderInputMatrices[(*ptr2)->NodeName()] = &dynamic_pointer_cast<ComputationNode<ElemType>>(*ptr2)->Value();
-
- //evaluate through minibatches
- size_t totalEpochSamples = 0;
- size_t actualMBSize = 0;
-
- for (auto ptr = readers.begin(); ptr != readers.end(); ptr++)
- {
- (*ptr)->StartMinibatchLoop(mbSize, 0, testSize);
- (*ptr)->SetNumParallelSequences(1);
- }
-
- Matrix<ElemType> historyMat(m_net->GetDeviceId());
-
- bool bDecoding = true;
- while (bDecoding)
- {
- bool noMoreData = false;
- /// only get minibatch on the encoder parts of networks
- size_t k = 0;
- for (auto ptr = readers.begin(); ptr != readers.end() - 1; ptr++, k++)
- {
- if ((*ptr)->GetMinibatch(inputMatrices) == false)
- {
- noMoreData = true;
- break;
- }
- }
- if (noMoreData)
- break;
-
- for (auto ptr = nets.begin(); ptr != nets.end() - 1; ptr++)
- {
- /// only on the encoder part of the networks
- const auto& featNodes = (*ptr)->FeatureNodes();
- ComputationNetwork::BumpEvalTimeStamp(featNodes);
- }
-
- auto ptrreader = readers.begin();
- size_t mNutt = 0;
- for (auto ptr = nets.begin(); ptr != nets.end() - 1; ptr++, ptrreader++)
- {
- /// evaluate on the encoder networks
- actualMBSize = (*ptr)->DetermineActualMBSizeFromFeatures();
-
- mNutt = (*ptrreader)->GetNumParallelSequences();
- (*ptrreader)->CopyMBLayoutTo((*ptr)->GetMBLayoutPtr());
- (*ptr)->VerifyActualNumParallelSequences(mNutt);
-
- const auto& pairs = (*ptr)->PairNodes();
- for (auto ptr2 = pairs.begin(); ptr2 != pairs.end(); ptr2++)
- (*ptr)->ForwardProp(*ptr2);
- }
-
-/// not the sentence beginning, because the initial hidden layer activity is from the encoder network
-//decoderNet->ResizeAllFeatureNodes(actualMBSize); // BUGBUG: Function was deleted, but this may be necessary.
-#if 0 // What this ^^ used to be:
- // only called from MultiNetworksEvaluator
- // a helper function for some places that like to hack the features directly
- // This is for a few places (FindBestPath stuff) that don't follow the normal pattern but instead called the old SetFeaturesMiniBatchSize() function with a value of their choosing.
- // This is now changed in that they must actually resize the features, and then the system takes it from here.
- // UNTESTED stopgap. Most likely places that are never used.
- // This function does not actually allocate the matrices. I don't know whether that currently happens correctly.
- void ResizeAllFeatureNodes(size_t cols)
- {
- auto & featureNodes = FeatureNodes();
- for (auto & nodeIter : featureNodes)
- nodeIter->SetNumCols(cols);
- }
-
-#endif
- //decoderNet->SetActualMiniBatchSizeFromFeatures();
- encoderDataReader->CopyMBLayoutTo(decoderNet->GetMBLayoutPtr());
- decoderNet->VerifyActualNumParallelSequences(mNutt);
-
- vector<size_t> best_path;
- FindBestPathWithVariableLength(decoderNet, actualMBSize, decoderDataReader, dataWriter, outputNodes, writeNodes, decoderFeatureNodes, beam, &decoderInputMatrices, best_path);
-
- totalEpochSamples += actualMBSize;
-
- /// call DataEnd to check if end of sentence is reached
- /// datareader will do its necessary/specific process for sentence ending
- for (auto ptr = readers.begin(); ptr != readers.end(); ptr++)
- (*ptr)->DataEnd(endDataSentence);
- }
- }
-
- template <typename T>
- static inline bool comparator(const pair<int, T>& l, const pair<int, T>& r)
- {
- return l.second > r.second;
- }
-
- bool GetCandidatesAtOneTimeInstance(const Matrix<ElemType>& score,
- const double& preScore, const double& threshold,
- const double& best_score_so_far,
- vector<pair<int, double>>& rCandidate)
- {
- Matrix<ElemType> ptrScore(CPUDEVICE);
- ptrScore = score;
-
- ElemType* pPointer = ptrScore.BufferPointer();
- vector<pair<int, ElemType>> tPairs;
- for (int i = 0; i < ptrScore.GetNumElements(); i++)
- {
- tPairs.push_back(make_pair(i, pPointer[i]));
- // assert(pPointer[i] <= 1.0); /// we work on the posterior probability, so every score should be smaller than 1.0
- }
-
- std::sort(tPairs.begin(), tPairs.end(), comparator<ElemType>);
-
- bool bAboveThreshold = false;
- for (typename vector<pair<int, ElemType>>::iterator itr = tPairs.begin(); itr != tPairs.end(); itr++)
- {
- if (itr->second < 0.0)
- LogicError("This means to use probability so the value should be non-negative");
-
- double dScore = (itr->second > (double) EPS_IN_LOG) ? log(itr->second) : (double) LOG_OF_EPS_IN_LOG;
-
- dScore += preScore;
- if (dScore >= threshold && dScore >= best_score_so_far)
- {
- rCandidate.push_back(make_pair(itr->first, dScore));
- bAboveThreshold = true;
- }
- else
- {
- break;
- }
- }
-
- return bAboveThreshold;
- }
-
- // retrieve activity at time atTime.
- // note that the function values returned are a single column
- void PreComputeActivityAtTime(size_t atTime)
- {
- for (auto nodeIter = batchComputeNodes.begin(); nodeIter != batchComputeNodes.end(); nodeIter++)
- {
- ComputationNodeBasePtr node = *nodeIter;
- node->ForwardProp(FrameRange(node->GetMBLayout(), atTime));
- if (node->GetSampleMatrixNumCols() != node->GetNumParallelSequences())
- RuntimeError("preComputeActivityAtTime: the function values has to be a single column matrix ");
- }
- }
-
- // (only called by FindBestPath...())
- void ResetPreCompute()
- {
- //mark false
- for (auto nodeIter = batchComputeNodes.begin(); nodeIter != batchComputeNodes.end(); nodeIter++)
- {
- auto node = static_pointer_cast<BatchModeNode<ElemType>>(*nodeIter);
- node->MarkComputed(false);
- }
- }
-
- //return true if precomputation is executed.
- bool EvaluateBatchModeNodes(ComputationNetwork& net,
- const std::vector<ComputationNodeBasePtr>& featureNodes)
- {
- batchComputeNodes = net.GetNodesRequiringBatchMode();
-
- if (batchComputeNodes.size() == 0)
- {
- return false;
- }
-
- ComputationNetwork::BumpEvalTimeStamp(featureNodes);
-
- net.StartEvaluateMinibatchLoop(batchComputeNodes); // TODO: Is this correct? There is no StartMinibatchLoop() for a reader.
-
- //net.SetActualMiniBatchSizeFromFeatures();
- for (auto nodeIter = batchComputeNodes.begin(); nodeIter != batchComputeNodes.end(); nodeIter++)
- net.ForwardProp(*nodeIter);
-
- //mark done
- for (auto nodeIter = batchComputeNodes.begin(); nodeIter != batchComputeNodes.end(); nodeIter++)
- {
- auto node = static_pointer_cast<BatchModeNode<ElemType>>(*nodeIter);
- node->MarkComputed(true);
- }
-
- return true;
- }
-
- void WriteNbest(const size_t nidx, const vector<size_t>& best_path,
- const std::vector<ComputationNodeBasePtr>& outputNodes, IDataWriter<ElemType>& dataWriter)
- {
- assert(outputNodes.size() == 1);
- std::map<std::wstring, void*, nocase_compare> outputMatrices;
- size_t bSize = best_path.size();
- for (int i = 0; i < outputNodes.size(); i++)
- {
-#if 0 // This call no longer exists. This must be updated to make it functional again.
- outputNodes[i]->SetNumCols(bSize);
-#endif
- dynamic_pointer_cast<ComputationNode<ElemType>>(outputNodes[i])->UpdateFunctionValuesSize();
- dynamic_pointer_cast<ComputationNode<ElemType>>(outputNodes[i])->Value().SetValue(0);
- for (int k = 0; k < bSize; k++)
- dynamic_pointer_cast<ComputationNode<ElemType>>(outputNodes[i])->Value().SetValue(best_path[k], k, 1.0);
- outputMatrices[outputNodes[i]->NodeName()] = (void*) (&dynamic_pointer_cast<ComputationNode<ElemType>>(outputNodes[i])->Value());
- // TODO: void* --really?
- }
-
- dataWriter.SaveData(nidx, outputMatrices, bSize, bSize, 0);
- }
-
- void BeamSearch(IDataReader<ElemType>* dataReader, IDataWriter<ElemType>& dataWriter, const vector<wstring>& outputNodeNames, const vector<wstring>& writeNodeNames, const size_t mbSize, const double beam, const size_t testSize)
- {
- clock_t startReadMBTime = 0, endComputeMBTime = 0;
-
- //specify output nodes and files
- std::vector<ComputationNodeBasePtr> outputNodes;
- for (int i = 0; i < outputNodeNames.size(); i++)
- outputNodes.push_back(m_net->GetNodeFromName(outputNodeNames[i]));
-
- //specify nodes to write to file
- std::vector<ComputationNodeBasePtr> writeNodes;
- for (int i = 0; i < writeNodeNames.size(); i++)
- writeNodes.push_back(m_net->GetNodeFromName(writeNodeNames[i]));
-
- //prepare features and labels
- /*const*/ auto& featureNodes = m_net->FeatureNodes();
- const auto& labelNodes = m_net->LabelNodes();
-
- std::map<std::wstring, Matrix<ElemType>*> inputMatrices;
- for (size_t i = 0; i < featureNodes.size(); i++)
- inputMatrices[featureNodes[i]->NodeName()] = &dynamic_pointer_cast<ComputationNode<ElemType>>(featureNodes[i])->Value();
- for (size_t i = 0; i < labelNodes.size(); i++)
- inputMatrices[labelNodes[i]->NodeName()] = &dynamic_pointer_cast<ComputationNode<ElemType>>(labelNodes[i])->Value();
-
- //evaluate through minibatches
- size_t totalEpochSamples = 0;
- size_t actualMBSize = 0;
-
- dataReader->StartMinibatchLoop(mbSize, 0, testSize);
- dataReader->SetNumParallelSequences(1);
-
- startReadMBTime = clock();
- size_t numMBsRun = 0;
- double ComputeTimeInMBs = 0;
- while (DataReaderHelpers::GetMinibatchIntoNetwork(*dataReader, m_net, nullptr, false, false, inputMatrices, actualMBSize))
- {
- // note: GetMinibatchIntoNetwork() will also fetch the MBLayout although we don't need it here. This should not hurt.
- ComputationNetwork::BumpEvalTimeStamp(featureNodes);
- //actualMBSize = m_net->SetActualMiniBatchSizeFromFeatures();
-
- vector<size_t> best_path;
-
- FindBestPath(m_net, dataReader,
- dataWriter, outputNodes,
- writeNodes, featureNodes,
- beam, &inputMatrices, best_path);
-
- totalEpochSamples += actualMBSize;
-
- /// call DataEnd to check if end of sentence is reached
- /// datareader will do its necessary/specific process for sentence ending
- dataReader->DataEnd(endDataSentence);
-
- endComputeMBTime = clock();
- numMBsRun++;
-
- if (m_traceLevel > 0)
- {
- double MBComputeTime = (double) (endComputeMBTime - startReadMBTime) / CLOCKS_PER_SEC;
-
- ComputeTimeInMBs += MBComputeTime;
-
- fprintf(stderr, "Sentences Seen = %zd; Samples seen = %zd; Total Compute Time = %.8g ; Time Per Sample=%.8g\n", numMBsRun, totalEpochSamples, ComputeTimeInMBs, ComputeTimeInMBs / totalEpochSamples);
- }
-
- startReadMBTime = clock();
- }
-
- fprintf(stderr, "done decoding\n");
- }
-
- void FindBestPath(ComputationNetworkPtr evalnet,
- IDataReader<ElemType>* dataReader, IDataWriter<ElemType>& dataWriter,
- const std::vector<ComputationNodeBasePtr>& evalNodes,
- const std::vector<ComputationNodeBasePtr>& outputNodes,
- /*const*/ std::vector<ComputationNodeBasePtr>& featureNodes,
- const double beam,
- std::map<wstring, Matrix<ElemType>*>* inputMatrices,
- vector<size_t>& best_path)
- {
- assert(evalNodes.size() == 1);
-
- NN_state<ElemType> state;
- NN_state<ElemType> null_state;
-
- priority_queue<Token<ElemType>> n_bests; /// save n-bests
-
- /**
- loop over all the candidates for the featureDelayTarget,
- evaluate their scores, save their histories
- */
- priority_queue<Token<ElemType>> from_queue, to_queue;
- vector<double> evalResults;
-
- /// use reader to initialize evalnet's sentence start information to let it know that this
- /// is the beginning of the sentence
- size_t mbSize = evalnet->DetermineActualMBSizeFromFeatures();
- dataReader->CopyMBLayoutTo(evalnet->GetMBLayoutPtr());
- evalnet->VerifyActualNumParallelSequences(dataReader->GetNumParallelSequences());
-
- size_t maxMbSize = 2 * mbSize;
-
- clock_t start, now;
- start = clock();
-
- /// for the case of not using encoding, no previous state is available, except for the default hidden layer activities
- /// no need to get that history and later to set the history as there are default hidden layer activities
-
- from_queue.push(Token<ElemType>(0., vector<size_t>(), state)); /// the first element in the priority queue saves the initial NN state
-
- dataReader->InitProposals(inputMatrices);
- size_t itdx = 0;
- size_t maxSize = min(maxMbSize, mbSize);
-
- ResetPreCompute();
- EvaluateBatchModeNodes(*evalnet, featureNodes);
-
-/// need to set the minibatch size to 1, and initialize evalnet's sentence start information to let it know that this
-/// is the beginning of the sentence
-#if 0 // This call no longer exists. This must be updated to make it functional again.
- for (auto ptr = featureNodes.begin(); ptr != featureNodes.end(); ptr++)
- (*ptr)->SetNumCols(1);
-#endif
- // TODO: ^^ this is the same as ResizeAllFeatureNodes() if featureNodes == evalnet.FeatureNodes(). Is it?
- //evalnet->SetActualMiniBatchSizeFromFeatures();
-
- dataReader->CopyMBLayoutTo(evalnet->GetMBLayoutPtr()); // TODO: should this be one column only?
- /// need to set the sentence-beginning segmentation info
- // BUGBUG: I commented this out because these flags no longer exist. This code is no longer functional.
- //evalnet->GetMBLayoutPtr()->GetM().SetValue(((int) MinibatchPackingFlags::SequenceStart));
-
- for (itdx = 0; itdx < maxSize; itdx++)
- {
- double best_score = -numeric_limits<double>::infinity();
- vector<size_t> best_output_label;
-
- if (itdx > 0)
- {
- /// state needs to be carried over from the past time instance
- // BUGBUG: I commented this out because these flags no longer exist. This code is no longer functional. [fseide]
- //evalnet->GetMBLayoutPtr()->GetM().SetValue(((int) MinibatchPackingFlags::None));
- }
-
- PreComputeActivityAtTime(itdx);
-
- while (!from_queue.empty())
- {
- const Token<ElemType> from_token = from_queue.top();
- vector<size_t> history = from_token.sequence;
-
- /// update feature nodes once, as the observation is the same for all proposals in labels
- ComputationNetwork::BumpEvalTimeStamp(featureNodes);
-
- /// history is updated in the getproposalobs function
- dataReader->GetProposalObs(inputMatrices, itdx, history);
-
- /// get the nn state history and set nn state to the history
- map<wstring, Matrix<ElemType>> hidden_history = from_token.state.hidden_activity;
- evalnet->SetHistory(hidden_history);
-
- for (int i = 0; i < evalNodes.size(); i++)
- {
- evalnet->ForwardProp(evalNodes[i]);
- vector<pair<int, double>> retPair;
- if (GetCandidatesAtOneTimeInstance(dynamic_pointer_cast<ComputationNode<ElemType>>(evalNodes[i])->Value(), from_token.score, best_score - beam, -numeric_limits<double>::infinity(), retPair) == false)
- continue;
-
- evalnet->GetHistory(state.hidden_activity, true);
- for (typename vector<pair<int, double>>::iterator itr = retPair.begin(); itr != retPair.end(); itr++)
- {
- vector<size_t> history = from_token.sequence;
- history.push_back(itr->first);
- Token<ElemType> to_token(itr->second, history, state); /// save updated nn state and history
-
- to_queue.push(to_token);
-
- if (itr->second > best_score) /// update best score
- {
- best_score = itr->second;
- best_output_label = history;
- }
- }
-
- history = from_token.sequence; /// back to the from token's history
- }
-
- from_queue.pop();
- }
-
- if (to_queue.size() == 0)
- break;
-
- // beam pruning
- const double threshold = best_score - beam;
- while (!to_queue.empty())
- {
- if (to_queue.top().score >= threshold)
- from_queue.push(to_queue.top());
- to_queue.pop();
- }
- }
-
- // write back best path
- size_t ibest = 0;
- while (from_queue.size() > 0)
- {
- Token<ElemType> seq(from_queue.top().score, from_queue.top().sequence, from_queue.top().state);
-
- best_path.clear();
-
- assert(best_path.empty());
- best_path = seq.sequence;
- if (ibest == 0)
- WriteNbest(ibest, best_path, outputNodes, dataWriter);
-
-#ifdef DBG_BEAM_SEARCH
- WriteNbest(ibest, best_path, outputNodes, dataWriter);
- cout << " score = " << from_queue.top().score << endl;
-#endif
-
- from_queue.pop();
-
- ibest++;
- }
-
- now = clock();
- fprintf(stderr, "%.1f words per second\n", mbSize / ((double) (now - start) / 1000.0));
- }
-
- /**
- beam search decoder
- */
- double FindBestPathWithVariableLength(ComputationNetworkPtr evalnet,
- size_t inputLength,
- IDataReader<ElemType>* dataReader,
- IDataWriter<ElemType>& dataWriter,
- std::vector<ComputationNodeBasePtr>& evalNodes,
- std::vector<ComputationNodeBasePtr>& outputNodes,
- std::vector<ComputationNodeBasePtr>& featureNodes,
- const double beam,
- std::map<wstring, Matrix<ElemType>*>* inputMatrices,
- vector<size_t>& best_path)
- {
- assert(evalNodes.size() == 1);
-
- NN_state<ElemType> state;
- NN_state<ElemType> null_state;
-
- std::priority_queue<Token<ElemType>> n_bests; /// save n-bests
-
- /**
- loop over all the candidates for the featuredelayTarget,
- evaluate their scores, save their histories
- */
- std::priority_queue<Token<ElemType>> from_queue, to_queue;
- std::priority_queue<Token<ElemType>> result_queue;
- vector<double> evalResults;
-
- size_t mbSize = inputLength;
- /// use reader to initialize evalnet's sentence start information to let it know that this
- /// is the beginning of sentence
- //evalnet->ResizeAllFeatureNodes(mbSize); // BUGBUG: Function was deleted, but this may be necessary.
- //evalnet->SetActualMiniBatchSizeFromFeatures();
- // TODO: not setting MBLayout?
- evalnet->VerifyActualNumParallelSequences(dataReader->GetNumParallelSequences());
- // TODO: This is UNTESTED; if it fails, change ^^ this back to SetActual...()
-
- size_t maxMbSize = 3 * mbSize;
-#ifdef _DEBUG
- maxMbSize = 2;
-#endif
-
- clock_t start, now;
- start = clock();
-
- from_queue.push(Token<ElemType>(0., vector<size_t>(), state)); /// the first element in the priority queue saves the initial NN state
-
- /// the end of sentence symbol in reader
- int outputEOS = dataReader->GetSentenceEndIdFromOutputLabel();
- if (outputEOS < 0)
- LogicError("Cannot find end of sentence symbol. Check ");
-
- dataReader->InitProposals(inputMatrices);
-
- size_t itdx = 0;
-
- ResetPreCompute();
- EvaluateBatchModeNodes(*evalnet, featureNodes);
-
- /// need to set the minibatch size to 1, and initialize evalnet's sentence start information to let it know that this
- /// is the beginning of the sentence
- // BUGBUG: This is almost certainly wrong; slice != MB size
- //evalnet->SetActualMiniBatchSize(dataReader->GetNumParallelSequences());
- //evalnet->ResizeAllFeatureNodes(1); // BUGBUG: Function was deleted, but this may be necessary.
- //evalnet->SetActualMiniBatchSizeFromFeatures();
-
- double best_score = -numeric_limits<double>::infinity();
- double best_score_so_far = -numeric_limits<double>::infinity();
-
- // BUGBUG: I commented this out because these flags no longer exist. This code is no longer functional.
- //evalnet->GetMBLayoutPtr()->GetM().SetValue(((int) MinibatchPackingFlags::SequenceStart)); // BUGBUG: huh? How can the entire batch be start frames?
-
- for (itdx = 0; itdx < maxMbSize; itdx++)
- {
- double best_score = -numeric_limits<double>::infinity();
- vector<size_t> best_output_label;
-
- if (itdx > 0)
- {
- /// state needs to be carried over from the past time instance
- // BUGBUG: I commented this out because these flags no longer exist. This code is no longer functional.
- //evalnet->GetMBLayoutPtr()->GetM().SetValue(((int) MinibatchPackingFlags::None));
- }
-
- PreComputeActivityAtTime(itdx);
-
- while (!from_queue.empty())
- {
- const Token<ElemType> from_token = from_queue.top();
- vector<size_t> history = from_token.sequence;
-
- /// update feature nodes once, as the observation is the same for all proposals in labels
- ComputationNetwork::BumpEvalTimeStamp(featureNodes);
-
- /// history is updated in the getproposalobs function
- dataReader->GetProposalObs(inputMatrices, itdx, history);
-
- /// get the nn state history and set nn state to the history
- map<wstring, Matrix<ElemType>> hidden_history = from_token.state.hidden_activity;
- evalnet->SetHistory(hidden_history);
-
- for (int i = 0; i < evalNodes.size(); i++)
- {
- evalnet->ForwardProp(evalNodes[i]);
- vector<pair<int, double>> retPair;
- if (GetCandidatesAtOneTimeInstance(dynamic_pointer_cast<ComputationNode<ElemType>>(evalNodes[i])->Value(),
- from_token.score, best_score - beam, -numeric_limits<double>::infinity(), retPair) == false) // ==false??? !(.)?
- continue;
-
- evalnet->GetHistory(state.hidden_activity, true);
- for (typename vector<pair<int, double>>::iterator itr = retPair.begin(); itr != retPair.end(); itr++)
- {
- vector<size_t> history = from_token.sequence;
- history.push_back(itr->first);
-
- if (itr->first != outputEOS)
- {
- Token<ElemType> to_token(itr->second, history, state); /// save updated nn state and history
-
- to_queue.push(to_token);
-
- if (itr->second > best_score) /// update best score
- {
- best_score = itr->second;
- best_output_label = history;
- }
- }
- else
- {
- /// sentence ending reached
- Token<ElemType> to_token(itr->second, history, state);
- result_queue.push(to_token);
- }
- }
-
- history = from_token.sequence; /// back to the from token's history
- }
-
- from_queue.pop();
- }
-
- if (to_queue.size() == 0)
- break;
-
- // beam pruning
- const double threshold = best_score - beam;
- while (!to_queue.empty())
- {
- if (to_queue.top().score >= threshold)
- from_queue.push(to_queue.top());
- to_queue.pop();
- }
-
- best_score_so_far = best_score;
- }
-
- // write back best path
- size_t ibest = 0;
- while (result_queue.size() > 0)
- {
- best_path.clear();
- //vector<size_t> *p = &result_queue.top().sequence;
- assert(best_path.empty());
- best_path.swap(const_cast<vector<size_t>&>(result_queue.top().sequence));
- {
- double score = result_queue.top().score;
- best_score = score;
- fprintf(stderr, "best[%zd] score = %.4e\t", ibest, score);
- if (best_path.size() > 0)
- WriteNbest(ibest, best_path, outputNodes, dataWriter);
- }
-
- ibest++;
-
- result_queue.pop();
- break; /// only output the top one
- }
-
- now = clock();
- fprintf(stderr, "%.1f words per second\n", mbSize / ((double) (now - start) / 1000.0));
-
- return best_score;
- }
-
-protected:
- /// used for backward directional nodes
- std::list<ComputationNodeBasePtr> batchComputeNodes;
-};
-} } }
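
[editor's note] The two FindBestPath* methods deleted above implement the same token-passing beam search: a Token bundles (score, partial output sequence, saved hidden state), tokens live in a max-priority queue keyed on score, each time step expands every live token, and the expanded set is pruned against best_score - beam. A self-contained sketch of one such step, with plain STL stand-ins for the CNTK matrices and NN_state (all names here are illustrative, not CNTK API):

```cpp
#include <algorithm>
#include <limits>
#include <queue>
#include <utility>
#include <vector>

// Illustrative stand-in for the deleted Token<ElemType>: score plus history.
struct Tok
{
    double score;
    std::vector<size_t> seq;
    bool operator<(const Tok& t) const { return score < t.score; } // max-heap on score
};

// One time step of the deleted inner loop: expand every live token, track the
// best score seen, then keep only tokens whose score is within 'beam' of it.
// 'expand' is a hypothetical scorer mapping a token to (label, logProb) pairs.
template <class Expander>
void BeamStep(std::priority_queue<Tok>& from, double beam, Expander expand)
{
    std::priority_queue<Tok> to;
    double best = -std::numeric_limits<double>::infinity();
    while (!from.empty())
    {
        Tok tok = from.top();
        from.pop();
        for (const std::pair<size_t, double>& cand : expand(tok))
        {
            Tok next{tok.score + cand.second, tok.seq};
            next.seq.push_back(cand.first);
            best = std::max(best, next.score);
            to.push(std::move(next));
        }
    }
    while (!to.empty()) // beam pruning, as in the deleted code
    {
        if (to.top().score >= best - beam)
            from.push(to.top());
        to.pop();
    }
}
```

The deleted FindBestPathWithVariableLength() additionally diverts tokens that emit the end-of-sentence label into a separate result queue instead of reinserting them, which is how it supports output sequences of a different length than the input.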
diff --git a/Source/SGDLib/MultiNetworksSGD.h b/Source/SGDLib/MultiNetworksSGD.h
deleted file mode 100644
index 166f757573db..000000000000
--- a/Source/SGDLib/MultiNetworksSGD.h
+++ /dev/null
@@ -1,1269 +0,0 @@
-//
-// Copyright (c) Microsoft. All rights reserved.
-// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
-//
-// MultiNetworksEvaluator/SGD -- This represents earlier efforts to use CNTK for sequence-to-sequence modeling. This is no longer the intended design.
-//
-#pragma once
-
-// TODO: this cannot be instantiated as a whole (compile error), although some function is called from CNTK.cpp--should be fixed
-
-#include "Basics.h"
-#include "ComputationNetwork.h"
-#include "IComputationNetBuilder.h"
-#include "SimpleEvaluator.h"
-#include "MultiNetworksEvaluator.h"
-#include "DataReader.h"
-#include <vector>
-#include <string>
-#include <stdexcept>
-#include "fileutil.h"
-#include "Config.h"
-#include <chrono>
-#include <random>
-#include "TimerUtility.h"
-#include "SGD.h"
-
-using namespace std;
-
-namespace Microsoft { namespace MSR { namespace CNTK {
-
-extern std::wstring GetEncoderModelNameForEpoch(int epoch, bool b = false);
-extern std::wstring GetDecoderModelNameForEpoch(int epoch, bool b = false);
-
-template <class ElemType>
-class MultiNetworksSGD : SGD<ElemType>
-{
- ElemType m_default_activity;
-
- using SGDBase = SGD<ElemType>;
-
-public:
- // TODO: use a macro similar to class ComputeNode
- using SGDBase::m_modelPath;
- using SGDBase::m_maxEpochs;
- using SGDBase::m_doUnitTest;
- using SGDBase::m_learnRateAdjustInterval;
- using SGDBase::m_mbSize;
- using SGDBase::m_momentumParam;
- using SGDBase::m_learningRatesParam;
- using SGDBase::GetLearningRatePerSample;
- using SGDBase::GetMomentumPerSample;
- using SGDBase::m_dropoutRates;
- using SGDBase::m_autoLearnRateSearchType;
- using SGDBase::m_minLearnRate;
- using SGDBase::m_loadBestModel;
- //using SGDBase::m_validateAfterModelReloading;
- using SGDBase::m_continueReduce;
- using SGDBase::m_reduceLearnRateIfImproveLessThan;
- using SGDBase::m_epochSize;
- using SGDBase::m_learnRateDecreaseFactor;
- using SGDBase::m_increaseLearnRateIfImproveMoreThan;
- using SGDBase::m_learnRateIncreaseFactor;
- using SGDBase::m_keepCheckPointFiles;
- using SGDBase::m_doGradientCheck;
- using SGDBase::m_L2RegWeight;
- using SGDBase::m_L1RegWeight;
- using SGDBase::m_needAveMultiplier;
- using SGDBase::m_useNesterovMomentum;
- using SGDBase::m_traceLevel;
- using SGDBase::m_numMBsToShowResult;
- using SGDBase::m_gradientCheckSigDigit;
- using SGDBase::m_prevChosenMinibatchSize;
- using SGDBase::UpdateWeights;
- using SGDBase::GetCheckPointFileNameForEpoch;
- using SGDBase::GetTrainCriterionNodes;
- using SGDBase::GetEvalCriterionNodes;
-
- typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;
-
- /// for encoder and decoder nodes pairing
- wstring m_decoderModelPath;
- wstring m_backwardDecoderModelPath;
- wstring m_encoderModelPath;
-
- list<pair<wstring, wstring>> m_lst_pair_encoder_decode_node_names;
- list<pair<ComputationNodeBasePtr, ComputationNodeBasePtr>> m_lst_pair_encoder_decoder_nodes;
-
-public:
- MultiNetworksSGD(const ConfigParameters& configSGD)
- : SGDBase(configSGD)
- {
- }
-
- ~MultiNetworksSGD()
- {
- }
-
- void InitTrainEncoderDecoderWithHiddenStates(const ConfigParameters& readerConfig)
- {
-
- m_decoderModelPath = m_modelPath + L".decoder";
- m_backwardDecoderModelPath = m_modelPath + L".backward.decoder";
- m_encoderModelPath = m_modelPath + L".encoder";
-
- ConfigArray arrEncoderNodeNames = readerConfig(L"encoderNodes", "");
- vector<wstring> encoderNodeNames;
- m_lst_pair_encoder_decode_node_names.clear();
-
- if (arrEncoderNodeNames.size() > 0)
- {
- /// newer code that explicitly places multiple streams for inputs
- foreach_index (i, arrEncoderNodeNames) // inputNames should map to node names
- {
- wstring nodeName = arrEncoderNodeNames[i];
- encoderNodeNames.push_back(nodeName);
- }
- }
-
- ConfigArray arrDecoderNodeNames = readerConfig(L"decoderNodes", "");
- vector<wstring> decoderNodeNames;
- if (arrDecoderNodeNames.size() > 0)
- {
- /// newer code that explicitly places multiple streams for inputs
- foreach_index (i, arrDecoderNodeNames) // inputNames should map to node names
- {
- wstring nodeName = arrDecoderNodeNames[i];
- decoderNodeNames.push_back(nodeName);
- }
- }
-
- assert(encoderNodeNames.size() == decoderNodeNames.size());
-
- for (size_t i = 0; i < encoderNodeNames.size(); i++)
- {
- m_lst_pair_encoder_decode_node_names.push_back(make_pair(encoderNodeNames[i], decoderNodeNames[i]));
- fprintf(stderr, "paired %ls <-> %ls\n", encoderNodeNames[i].c_str(), decoderNodeNames[i].c_str());
- }
- }
-
- void EncoderDecoder(vector<IComputationNetBuilder<ElemType>*> netBuilder, DEVICEID_TYPE deviceId,
- vector<IDataReader<ElemType>*> trainSetDataReader,
- vector<IDataReader<ElemType>*> validationSetDataReader,
- const bool makeMode)
- {
- if (validationSetDataReader.size() == 0)
- InvalidArgument("validation set reader should not be null.");
-
- int startEpoch = DetermineEncoderDecoderStartEpoch(makeMode);
- if (startEpoch == m_maxEpochs)
- {
- fprintf(stderr, "No further training is necessary.\n");
- return;
- }
-
- size_t iNumNetworks = netBuilder.size();
- vector<ComputationNetworkPtr> nets;
- ComputationNetworkPtr eachNet;
- for (size_t k = 0; k < iNumNetworks; k++)
- {
- wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1, false, msra::strfun::wstrprintf(L".%d", k));
- fprintf(stderr, "network model FileName=%ls\n", modelFileName.c_str());
- if (startEpoch >= 0)
- fprintf(stderr, "Starting from checkpoint. Load Network From File %ls.\n", modelFileName.c_str());
- if (k == 0)
- {
- eachNet =
- startEpoch < 0 ? netBuilder[k]->BuildNetworkFromDescription() : ComputationNetwork::CreateFromFile(deviceId, modelFileName, FileOptions::fileOptionsBinary, true /*bAllowNoCriterionNode*/);
- nets.push_back(eachNet);
- }
- else
- {
- eachNet =
- startEpoch < 0 ? netBuilder[k]->BuildNetworkFromDescription(nets[k - 1].get()) : ComputationNetwork::CreateFromFile(deviceId, modelFileName, FileOptions::fileOptionsBinary, false /*bAllowNoCriterionNode*/, nets[k - 1].get());
- nets.push_back(eachNet);
- }
- }
-
- startEpoch = max(startEpoch, 0);
-
- if (m_doUnitTest)
- {
- if (nets[iNumNetworks - 1]->UnitTest() == false)
- LogicError("unit test on decoder network not passed");
-
- return;
- }
-
- fprintf(stderr, "start training ...\n");
- TrainEncoderDecoderModel(startEpoch, nets, trainSetDataReader, validationSetDataReader);
- }
-
- //return -1 if nothing exists
- int DetermineEncoderDecoderStartEpoch(const bool makeMode)
- {
- if (!makeMode)
- return -1; //always start from scratch
-
- int firstEpoch = -1;
-
- wstring curEpochFile = GetModelNameForEpoch(int(m_maxEpochs) - 1, false, L".0");
- for (int e = int(m_maxEpochs) - 1; e >= -1; e--)
- {
- const wstring prevEpochFile = GetModelNameForEpoch(e - 1, false, L".0");
-
- if (msra::files::fuptodate(curEpochFile, prevEpochFile, false))
- {
- firstEpoch = size_t(e) + 1;
- break;
- }
- else
- curEpochFile = prevEpochFile;
- }
-
- return firstEpoch;
- }
-
- wstring GetModelNameForEpoch(const int epoch, bool bLastModel = false, wstring ext = L"")
- {
- int epoch1Base = epoch + 1;
- if (epoch1Base == m_maxEpochs || bLastModel)
- return m_modelPath + ext;
- else
- return msra::strfun::wstrprintf(L"%s%s.%d", m_modelPath.c_str(), ext.c_str(), (int) epoch1Base);
- }
-
- void TrainEncoderDecoderModel(int startEpoch, ComputationNetworkPtr encoderNet,
- ComputationNetworkPtr decoderNet,
- IDataReader<ElemType>* encoderTrainSetDataReader,
- IDataReader<ElemType>* decoderTrainSetDataReader,
- IDataReader<ElemType>* encoderValidationSetDataReader,
- IDataReader<ElemType>* decoderValidationSetDataReader)
- {
- std::vector<ComputationNodeBasePtr>& encoderFeatureNodes = encoderNet->FeatureNodes();
- std::vector<ComputationNodeBasePtr>& encoderEvaluationNodes = encoderNet->OutputNodes();
-
- std::vector<ComputationNodeBasePtr>& decoderFeatureNodes = decoderNet->FeatureNodes();
- std::vector<ComputationNodeBasePtr>& decoderLabelNodes = decoderNet->LabelNodes();
- std::vector<ComputationNodeBasePtr>& decoderCriterionNodes = GetTrainCriterionNodes(*decoderNet);
- std::vector<ComputationNodeBasePtr>& decoderEvaluationNodes = GetEvalCriterionNodes(*decoderNet);
-
- std::map<std::wstring, Matrix<ElemType>*> encoderInputMatrices, decoderInputMatrices;
- for (size_t i = 0; i < encoderFeatureNodes.size(); i++)
- encoderInputMatrices[encoderFeatureNodes[i]->NodeName()] = &dynamic_pointer_cast<ComputationNode<ElemType>>(encoderFeatureNodes[i])->Value();
- for (size_t i = 0; i < decoderFeatureNodes.size(); i++)
- decoderInputMatrices[decoderFeatureNodes[i]->NodeName()] = &dynamic_pointer_cast<ComputationNode<ElemType>>(decoderFeatureNodes[i])->Value();
- for (size_t i = 0; i < decoderLabelNodes.size(); i++)
- decoderInputMatrices[decoderLabelNodes[i]->NodeName()] = &dynamic_pointer_cast<ComputationNode<ElemType>>(decoderLabelNodes[i])->Value();
-
- //initializing weights and gradient holder
- const std::list<ComputationNodeBasePtr>& encoderLearnableNodes = encoderNet->LearnableParameterNodes(encoderEvaluationNodes[0]); //only one criterion so far TODO: support multiple ones?
- const std::list<ComputationNodeBasePtr>& decoderLearnableNodes = decoderNet->LearnableParameterNodes(decoderCriterionNodes[0]);
- std::list<ComputationNodeBasePtr> learnableNodes;
- for (auto nodeIter = encoderLearnableNodes.begin(); nodeIter != encoderLearnableNodes.end(); nodeIter++)
- learnableNodes.push_back(*nodeIter);
- for (auto nodeIter = decoderLearnableNodes.begin(); nodeIter != decoderLearnableNodes.end(); nodeIter++)
- learnableNodes.push_back(*nodeIter);
-
- std::list<Matrix<ElemType>> smoothedGradients;
-#if 0 // No longer functional due to lack of GetNumCols().
- for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
- {
- ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
- smoothedGradients.push_back(Matrix<ElemType>(node->GetNumRows(), node->GetNumCols(), node->Value().GetDeviceId()));
- }
-#endif
-
- vector<double> epochCriterion;
- double avgCriterion, prevCriterion;
- for (size_t i = 0; i < 2; i++)
- epochCriterion.push_back(std::numeric_limits<double>::infinity());
- avgCriterion = prevCriterion = std::numeric_limits<double>::infinity();
-
- size_t epochsNotCountedInAvgCriterion = startEpoch % m_learnRateAdjustInterval;
-
- std::vector<double> epochEvalErrors(decoderEvaluationNodes.size(), std::numeric_limits<double>::infinity());
-
- std::vector<wstring> evalNodeNames;
- for (size_t i = 0; i < decoderEvaluationNodes.size(); i++)
- evalNodeNames.push_back(decoderEvaluationNodes[i]->NodeName());
-
- size_t totalSamplesSeen = 0;
- double learnRatePerSample = 0.5f / m_mbSize[startEpoch];
-
- int m_numPrevLearnRates = 5; //used to control the upper learning rate in LR search to reduce computation
- vector<double> prevLearnRates;
- prevLearnRates.resize(m_numPrevLearnRates);
- for (int i = 0; i < m_numPrevLearnRates; i++)
- prevLearnRates[i] = std::numeric_limits<double>::infinity();
-
- //precompute mean and invStdDev nodes and save initial model
- if ( /// to-do doesn't support pre-compute such as MVN here
- /// PreCompute(net, encoderTrainSetDataReader, encoderFeatureNodes, encoderlabelNodes, encoderInputMatrices) ||
- startEpoch == 0)
- {
- encoderNet->Save(GetEncoderModelNameForEpoch(int(startEpoch) - 1));
- decoderNet->Save(GetDecoderModelNameForEpoch(int(startEpoch) - 1));
- }
-
- bool learnRateInitialized = false;
- if (startEpoch > 0)
- learnRateInitialized = this->LoadCheckPointInfo(startEpoch - 1, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion, m_prevChosenMinibatchSize);
-
- if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && !learnRateInitialized && m_learningRatesParam.size() <= startEpoch)
- InvalidArgument("When using \"AdjustAfterEpoch\", there must either exist a checkpoint file, or an explicit learning rate must be specified in config for the starting epoch.");
-
- ULONG dropOutSeed = 1;
- double prevDropoutRate = 0;
-
- bool learnRateReduced = false;
-
- for (int i = int(startEpoch); i < int(m_maxEpochs); i++)
- {
- auto t_start_epoch = clock();
-
- //set dropout rate
- ComputationNetwork::SetDropoutRate(*encoderNet, encoderEvaluationNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
- ComputationNetwork::SetDropoutRate(*decoderNet, decoderCriterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
-
- //learning rate adjustment
- if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || (m_learningRatesParam.size() > 0 && m_learningRatesParam.size() > i))
- {
- learnRatePerSample = GetLearningRatePerSample(i /*BUGBUG workaround:*/, encoderTrainSetDataReader->GetNumParallelSequences());
- }
- else if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
- {
- NOT_IMPLEMENTED;
- }
-
- learnRateInitialized = true;
-
- if (learnRatePerSample < m_minLearnRate)
- {
- fprintf(stderr, "Learn Rate Per Sample for Epoch[%lu] = %.8g is less than minLearnRate %.8g. Training stops.\n", i + 1, learnRatePerSample, m_minLearnRate);
- break;
- }
-
- TrainOneEpochEncoderDecoderWithHiddenStates(encoderNet, decoderNet, i,
- m_epochSize, encoderTrainSetDataReader,
- decoderTrainSetDataReader, learnRatePerSample,
- encoderFeatureNodes, encoderEvaluationNodes, &encoderInputMatrices,
- decoderFeatureNodes, decoderLabelNodes, decoderCriterionNodes, decoderEvaluationNodes,
- &decoderInputMatrices, learnableNodes, smoothedGradients,
- epochCriterion, epochEvalErrors, totalSamplesSeen);
-
- auto t_end_epoch = clock();
- double epochTime = 1.0 * (t_end_epoch - t_start_epoch) / (CLOCKS_PER_SEC);
-
- // fprintf(stderr, "Finished Epoch[%lu]: [Training Set] Train Loss Per Sample = %.8g ", i + 1, epochCriterion);
- fprintf(stderr, "Finished Epoch[%lu]: [Training Set] Decoder Train Loss Per Sample = %.8g ", i + 1, epochCriterion[0]);
- if (epochEvalErrors.size() == 1)
- {
- fprintf(stderr, "EvalErr Per Sample = %.8g Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", epochEvalErrors[0], learnRatePerSample, epochTime);
- }
- else
- {
- fprintf(stderr, "EvalErr Per Sample ");
- for (size_t j = 0; j < epochEvalErrors.size(); j++)
- fprintf(stderr, "[%lu]=%.8g ", j, epochEvalErrors[j]);
- fprintf(stderr, "Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", learnRatePerSample, epochTime);
- fprintf(stderr, "Finished Epoch[%lu]: Criterion Node [%ls] Per Sample = %.8g\n", i + 1, decoderCriterionNodes[0]->NodeName().c_str(), epochCriterion[i + 1]);
- for (size_t j = 0; j < epochEvalErrors.size(); j++)
- fprintf(stderr, "Finished Epoch[%lu]: Evaluation Node [%ws] Per Sample = %.8g\n", i + 1, evalNodeNames[j].c_str(), epochEvalErrors[j]);
- }
-
- if (decoderValidationSetDataReader != decoderTrainSetDataReader && decoderValidationSetDataReader != nullptr &&
- encoderValidationSetDataReader != encoderTrainSetDataReader && encoderValidationSetDataReader != nullptr)
- {
- SimpleEvaluator<ElemType> evalforvalidation(*decoderNet);
- vector<wstring> cvEncoderSetTrainAndEvalNodes;
- cvEncoderSetTrainAndEvalNodes.push_back(encoderEvaluationNodes[0]->NodeName());
-
- vector<wstring> cvDecoderSetTrainAndEvalNodes;
- cvDecoderSetTrainAndEvalNodes.push_back(decoderCriterionNodes[0]->NodeName());
- cvDecoderSetTrainAndEvalNodes.push_back(decoderEvaluationNodes[0]->NodeName());
-
- vector<double> vScore = evalforvalidation.EvaluateEncoderDecoderWithHiddenStates(
- encoderNet, decoderNet,
- encoderValidationSetDataReader,
- decoderValidationSetDataReader, cvEncoderSetTrainAndEvalNodes,
- cvDecoderSetTrainAndEvalNodes, m_mbSize[i]);
- fprintf(stderr, "Finished Epoch[%lu]: [Validation Set] Train Loss Per Sample = %.8g EvalErr Per Sample = %.8g\n",
- i + 1, vScore[0], vScore[1]);
-
- epochCriterion[0] = vScore[0]; //the first one is the decoder training criterion.
- }
-
- bool loadedPrevModel = false;
- size_t epochsSinceLastLearnRateAdjust = i % m_learnRateAdjustInterval + 1;
- if (avgCriterion == std::numeric_limits<double>::infinity())
- avgCriterion = epochCriterion[0];
- else
- avgCriterion = ((epochsSinceLastLearnRateAdjust - 1 - epochsNotCountedInAvgCriterion) * avgCriterion + epochCriterion[0]) / (epochsSinceLastLearnRateAdjust - epochsNotCountedInAvgCriterion);
-
- if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && m_learningRatesParam.size() <= i && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval)
- {
- if (prevCriterion - avgCriterion < 0 && prevCriterion != std::numeric_limits<double>::infinity())
- {
- if (m_loadBestModel)
- {
- encoderNet->RereadPersistableParameters(GetEncoderModelNameForEpoch(i - 1));
- decoderNet->RereadPersistableParameters(GetDecoderModelNameForEpoch(i - 1));
-
- size_t dummyMinibatchSize = 0;
- this->LoadCheckPointInfo(i - 1,
- /*out*/ totalSamplesSeen,
- /*out*/ learnRatePerSample,
- smoothedGradients,
- /*out*/ prevCriterion,
- /*out*/ dummyMinibatchSize);
- fprintf(stderr, "Loaded the previous model which has better training criterion.\n");
- loadedPrevModel = true;
- }
- }
-
- if (m_continueReduce)
- {
- if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits<double>::infinity())
- {
- if (learnRateReduced == false)
- {
- learnRateReduced = true;
- }
- else
- {
- decoderNet->Save(GetDecoderModelNameForEpoch(i, true));
- encoderNet->Save(GetEncoderModelNameForEpoch(i, true));
- fprintf(stderr, "Finished training and saved final model\n\n");
- break;
- }
- }
- if (learnRateReduced)
- {
- learnRatePerSample *= m_learnRateDecreaseFactor;
- fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
- }
- }
- else
- {
- if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits<double>::infinity())
- {
-
- learnRatePerSample *= m_learnRateDecreaseFactor;
- fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
- }
- else if (prevCriterion - avgCriterion > m_increaseLearnRateIfImproveMoreThan * prevCriterion && prevCriterion != std::numeric_limits<double>::infinity())
- {
- learnRatePerSample *= m_learnRateIncreaseFactor;
- fprintf(stderr, "learnRatePerSample increased to %.8g\n", learnRatePerSample);
- }
- }
- }
-
- if (!loadedPrevModel && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval) //not loading previous values then set them
- {
- prevCriterion = avgCriterion;
- epochsNotCountedInAvgCriterion = 0;
- }
-
- //persist model and check-point info
- decoderNet->Save(GetDecoderModelNameForEpoch(i));
- encoderNet->Save(GetEncoderModelNameForEpoch(i));
-
- size_t dummyMinibatchSize = 0;
- this->LoadCheckPointInfo(i,
- /*out*/ totalSamplesSeen,
- /*out*/ learnRatePerSample,
- smoothedGradients,
- /*out*/ prevCriterion,
- /*out*/ dummyMinibatchSize);
-
- if (!m_keepCheckPointFiles)
- _wunlink(GetCheckPointFileNameForEpoch(i - 1).c_str()); //delete previous checkpoint file to save space
-
- if (learnRatePerSample < 1e-12)
- fprintf(stderr, "learnRate per sample is reduced to %.8g which is below 1e-12. stop training.\n", learnRatePerSample);
- }
- }
-
- void TrainEncoderDecoderModel(int startEpoch, vector<ComputationNetworkPtr> nets,
- vector<IDataReader<ElemType>*> trainDataReader,
- vector<IDataReader<ElemType>*> validationDataReader)
- {
- size_t iNumNetworks = nets.size();
- vector<std::vector<ComputationNodeBasePtr>*> featureNodes;
- vector<std::vector<ComputationNodeBasePtr>*> outputNodes;
- vector<std::vector<ComputationNodeBasePtr>*> pairNodes;
- vector<std::vector<ComputationNodeBasePtr>*> labelNodes;
- vector<std::vector<ComputationNodeBasePtr>*> criterionNodes;
- vector<std::vector<ComputationNodeBasePtr>*> evaluationNodes;
- vector<map<wstring, Matrix<ElemType>*>*> inputMatrices;
-
- for (size_t i = 0; i < iNumNetworks; i++)
- {
- auto* featPtr = &nets[i]->FeatureNodes();
- auto* lablPtr = &nets[i]->LabelNodes();
- featureNodes.push_back(featPtr);
- outputNodes.push_back(&nets[i]->OutputNodes());
- pairNodes.push_back(&nets[i]->PairNodes());
-
- labelNodes.push_back(lablPtr);
- criterionNodes.push_back(&GetTrainCriterionNodes(nets[i]));
- evaluationNodes.push_back(&GetEvalCriterionNodes(nets[i]));
-
- std::map<std::wstring, Matrix<ElemType>*>* matrices;
- matrices = new std::map<std::wstring, Matrix<ElemType>*>();
-
- for (size_t j = 0; j < featPtr->size(); j++)
- {
- (*matrices)[(*featPtr)[j]->NodeName()] =
- &(dynamic_pointer_cast<ComputationNode<ElemType>>((*featPtr)[j])->Value());
- }
-
- for (size_t j = 0; j < lablPtr->size(); j++)
- {
- (*matrices)[(*lablPtr)[j]->NodeName()] =
- &(dynamic_pointer_cast<ComputationNode<ElemType>>((*lablPtr)[j])->Value());
- }
- inputMatrices.push_back(matrices);
- }
-
- //initializing weights and gradient holder
- std::list<ComputationNodeBasePtr> learnableNodes;
- for (size_t i = 0; i < iNumNetworks; i++)
- {
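- // If a network exposes no training criterion, fall back to its evaluation nodes to locate the learnable parameters.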
- if (criterionNodes[i]->size() == 0)
- {
- for (auto ptr = evaluationNodes[i]->begin(); ptr != evaluationNodes[i]->end(); ptr++)
- {
- ComputationNodeBasePtr pptr = *ptr;
-
- const std::list<ComputationNodeBasePtr>& eachLearnableNodes = nets[i]->LearnableParameterNodes(pptr); //only one criterion so far TODO: support multiple ones?
- for (auto nodeIter = eachLearnableNodes.begin(); nodeIter != eachLearnableNodes.end(); nodeIter++)
- {
- ComputationNodeBasePtr node = *nodeIter;
- learnableNodes.push_back(node);
- }
- }
- }
- else
- {
- for (auto ptr = criterionNodes[i]->begin(); ptr != criterionNodes[i]->end(); ptr++)
- {
- ComputationNodeBasePtr pptr = *ptr;
-
- const std::list<ComputationNodeBasePtr>& eachLearnableNodes = nets[i]->LearnableParameterNodes(pptr); //only one criterion so far TODO: support multiple ones?
- for (auto nodeIter = eachLearnableNodes.begin(); nodeIter != eachLearnableNodes.end(); nodeIter++)
- {
- ComputationNodeBasePtr node = *nodeIter;
- learnableNodes.push_back(node);
- }
- }
- }
-
- //for (auto ptr = pairNodes[i]->begin(); ptr != pairNodes[i]->end(); ptr++)
- // nets[i]->BuildAndValidateSubNetwork(*ptr);
- }
-
- std::list<Matrix<ElemType>> smoothedGradients;
-#if 0 // No longer functional due to lack of GetNumCols().
- for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
- {
- ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
- smoothedGradients.push_back(Matrix<ElemType>(node->GetNumRows(), node->GetNumCols(), node->Value().GetDeviceId()));
- }
-#endif
-
- double epochCriterion, avgCriterion, prevCriterion;
- epochCriterion = std::numeric_limits<double>::infinity();
- avgCriterion = prevCriterion = std::numeric_limits<double>::infinity();
-
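- // When resuming mid-interval, epochs that ran before the restart are excluded from the running-average criterion.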
- size_t epochsNotCountedInAvgCriterion = startEpoch % m_learnRateAdjustInterval;
-
- size_t iNumEvaluations = 0;
- for (size_t i = 0; i < iNumNetworks; i++)
- {
- iNumEvaluations += evaluationNodes[i]->size();
- }
- std::vector<double> epochEvalErrors(iNumEvaluations, std::numeric_limits<double>::infinity());
-
- std::vector<std::wstring> evalNodeNames;
- for (size_t k = 0; k < iNumNetworks; k++)
- {
- for (auto ptr = evaluationNodes[k]->begin(); ptr != evaluationNodes[k]->end(); ptr++)
- evalNodeNames.push_back((*ptr)->NodeName());
- }
-
- size_t totalSamplesSeen = 0;
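- // Default initial learning rate: 0.5 per minibatch, expressed per sample; overwritten below by checkpoint or config values.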
- double learnRatePerSample = 0.5f / m_mbSize[startEpoch];
-
- int m_numPrevLearnRates = 5; //used to control the upper learning rate in LR search to reduce computation
- vector<double> prevLearnRates;
- prevLearnRates.resize(m_numPrevLearnRates);
- for (int i = 0; i < m_numPrevLearnRates; i++)
- prevLearnRates[i] = std::numeric_limits<double>::infinity();
-
- //precompute mean and invStdDev nodes and save initial model
- if ( /// TODO: pre-compute (e.g. MVN mean/invStdDev) is not supported here
- /// PreCompute(net, encoderTrainSetDataReader, encoderFeatureNodes, encoderlabelNodes, encoderInputMatrices) ||
- startEpoch == 0)
- {
- for (size_t k = 0; k < iNumNetworks; k++)
- {
- wstring tmpstr = msra::strfun::wstrprintf(L".%d", k);
- nets[k]->Save(GetModelNameForEpoch(int(startEpoch) - 1, false, tmpstr));
- }
- }
-
- bool learnRateInitialized = false;
- if (startEpoch > 0)
- {
- size_t dummyMinibatchSize = 0;
- this->LoadCheckPointInfo(startEpoch - 1,
- /*out*/ totalSamplesSeen,
- /*out*/ learnRatePerSample,
- smoothedGradients,
- /*out*/ prevCriterion,
- /*out*/ dummyMinibatchSize);
- }
-
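- // Without a checkpoint, AdjustAfterEpoch has no learning rate to start from unless one is configured for the starting epoch.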
- if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && !learnRateInitialized && m_learningRatesParam.size() <= startEpoch)
- InvalidArgument("When using \"AdjustAfterEpoch\", there must either exist a checkpoint file, or an explicit learning rate must be specified in config for the starting epoch.");
-
- ULONG dropOutSeed = 1;
- double prevDropoutRate = 0;
-
- bool learnRateReduced = false;
-
- for (int i = int(startEpoch); i < int(m_maxEpochs); i++)
- {
- auto t_start_epoch = clock();
-
- //set dropout rate
- for (size_t k = 0; k < iNumNetworks; k++)
- {
- if (evaluationNodes[k]->size() > 0)
- ComputationNetwork::SetDropoutRate(nets[k], (*evaluationNodes[k])[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
- if (criterionNodes[k]->size() > 0)
- ComputationNetwork::SetDropoutRate(nets[k], (*criterionNodes[k])[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
- }
-
- //learning rate adjustment
- if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || (m_learningRatesParam.size() > 0 && m_learningRatesParam.size() > i))
- {
- learnRatePerSample = GetLearningRatePerSample(i /*BUGBUG workaround:*/, trainDataReader[0]->GetNumParallelSequences());
- }
- else if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
- {
- NOT_IMPLEMENTED;
- }
-
- learnRateInitialized = true;
-
- if (learnRatePerSample < m_minLearnRate)
- {
- fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training stops.\n", i + 1, learnRatePerSample, m_minLearnRate);
- break;
- }
-
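- // Run one epoch of joint training across all networks, passing hidden activations from encoder to decoder.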
- TrainOneEpochEncoderDecoderWithHiddenStates(i, m_epochSize, nets,
- trainDataReader,
- featureNodes,
- pairNodes,
- evaluationNodes,
- inputMatrices,
- labelNodes,
- criterionNodes,
- learnableNodes,
- learnRatePerSample,
- smoothedGradients,
- epochCriterion, epochEvalErrors, totalSamplesSeen);
-
- auto t_end_epoch = clock();
- double epochTime = 1.0 * (t_end_epoch - t_start_epoch) / (CLOCKS_PER_SEC);
-
- // This is hacky: evaluation is only done on the decoder, i.e. the last network in the chain.
- size_t decoderIdx = iNumNetworks - 1;
- IDataReader<ElemType>* decoderValidationSetDataReader = validationDataReader[decoderIdx];
- IDataReader<ElemType>* decoderTrainSetDataReader = trainDataReader[decoderIdx];
- ComputationNetworkPtr decoderNet = nets[decoderIdx];
-
- fprintf(stderr, "Finished Epoch[%d]: [Training Set] Decoder Train Loss Per Sample = %.8g ", i + 1, epochCriterion);
- if (epochEvalErrors.size() == 1)
- {
- fprintf(stderr, "EvalErr Per Sample = %.8g Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", epochEvalErrors[0], learnRatePerSample, epochTime);
- }
- else
- {
- fprintf(stderr, "EvalErr Per Sample ");
- for (size_t j = 0; j < epochEvalErrors.size(); j++)
- fprintf(stderr, "[%lu]=%.8g ", j, epochEvalErrors[j]);
- fprintf(stderr, "Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", learnRatePerSample, epochTime);
- fprintf(stderr, "Finished Epoch[%d]: Criterion Node Per Sample = %.8g\n", i + 1, epochCriterion);
- for (size_t j = 0; j < epochEvalErrors.size(); j++)
- fprintf(stderr, "Finished Epoch[%d]: Evaluation Node [%ls] Per Sample = %.8g\n", i + 1, evalNodeNames[j].c_str(), epochEvalErrors[j]);
- }
-
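- // If a separate validation reader is provided, its loss replaces the training criterion for the learning-rate control below.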
- if (decoderValidationSetDataReader != decoderTrainSetDataReader && decoderValidationSetDataReader != nullptr)
- {
- MultiNetworksEvaluator<ElemType> evalforvalidation(decoderNet);
-
- double vScore = evalforvalidation.EvaluateEncoderDecoderWithHiddenStates(
- nets,
- validationDataReader,
- m_mbSize[i]);
-
- fprintf(stderr, "Finished Epoch[%d]: [Validation Set] Loss Per Sample = %.8g \n ", i + 1, vScore);
-
- epochCriterion = vScore;
- }
-
- bool loadedPrevModel = false;
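- // epochsSinceLastLearnRateAdjust cycles through 1..m_learnRateAdjustInterval; the criterion is averaged incrementally over that window.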
- size_t epochsSinceLastLearnRateAdjust = i % m_learnRateAdjustInterval + 1;
- if (avgCriterion == std::numeric_limits<double>::infinity())
- avgCriterion = epochCriterion;
- else
- avgCriterion = ((epochsSinceLastLearnRateAdjust - 1 - epochsNotCountedInAvgCriterion) * avgCriterion + epochCriterion) / (epochsSinceLastLearnRateAdjust - epochsNotCountedInAvgCriterion);
-
- if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && m_learningRatesParam.size() <= i && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval)
- {
- if (prevCriterion - avgCriterion < 0 && prevCriterion != std::numeric_limits<double>::infinity())
- {
- if (m_loadBestModel)
- {
- //reload the previous model and check-point info, which had the better training criterion
- for (size_t k = 0; k < iNumNetworks; k++)
- {
- nets[k]->RereadPersistableParameters(GetModelNameForEpoch(i - 1, false, msra::strfun::wstrprintf(L".%d", k)));
- nets[k]->ResetEvalTimeStamps();
- }
-
- size_t dummyMinibatchSize = 0;
- this->LoadCheckPointInfo(i - 1, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion, dummyMinibatchSize);
- fprintf(stderr, "Loaded the previous model which has better training criterion.\n");
- loadedPrevModel = true;
- }
- }
-
- if (m_continueReduce)
- {
- if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits<double>::infinity())
- {
- if (learnRateReduced == false)
- {
- learnRateReduced = true;
- }
- else
- {
- //persist model and check-point info
- for (size_t k = 0; k < iNumNetworks; k++)
- {
- nets[k]->Save(GetModelNameForEpoch(i, true, msra::strfun::wstrprintf(L".%d", k)));
- }
- fprintf(stderr, "Finished training and saved final model\n\n");
- break;
- }
- }
- if (learnRateReduced)
- {
- learnRatePerSample *= m_learnRateDecreaseFactor;
- fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
- }
- }
- else
- {
- if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits<double>::infinity())
- {
- learnRatePerSample *= m_learnRateDecreaseFactor;
- fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
- }
- else if (prevCriterion - avgCriterion > m_increaseLearnRateIfImproveMoreThan * prevCriterion && prevCriterion != std::numeric_limits<double>::infinity())
- {
- learnRatePerSample *= m_learnRateIncreaseFactor;
- fprintf(stderr, "learnRatePerSample increased to %.8g\n", learnRatePerSample);
- }
- }
- }
-
- if (!loadedPrevModel && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval) // if the previous model was not reloaded, update the baseline criterion
- {
- prevCriterion = avgCriterion;
- epochsNotCountedInAvgCriterion = 0;
- }
-
- //persist model and check-point info
- for (size_t k = 0; k < iNumNetworks; k++)
- {
- nets[k]->Save(GetModelNameForEpoch(i, false, msra::strfun::wstrprintf(L".%d", k)));
- }
-
- this->SaveCheckPointInfo(i, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion, 0);
- if (!m_keepCheckPointFiles)
- _wunlink(GetCheckPointFileNameForEpoch(i - 1).c_str()); //delete previous checkpoint file to save space
-
- if (learnRatePerSample < 1e-12)
- fprintf(stderr, "learnRate per sample is reduced to %.8g which is below 1e-12. stop training.\n", learnRatePerSample);
- }
-
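- // Free the per-network input-matrix maps allocated at the top of this function.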
- for (size_t i = 0; i < iNumNetworks; i++)
- {
- delete inputMatrices[i];
- }
- }
-
- /// Use hidden states between encoder and decoder to communicate between the networks.
- void TrainOneEpochEncoderDecoderWithHiddenStates(
- const int epochNumber,
- const size_t epochSize,
- vector<ComputationNetworkPtr> nets, /// all networks in the encoder/decoder chain
- vector<IDataReader<ElemType>*> dataReader,
- vector<std::vector<ComputationNodeBasePtr>*> featureNodes,
- vector<std::vector<ComputationNodeBasePtr>*> pairNodes,
- vector