Skip to content

Commit

Permalink
Addressed some pending CR feedback
Browse files Browse the repository at this point in the history
  • Loading branch information
amitaga committed Mar 23, 2016
1 parent 7c01437 commit a5a1723
Show file tree
Hide file tree
Showing 12 changed files with 41 additions and 57 deletions.
4 changes: 2 additions & 2 deletions Source/CNTK/CNTK.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
std::cerr << "CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : " << fullTotalMaxEpochs << endl;

// set up progress tracing for compute cluster management
if (progressTracing && ((mpi == nullptr) || mpi->IsMainNode()))
if (progressTracing && (!mpi || mpi->IsMainNode()))
{
ProgressTracing::SetTracingFlag();
ProgressTracing::TraceTotalNumberOfSteps(fullTotalMaxEpochs); // enable tracing, using this as the total number of epochs
Expand Down Expand Up @@ -304,7 +304,7 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
ndlScript.ClearGlobal(); // clear global macros between commands

// Synchronize all ranks before proceeding to next action/command
if (mpi != nullptr)
if (mpi)
mpi->WaitAll();
}
}
Expand Down
7 changes: 5 additions & 2 deletions Source/Common/Include/MPIWrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ static int operator||(int rc, const MpiFail &what)
RuntimeError("%s", what.c_str());
}

class MPIWrapper;
typedef std::shared_ptr<MPIWrapper> MPIWrapperPtr;

class MPIWrapper : public std::enable_shared_from_this<MPIWrapper>
{
int m_myRank;
Expand All @@ -59,7 +62,7 @@ class MPIWrapper : public std::enable_shared_from_this<MPIWrapper>
// MPI communicator that reflects the current subset selection
MPI_Comm m_currentComm;

static std::shared_ptr<MPIWrapper> s_mpi;
static MPIWrapperPtr s_mpi;

// MPI_Init() with delay-loading the msmpi.dll (possibly causing a failure if missing; we want to catch that)
int MPI_Init_DL()
Expand Down Expand Up @@ -223,7 +226,7 @@ class MPIWrapper : public std::enable_shared_from_this<MPIWrapper>

public:

static std::shared_ptr<MPIWrapper> GetInstance(bool create = false)
static MPIWrapperPtr GetInstance(bool create = false)
{
static bool initialized = false;
if (create)
Expand Down
18 changes: 18 additions & 0 deletions Source/Common/Include/ProgressTracing.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,5 +112,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {

printf("EVALERR: %.7f%%\n", err);
}

// Emits a PROGRESS line pinned at 0% so cluster management (e.g. Philly) does not
// time out during long-running non-training operations (PreCompute, CV, Eval, Write).
// Returns the updated iteration counter; the counter resets to 0 each time a line is printed.
static size_t TraceFakeProgress(size_t numIterationsBeforePrintingProgress, size_t numItersSinceLastPrintOfProgress)
{
    size_t iterCounter = numItersSinceLastPrintOfProgress;
    if (!GetTracingFlag())
        return iterCounter;

    ++iterCounter;
    if (iterCounter >= numIterationsBeforePrintingProgress)
    {
        // Actual progress is not computed here; a constant 0% is enough to keep the job alive.
        printf("PROGRESS: %.2f%%\n", 0.0f);
        iterCounter = 0;
    }

    return iterCounter;
}
};
} } }
Original file line number Diff line number Diff line change
Expand Up @@ -678,8 +678,7 @@ void ComputationNetwork::MarkValueNonSharableNodes()
std::list<ComputationNodeBasePtr> allPreComputeNodes;
for (const auto& node : nodes)
{
auto pcnode = dynamic_pointer_cast<IPreComputeNode>(node);
if (pcnode)
if (node->Is<IPreComputeNode>())
allPreComputeNodes.push_back(node);
}

Expand Down
2 changes: 1 addition & 1 deletion Source/SGDLib/DataReaderHelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool useParallelTrain,
StreamMinibatchInputs& inputMatrices,
size_t& actualMBSize,
const std::shared_ptr<MPIWrapper>& mpi)
const MPIWrapperPtr& mpi)
{
auto pMBLayout = net->GetMBLayoutPtr();
// Reading consists of a sequence of Reader API calls:
Expand Down
4 changes: 2 additions & 2 deletions Source/SGDLib/IDistGradAggregator.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ template <class ElemType>
class IDistGradAggregator
{
public:
IDistGradAggregator(const std::shared_ptr<MPIWrapper>& mpi)
IDistGradAggregator(const MPIWrapperPtr& mpi)
: m_mpi(mpi)
{
}
Expand Down Expand Up @@ -37,7 +37,7 @@ class IDistGradAggregator
}

protected:
std::shared_ptr<MPIWrapper> m_mpi;
MPIWrapperPtr m_mpi;
};

#define UsingIDistGradAggregatorMembers \
Expand Down
6 changes: 3 additions & 3 deletions Source/SGDLib/MASGD.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;
public:
IMASGD(const std::shared_ptr<MPIWrapper>& pMPI, size_t perfReportFreq)
IMASGD(const MPIWrapperPtr& pMPI, size_t perfReportFreq)
:m_MAworkerStatus(pMPI->NumNodesInUse(), MAWorkerStatus::NOTSTARTED),
m_numSyncPerformed(0),
m_numWorkers(pMPI->NumNodesInUse()),
Expand Down Expand Up @@ -286,7 +286,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t m_numWorkers;
size_t m_myRank;
MASGDPerfStats m_perfReporter;
std::shared_ptr<MPIWrapper> m_pMPI;
MPIWrapperPtr m_pMPI;
};


Expand All @@ -299,7 +299,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
using Base::DownCast;

public:
BasicModelAveragingSGD(const std::shared_ptr<MPIWrapper>& pMPI, size_t reportFreq)
BasicModelAveragingSGD(const MPIWrapperPtr& pMPI, size_t reportFreq)
:Base(pMPI, reportFreq)
{}

Expand Down
11 changes: 1 addition & 10 deletions Source/SGDLib/SGD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1323,16 +1323,7 @@ bool SGD<ElemType>::PreCompute(ComputationNetworkPtr net,

net->ForwardProp(nodes);

if (ProgressTracing::GetTracingFlag())
{
numItersSinceLastPrintOfProgress++;
if (numItersSinceLastPrintOfProgress >= numIterationsBeforePrintingProgress)
{
// TODO: For now just print 0.0 instead of calculating actual progress
printf("PROGRESS: %.2f%%\n", 0.0f);
numItersSinceLastPrintOfProgress = 0;
}
}
numItersSinceLastPrintOfProgress = ProgressTracing::TraceFakeProgress(numIterationsBeforePrintingProgress, numItersSinceLastPrintOfProgress);
}

// finalize
Expand Down
4 changes: 2 additions & 2 deletions Source/SGDLib/SGD.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ struct SGDParams : public ScriptableObjects::Object
bool m_useAllDataForPreComputedNode;

// Parallel training
std::shared_ptr<MPIWrapper> m_mpi;
MPIWrapperPtr m_mpi;

ParallelizationMethod m_parallelizationMethod;
bool m_enableDistributedMBReading;
Expand Down Expand Up @@ -315,7 +315,7 @@ class SGD : public SGDParams
{
}

void InitMPI(const std::shared_ptr<MPIWrapper>& mpi)
void InitMPI(const MPIWrapperPtr& mpi)
{
m_mpi = mpi;

Expand Down
2 changes: 1 addition & 1 deletion Source/SGDLib/SimpleDistGradAggregator.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class SimpleDistGradAggregator : public IDistGradAggregator<ElemType>
UsingIDistGradAggregatorMembers;

public:
SimpleDistGradAggregator(const std::shared_ptr<MPIWrapper>& mpi, bool useAsyncAggregation, int syncStatsTrace)
SimpleDistGradAggregator(const MPIWrapperPtr& mpi, bool useAsyncAggregation, int syncStatsTrace)
: IDistGradAggregator<ElemType>(mpi), m_useAsyncAggregation(useAsyncAggregation), m_currentEpochNumber(-1), m_bufferedGradHeader(nullptr), m_syncStatsTrace(syncStatsTrace), m_iterationCount(0)
{
}
Expand Down
15 changes: 3 additions & 12 deletions Source/SGDLib/SimpleEvaluator.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ template <class ElemType>
class SimpleEvaluator
{
public:
SimpleEvaluator(ComputationNetworkPtr net, const std::shared_ptr<MPIWrapper>& mpi, const size_t numMBsToShowResult = 100, const int traceLevel = 0, const size_t maxSamplesInRAM = SIZE_MAX,
SimpleEvaluator(ComputationNetworkPtr net, const MPIWrapperPtr& mpi, const size_t numMBsToShowResult = 100, const int traceLevel = 0, const size_t maxSamplesInRAM = SIZE_MAX,
const size_t numSubminiBatches = 1)
: m_net(net),
m_numMBsToShowResult(numMBsToShowResult),
Expand Down Expand Up @@ -210,16 +210,7 @@ class SimpleEvaluator
}


if (ProgressTracing::GetTracingFlag())
{
numItersSinceLastPrintOfProgress++;
if (numItersSinceLastPrintOfProgress >= numIterationsBeforePrintingProgress)
{
// TODO: For now just print 0.0 instead of calculating actual progress
printf("PROGRESS: %.2f%%\n", 0.0f);
numItersSinceLastPrintOfProgress = 0;
}
}
numItersSinceLastPrintOfProgress = ProgressTracing::TraceFakeProgress(numIterationsBeforePrintingProgress, numItersSinceLastPrintOfProgress);

// call DataEnd to check if end of sentence is reached
// datareader will do its necessary/specific process for sentence ending
Expand Down Expand Up @@ -289,7 +280,7 @@ class SimpleEvaluator
size_t m_numMBsToShowResult;
size_t m_maxSamplesInRAM;
size_t m_numSubminiBatches;
std::shared_ptr<MPIWrapper> m_mpi;
MPIWrapperPtr m_mpi;

shared_ptr<IDistGradAggregator<ElemType>> m_distGradAgg;
struct DistGradHeader* m_gradHeader;
Expand Down
22 changes: 2 additions & 20 deletions Source/SGDLib/SimpleOutputWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,16 +131,7 @@ class SimpleOutputWriter

totalEpochSamples += actualMBSize;

if (ProgressTracing::GetTracingFlag())
{
numItersSinceLastPrintOfProgress++;
if (numItersSinceLastPrintOfProgress >= numIterationsBeforePrintingProgress)
{
// TODO: For now just print 0.0 instead of calculating actual progress
printf("PROGRESS: %.2f%%\n", 0.0f);
numItersSinceLastPrintOfProgress = 0;
}
}
numItersSinceLastPrintOfProgress = ProgressTracing::TraceFakeProgress(numIterationsBeforePrintingProgress, numItersSinceLastPrintOfProgress);

// call DataEnd function in dataReader to do
// reader specific process if sentence ending is reached
Expand Down Expand Up @@ -292,16 +283,7 @@ class SimpleOutputWriter

fprintf(stderr, "Minibatch[%lu]: ActualMBSize = %lu\n", ++numMBsRun, actualMBSize);

if (ProgressTracing::GetTracingFlag())
{
numItersSinceLastPrintOfProgress++;
if (numItersSinceLastPrintOfProgress >= numIterationsBeforePrintingProgress)
{
// TODO: For now just print 0.0 instead of calculating actual progress
printf("PROGRESS: %.2f%%\n", 0.0f);
numItersSinceLastPrintOfProgress = 0;
}
}
numItersSinceLastPrintOfProgress = ProgressTracing::TraceFakeProgress(numIterationsBeforePrintingProgress, numItersSinceLastPrintOfProgress);

// call DataEnd function in dataReader to do
// reader specific process if sentence ending is reached
Expand Down

0 comments on commit a5a1723

Please sign in to comment.