Skip to content

Commit

Permalink
Fixed progress reporting
Browse files Browse the repository at this point in the history
  • Loading branch information
chrisbasoglu committed Dec 3, 2015
1 parent 49b13b2 commit 7008d74
Show file tree
Hide file tree
Showing 3 changed files with 170 additions and 99 deletions.
194 changes: 106 additions & 88 deletions Common/Include/ProgressTracing.h
Original file line number Diff line number Diff line change
@@ -1,88 +1,106 @@
//
// <copyright file="ProgressTracing.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#pragma once

#include "TimerUtility.h"

namespace Microsoft { namespace MSR { namespace CNTK {

// ---------------------------------------------------------------------------
// ProgressTracing -- static helper class for logging a progress indicator
//
// This is for use by the cluster management tools for indicating global progress to the user.
//
// This logs to stdout (not stderr) in a specific format, e.g. understood by the Philly cluster. The format is:
// PROGRESS: xx.xx%
// EVALERR: xx.xx%
//
// Specifically, this class handles a two-level progress computation:
// - outer level: loop over multiple training phases, each running multiple steps (epochs)
// - inner level in one training phase: loop over multiple steps, *without* knowledge about the other training phases
//
// In order for the inner level to log correctly in the global context, the outer loop
// must inform this class about the total number of steps and the current offset to apply in the inner level.
// ---------------------------------------------------------------------------

/*static*/ class ProgressTracing
{
    bool m_enabled;               // true once TraceTotalNumberOfSteps() was called with a positive count
    size_t m_totalNumberOfSteps;  // total number of epochs in entire training run
    size_t m_currentStepOffset;   // global index of the first epoch of the current training phase
    Timer m_progressTracingTimer; // started when tracing is enabled, restarted after each PROGRESS line

    ProgressTracing()
        : m_enabled(false), m_totalNumberOfSteps(0), m_currentStepOffset(0)
    {
    }

    // wrap static state in an accessor, so we won't need a CPP file
    static ProgressTracing & GetStaticInstance()
    {
        static ProgressTracing us;
        return us;
    }

public:
    // call TraceTotalNumberOfSteps() to set the total number of steps
    // Calling this with totalNumberOfSteps>0 will enable progress tracing;
    // calling it with 0 disables tracing and leaves the stored total untouched.
    static void TraceTotalNumberOfSteps(size_t totalNumberOfSteps)
    {
        auto & us = GetStaticInstance();
        us.m_enabled = totalNumberOfSteps > 0;
        if (us.m_enabled)
        {
            us.m_totalNumberOfSteps = totalNumberOfSteps;
            us.m_progressTracingTimer.Start();
        }
    }

    // call SetStepOffset() at start of a multi-epoch training to set the index of the first epoch in that training
    // This value is added to the local epoch index in TraceProgressPercentage().
    static void SetStepOffset(size_t currentStepOffset)
    {
        GetStaticInstance().m_currentStepOffset = currentStepOffset;
    }

    // emit the trace message for global progress
    // 'currentStep' is the epoch index local to the current training phase; it is offset by m_currentStepOffset.
    // 'progressWithinStep' is the fraction (0..1) of the current step that has completed.
    // This is meant to print only if enough time (nominally 10s) has elapsed since the last print;
    // the threshold is currently 0 seconds, so it prints whenever the timer reports any elapsed time.
    // The return value is 'true' if it did print. Returns 'false' without printing if tracing is disabled.
    static bool TraceProgressPercentage(size_t currentStep, double progressWithinStep/*0..1*/)
    {
        auto & us = GetStaticInstance();
        if (!us.m_enabled)
            return false;

        const size_t globalStep = currentStep + us.m_currentStepOffset;

        // in case we are not able to estimate, we will increase as needed
        // BUGBUG: This is a workaround because in BrainScript we cannot estimate the total number of epochs without actually running the actions.
        // The check must use the global step (local step + offset): that is the numerator of the
        // progress ratio below, so checking only the local step would allow values above 100%.
        if (globalStep + 1 > us.m_totalNumberOfSteps)
            us.m_totalNumberOfSteps = globalStep + 1;

        // compute global progress
        bool needToPrint = us.m_progressTracingTimer.ElapsedSeconds() > 0;// 10;
        if (needToPrint)
        {
            const double globalStepPartial = static_cast<double>(globalStep) + progressWithinStep;
            const double progress = globalStepPartial / static_cast<double>(us.m_totalNumberOfSteps);
            printf("PROGRESS: %.2f%%\n", 100.0 * progress);
            us.m_progressTracingTimer.Restart();
        }
        return needToPrint;
    }

    // emit a trace message for the error objective
    // 'err' is a fraction; the value is printed in percent (err * 100).
    // Does nothing if tracing is disabled.
    static void TraceObjectivePercentage(double err)
    {
        auto & us = GetStaticInstance();
        if (!us.m_enabled)
            return;
        printf("EVALERR: %.2f%%\n", 100.0 * err);
    }
};

}}}
//
// <copyright file="ProgressTracing.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
#pragma once

#include "TimerUtility.h"

namespace Microsoft { namespace MSR { namespace CNTK {

// ---------------------------------------------------------------------------
// ProgressTracing -- static helper class for logging a progress indicator
//
// This is for use by the cluster management tools for indicating global progress to the user.
//
// This logs to stdout (not stderr) in a specific format, e.g. understood by the Philly cluster. The format is:
// PROGRESS: xx.xx%
// EVALERR: xx.xx%
//
// Specifically, this class handles a two-level progress computation:
// - outer level: loop over multiple training phases, each running multiple steps (epochs)
// - inner level in one training phase: loop over multiple steps, *without* knowledge about the other training phases
//
// In order for the inner level to log correctly in the global context, the outer loop
// must inform this class about the total number of steps and the current offset to apply in the inner level.
// ---------------------------------------------------------------------------

/*static*/ class ProgressTracing
{
    bool m_enabled;               // true once TraceTotalNumberOfSteps() was called with a positive count
    size_t m_totalNumberOfSteps;  // total number of epochs in entire training run
    size_t m_currentStepOffset;   // global index of the first epoch of the current training phase
    Timer m_progressTracingTimer; // started when tracing is enabled, restarted after each PROGRESS line

    ProgressTracing()
        : m_enabled(false), m_totalNumberOfSteps(0), m_currentStepOffset(0)
    {
    }

    // wrap static state in an accessor, so we won't need a CPP file
    static ProgressTracing & GetStaticInstance()
    {
        static ProgressTracing us;
        return us;
    }

public:
    // call TraceTotalNumberOfSteps() to set the total number of steps
    // Calling this with totalNumberOfSteps>0 will enable progress tracing;
    // calling it with 0 disables tracing and leaves the stored total untouched.
    static void TraceTotalNumberOfSteps(size_t totalNumberOfSteps)
    {
        auto & us = GetStaticInstance();
        us.m_enabled = totalNumberOfSteps > 0;
        if (us.m_enabled)
        {
            us.m_totalNumberOfSteps = totalNumberOfSteps;
            us.m_progressTracingTimer.Start();
        }
    }

    // call SetStepOffset() at start of a multi-epoch training to set the index of the first epoch in that training
    // This value is added to the local epoch index in TraceProgressPercentage().
    static void SetStepOffset(size_t currentStepOffset)
    {
        GetStaticInstance().m_currentStepOffset = currentStepOffset;
    }

    // emit the trace message for global progress
    // 'epochNumber' is the epoch index local to the current training phase; it is offset by m_currentStepOffset.
    // 'mbProg' is the progress within the current epoch.
    // NOTE(review): the arithmetic below scales mbProg by 100, i.e. it treats it as a fraction (0..1),
    // although this parameter was previously annotated as 0..100 -- confirm against the callers.
    // This is meant to print only if enough time (nominally 10s) has elapsed since the last print;
    // the threshold is currently 0 seconds, so it prints whenever the timer reports any elapsed time.
    // The return value is 'true' if it did print. Returns 'false' without printing if tracing is disabled.
    static bool TraceProgressPercentage(size_t epochNumber, double mbProg/*0..1*/)
    {
        auto & us = GetStaticInstance();
        if (!us.m_enabled)
        {
            return false;
        }

        // compute global progress
        bool needToPrint = us.m_progressTracingTimer.ElapsedSeconds() > 0;// 10;
        if (needToPrint)
        {
            // All arithmetic is done in double: single-precision casts would silently round
            // step counts that do not fit a float mantissa.
            // m_totalNumberOfSteps is > 0 here because tracing is only enabled with a positive total.
            const double totalSteps = static_cast<double>(us.m_totalNumberOfSteps);
            const double epochPercent = 100.0 * static_cast<double>(us.m_currentStepOffset + epochNumber) / totalSteps;
            const double withinEpochPercent = (mbProg * 100.0) / totalSteps;
            printf("PROGRESS: %.2f%%\n", epochPercent + withinEpochPercent);
            us.m_progressTracingTimer.Restart();
        }
        return needToPrint;
    }

    // emit a trace message for the error objective
    // 'err' is a fraction; the value is printed in percent (err * 100).
    // Does nothing if tracing is disabled.
    static void TraceObjectivePercentage(double err)
    {
        auto & us = GetStaticInstance();

        if (!us.m_enabled)
        {
            return;
        }

        printf("EVALERR: %.2f%%\n", 100.0 * err);
    }
};

}}}
4 changes: 4 additions & 0 deletions MachineLearning/CNTK/CNTK.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1512,7 +1512,9 @@ void DoCommands(const ConfigParameters& config)

// set up progress tracing for compute cluster management
if (progressTracing && ((g_mpi == nullptr) || g_mpi->IsMainNode()))
{
ProgressTracing::TraceTotalNumberOfSteps(fullTotalMaxEpochs); // enable tracing, using this as the total number of epochs
}

size_t fullEpochsOffset = 0;

Expand All @@ -1524,7 +1526,9 @@ void DoCommands(const ConfigParameters& config)
ConfigArray action = commandParams("action", "train");

if (progressTracing && ((g_mpi == nullptr) || g_mpi->IsMainNode()))
{
ProgressTracing::SetStepOffset(fullEpochsOffset); // this is the epoch number that SGD will log relative to
}

// determine the action to perform, and do it
for (int j = 0; j < action.size(); j++)
Expand Down
Loading

0 comments on commit 7008d74

Please sign in to comment.