Skip to content

Commit

Permalink
Fixed a memshare bug that was incorrectly reallocating network matric…
Browse files Browse the repository at this point in the history
…es during cross-validation even though the allocation had already been established before training commenced
  • Loading branch information
amitaga committed Mar 18, 2016
1 parent ead48be commit 9d750a8
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 1 deletion.
3 changes: 3 additions & 0 deletions Source/ComputationNetworkLib/ComputationNetwork.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
// Default constructor: the network starts with a random-seed offset of 0,
// marked as not compiled and with no matrices allocated, and owns a newly
// created MBLayout.
ComputationNetwork()
: m_randomSeedOffset(0),
m_isCompiled(false),
m_areMatricesAllocated(false), // set to true at the end of AllocateAllMatrices
m_pMBLayout(make_shared<MBLayout>())
{
}
Expand Down Expand Up @@ -169,6 +170,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
void CollectInputAndLearnableParameters(const ComputationNodeBasePtr& rootNode);
void CollectInputAndLearnableParametersRec(const ComputationNodeBasePtr& node, set<ComputationNodeBasePtr>& visited, list<ComputationNodeBasePtr>& inputs, list<ComputationNodeBasePtr>& learnableParameters);
// Returns m_isCompiled, the flag recording that CompileNetwork has been called.
bool IsCompiled() const { return m_isCompiled; }
// Returns m_areMatricesAllocated, the flag recording that AllocateAllMatrices
// has been called; AllocateAllMatrices checks it to return early on repeat calls.
bool AreMatricesAllocated() const { return m_areMatricesAllocated; }
void VerifyIsCompiled(const char* where) const;
public:
void AllocateAllMatrices(const std::vector<ComputationNodeBasePtr>& evalRootNodes, const std::vector<ComputationNodeBasePtr>& outValueRootNodes, ComputationNodeBasePtr trainRootNode);
Expand Down Expand Up @@ -884,6 +886,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb

// cache for evaluation ordering:
bool m_isCompiled; // CompileNetwork has been called
bool m_areMatricesAllocated; // AllocateAllMatrices has been called

// cached network iterations
std::map<const ComputationNodeBasePtr, std::list<ComputationNodeBasePtr>> m_evalOrders; // [out node] flat depth-first traversal starting from out node
Expand Down
5 changes: 5 additions & 0 deletions Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -701,6 +701,9 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
const std::vector<ComputationNodeBasePtr>& outValueRootNodes,
ComputationNodeBasePtr trainRootNode)
{
if (AreMatricesAllocated())
return;

// Allocate memory for forward/backward computation
fprintf(stderr, "\n\nAllocating matrices for forward and/or backward propagation.\n");

Expand Down Expand Up @@ -833,6 +836,8 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
}
}
}

m_areMatricesAllocated = true;
}

void ComputationNetwork::ReleaseMatricesAfterEvalForChildren(ComputationNodeBasePtr n, std::unordered_map<ComputationNodeBasePtr, int>& parentCount)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Instances=2
NumCPUThreads=$(threadsPerInstance $Instances)

# cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
cntkmpirun "-n $Instances" cntkcv.cntk "numCPUThreads=$NumCPUThreads"
cntkmpirun "-n $Instances" cntkcv.cntk "numCPUThreads=$NumCPUThreads shareNodeValueMatrices=true"
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank1
Expand Down

0 comments on commit 9d750a8

Please sign in to comment.