Refactor to use conc_stack.

coderye · Oct 14, 2015 · a5acf9d · a5acf9d
1 parent 98324b8
commit a5acf9d
Show file tree

Hide file tree

Showing 8 changed files with 91 additions and 26 deletions.
diff --git a/Common/Include/basetypes.h b/Common/Include/basetypes.h
@@ -84,10 +84,12 @@ OACR_WARNING_DISABLE(POTENTIAL_ARGUMENT_TYPE_MISMATCH, "Not level1 or level2_sec
 #include <locale>       // std::wstring_convert
 #include <string>
 #include <algorithm>    // for transform()
-#include <mutex>
 #include <unordered_map>
 #include <chrono>
 #include <thread>
+#include <stack>
+#include <mutex>
+#include <memory>
 #ifdef _MSC_VER
 #include <codecvt>      // std::codecvt_utf8
 #endif
@@ -1004,4 +1006,47 @@ static inline std::wstring FormatWin32Error(DWORD error)
 	return res;
 }
 #endif // _WIN32
+
+// Very simple thread-safe stack, intended as a pool of reusable objects
+// (e.g. RNG engines or scratch buffers). Add other functions as needed.
+//
+// All access to the underlying std::stack is serialized by a single mutex.
+// The container is neither copyable nor movable.
+// NOTE(review): pop_or_create uses std::function, which needs <functional>;
+// confirm this header already includes it.
+template<typename T>
+class conc_stack
+{
+public:
+    typedef typename std::stack<T>::value_type value_type;
+
+    conc_stack() = default;
+
+    conc_stack(const conc_stack&) = delete;
+    conc_stack& operator=(const conc_stack&) = delete;
+    conc_stack(conc_stack&&) = delete;
+    conc_stack& operator=(conc_stack&&) = delete;
+
+    // Removes and returns the top element. If the stack is empty, returns
+    // the result of factory() instead; the created value is NOT stored in
+    // the stack, the caller is expected to push() it back when done.
+    // Whatever factory() throws (including std::bad_function_call for an
+    // empty std::function) propagates to the caller.
+    value_type pop_or_create(std::function<value_type()> factory)
+    {
+        {
+            std::lock_guard<std::mutex> g(m_locker);
+            if (!m_stack.empty())
+            {
+                value_type res = std::move(m_stack.top());
+                m_stack.pop();
+                return res;
+            }
+        }
+        // The factory runs with the lock released: object creation may be
+        // expensive and must not block other threads, and a factory that
+        // touches this stack must not re-lock the non-recursive mutex.
+        return factory();
+    }
+
+    // Pushes a copy of item onto the stack.
+    void push(const value_type& item)
+    {
+        std::lock_guard<std::mutex> g(m_locker);
+        m_stack.push(item);
+    }
+
+    // Pushes item onto the stack, moving from it.
+    void push(value_type&& item)
+    {
+        std::lock_guard<std::mutex> g(m_locker);
+        // item is a plain rvalue reference (not a forwarding reference),
+        // so std::move is the correct cast here.
+        m_stack.push(std::move(item));
+    }
+
+private:
+    std::stack<value_type> m_stack;
+    std::mutex m_locker;
+};
+
 #endif    // _BASETYPES_
diff --git a/DataReader/ImageReader/ImageReader.cpp b/DataReader/ImageReader/ImageReader.cpp
@@ -37,7 +37,7 @@ class ITransform
 class CropTransform : public ITransform
 {
 public:
-    CropTransform(unsigned int seed) : m_rng(seed)
+    CropTransform(unsigned int seed) : m_seed(seed)
     {
     }
 
@@ -70,22 +70,27 @@ class CropTransform : public ITransform
 
     void Apply(cv::Mat& mat)
     {
+        auto seed = m_seed;
+        auto rng = m_rngs.pop_or_create([seed]() { return std::make_unique<std::mt19937>(seed); });
+
         double ratio = 1;
         switch (m_jitterType)
         {
         case RatioJitterType::None:
             ratio = m_cropRatioMin;
             break;
         case RatioJitterType::UniRatio:
-            ratio = UniRealT(m_cropRatioMin, m_cropRatioMax)(m_rng);
+            ratio = UniRealT(m_cropRatioMin, m_cropRatioMax)(*rng);
             assert(m_cropRatioMin <= ratio && ratio < m_cropRatioMax);
             break;
         default:
             RuntimeError("Jitter type currently not implemented.");
         }
-        mat = mat(GetCropRect(m_cropType, mat.rows, mat.cols, ratio));
-        if (m_hFlip && std::bernoulli_distribution()(m_rng))
+        mat = mat(GetCropRect(m_cropType, mat.rows, mat.cols, ratio, *rng));
+        if (m_hFlip && std::bernoulli_distribution()(*rng))
             cv::flip(mat, mat, 1);
+
+        m_rngs.push(std::move(rng));
     }
 
 private:
@@ -130,7 +135,7 @@ class CropTransform : public ITransform
         RuntimeError("Invalid jitter type: %s.", src.c_str());
     }
 
-    cv::Rect GetCropRect(CropType type, int crow, int ccol, double cropRatio)
+    cv::Rect GetCropRect(CropType type, int crow, int ccol, double cropRatio, std::mt19937& rng)
     {
         assert(crow > 0);
         assert(ccol > 0);
@@ -146,8 +151,8 @@ class CropTransform : public ITransform
             yOff = (crow - cropSize) / 2;
             break;
         case CropType::Random:
-            xOff = UniIntT(0, ccol - cropSize)(m_rng);
-            yOff = UniIntT(0, crow - cropSize)(m_rng);
+            xOff = UniIntT(0, ccol - cropSize)(rng);
+            yOff = UniIntT(0, crow - cropSize)(rng);
             break;
         default:
             assert(false);
@@ -159,8 +164,8 @@ class CropTransform : public ITransform
     }
 
 private:
-    // REVIEW alexeyk: currently not thread safe. Engines are expensive to create.
-    std::mt19937 m_rng;
+    unsigned int m_seed;
+    conc_stack<std::unique_ptr<std::mt19937>> m_rngs;
 
     CropType m_cropType;
     double m_cropRatioMin;
@@ -172,7 +177,7 @@ class CropTransform : public ITransform
 class ScaleTransform : public ITransform
 {
 public:
-    ScaleTransform(int dataType, unsigned int seed) : m_dataType(dataType), m_rng(seed)
+    ScaleTransform(int dataType, unsigned int seed) : m_dataType(dataType), m_seed(seed)
     {
         assert(m_dataType == CV_32F || m_dataType == CV_64F);
 
@@ -211,15 +216,21 @@ class ScaleTransform : public ITransform
         if (mat.type() != CV_MAKETYPE(m_dataType, m_imgChannels))
             mat.convertTo(mat, m_dataType);
 
+        auto seed = m_seed;
+        auto rng = m_rngs.pop_or_create([seed]() { return std::make_unique<std::mt19937>(seed); });
+
         assert(m_interp.size() > 0);
         cv::resize(mat, mat, cv::Size(static_cast<int>(m_imgWidth), static_cast<int>(m_imgHeight)), 0, 0, 
-            m_interp[UniIntT(0, static_cast<int>(m_interp.size()) - 1)(m_rng)]);
+            m_interp[UniIntT(0, static_cast<int>(m_interp.size()) - 1)(*rng)]);
+
+        m_rngs.push(std::move(rng));
     }
 
 private:
     using UniIntT = std::uniform_int_distribution<int>;
-    // REVIEW alexeyk: currently not thread safe. Engines are expensive to create.
-    std::mt19937 m_rng;
+
+    unsigned int m_seed;
+    conc_stack<std::unique_ptr<std::mt19937>> m_rngs;
 
     int m_dataType;
 

diff --git a/MachineLearning/CNTKComputationNetworkLib/EvaluationCriterionNodes.h b/MachineLearning/CNTKComputationNetworkLib/EvaluationCriterionNodes.h
@@ -42,7 +42,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
         virtual void /*ComputationNodeNonLooping::*/EvaluateThisNodeNonLooping() override
         {
-            EvaluateThisNodeS(m_functionValues, Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), m_maxIndexes0, m_maxIndexes1, m_maxValues, m_topK, m_workspace, shared_from_this());
+            EvaluateThisNodeS(m_functionValues, Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), m_maxIndexes0, m_maxIndexes1, m_maxValues, m_topK, shared_from_this());
         }
 
         void EvaluateThisNodeS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues0, const Matrix<ElemType>& inputFunctionValues1, Matrix<ElemType>& maxIndexes0, Matrix<ElemType>& maxIndexes1, Matrix<ElemType>& maxValues, ComputationNodePtr curNode)

diff --git a/Math/CNTKMathTest/MatrixUnitTests.cpp b/Math/CNTKMathTest/MatrixUnitTests.cpp
@@ -845,10 +845,9 @@ namespace CNTKMathTest
                 Matrix<float> actual(3, 2, src, matrixFlagNormal, deviceId);
                 Matrix<float> actualIdx(deviceId);
                 Matrix<float> actualVal(deviceId);
-                Matrix<float> temp(deviceId);
 
                 int topK = 2;
-                actual.VectorMax(actualIdx, actualVal, true, topK, temp);
+                actual.VectorMax(actualIdx, actualVal, true, topK);
                 Assert::IsTrue(actualIdx.IsEqualTo(expIdx));
                 Assert::IsTrue(actualVal.IsEqualTo(expVal));
             }

diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu
@@ -485,6 +485,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     GPUMatrix<ElemType>::~GPUMatrix(void)
     {
         Clear();
+        if (m_workspace != nullptr)
+            delete m_workspace;
     }
 
     template<class ElemType>
@@ -2950,7 +2952,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     }
 
     template<class ElemType>
-    void GPUMatrix<ElemType>::VectorMax(GPUMatrix<ElemType>& maxIndexes, GPUMatrix<ElemType>& maxValues, const bool isColWise, int topK, GPUMatrix<ElemType>& workspace) const
+    void GPUMatrix<ElemType>::VectorMax(GPUMatrix<ElemType>& maxIndexes, GPUMatrix<ElemType>& maxValues, const bool isColWise, int topK) const
     {
         if (IsEmpty())
             throw std::logic_error("VectorMax: Matrix is empty.");
@@ -3005,9 +3007,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         cbtemp = ctemp * sizeof(ElemType);
         // ElemType count needed to store indices, accounting for natural alignment for uint64_t type.
         size_t cidx = ((celt + 1) * sizeof(uint64_t) - 1 + sizeof(ElemType) - 1) / sizeof(ElemType);
+        // Prepare temp workspace.
+        auto deviceId = m_computeDevice;
+        assert(m_workspace != nullptr);
+        auto workspace = m_workspace->pop_or_create([deviceId]() { return std::make_unique<GPUMatrix<ElemType>>(deviceId); });
         // Resize to store: output values for the 1st and 2nd passes, input indices, output indices, and temp storage.
-        workspace.Resize(m, 2 * n + (2 * cidx + ctemp + m - 1) / m);
-        outVal1 = workspace.m_pArray;
+        workspace->Resize(m, 2 * n + (2 * cidx + ctemp + m - 1) / m);
+        outVal1 = workspace->m_pArray;
         outVal2 = outVal1 + celt;
         inIdx = reinterpret_cast<uint64_t*>(outVal2 + celt);
         // Align indices pointer if needed.
@@ -3016,7 +3022,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             reinterpret_cast<uint8_t*&>(inIdx) += sizeof(uint64_t) - cbAlign;
         outIdx = inIdx + celt;
         void* ptmp = outIdx + celt;
-        assert(reinterpret_cast<ElemType*>(reinterpret_cast<uint8_t*>(ptmp) + cbtemp) <= workspace.m_pArray + workspace.GetNumElements());
+        assert(reinterpret_cast<ElemType*>(reinterpret_cast<uint8_t*>(ptmp) + cbtemp) <= workspace->m_pArray + workspace->GetNumElements());
 
         // Initialize indices.
         const int ThreadsPerBlock = 128;
@@ -3032,6 +3038,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         cblock = (topK * n + ThreadsPerBlock - 1) / ThreadsPerBlock;
         _copyTopKResults<<<cblock, ThreadsPerBlock, 0, t_stream>>>(inIdx, outVal2, maxIndexes.m_pArray, maxValues.m_pArray, m, n, topK);
 
+        m_workspace->push(std::move(workspace));
 #ifndef _DEBUG
         UNUSED(err);
 #endif

diff --git a/Math/Math/GPUMatrix.h b/Math/Math/GPUMatrix.h
@@ -88,6 +88,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         static cublasHandle_t s_cuHandle[MaxGpus];
         static void *s_curandGenerator;
 
+        // Have to use naked pointer to avoid issues with __declspec(dllexport) on Windows.
+        // REVIEW alexeyk: can be allocated lazily but the current footprint is small anyway.
+        mutable conc_stack<std::unique_ptr<GPUMatrix<ElemType>>>* m_workspace = new conc_stack<std::unique_ptr<GPUMatrix<ElemType>>>;
+
     private:
         void performInplaceFunction(int kind);
         size_t LocateElement (const size_t i, const size_t j) const;
@@ -295,7 +299,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         GPUMatrix<ElemType>&  AddFoldedPositiveAndShiftedNegSample(const GPUMatrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);
 
         void VectorMax(GPUMatrix<ElemType>& maxIndexes, GPUMatrix<ElemType>& maxValues, const bool isColWise) const;
-        void VectorMax(GPUMatrix<ElemType>& maxIndexes, GPUMatrix<ElemType>& maxValues, const bool isColWise, int topK, GPUMatrix<ElemType>& workspace) const;
+        void VectorMax(GPUMatrix<ElemType>& maxIndexes, GPUMatrix<ElemType>& maxValues, const bool isColWise, int topK) const;
         void VectorMin(GPUMatrix<ElemType>& minIndexes, GPUMatrix<ElemType>& minValues, const bool isColWise) const;
 
         GPUMatrix<ElemType>&   AssignNumOfDiff(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, bool searchInCol = false); 

diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp
@@ -3400,20 +3400,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     }
 
     template<class ElemType>
-    void Matrix<ElemType>::VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise, int topK, Matrix<ElemType>& workspace) const
+    void Matrix<ElemType>::VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise, int topK) const
     {
         if (IsEmpty())
             throw std::logic_error("VectorMax: Matrix is empty.");
 
         DecideAndMoveToRightDevice(*this, maxIndexes, maxValues);
         maxIndexes.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
         maxValues.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
-        workspace.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
 
         DISPATCH_MATRIX_ON_FLAG(this,
             &maxValues,
             this->m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise, topK); maxIndexes.SetDataLocation(CPU, DENSE),
-            this->m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise, topK, *workspace.m_GPUMatrix); maxIndexes.SetDataLocation(GPU, DENSE),
+            this->m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise, topK); maxIndexes.SetDataLocation(GPU, DENSE),
             NOT_IMPLEMENTED,
             NOT_IMPLEMENTED
             );

diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h
@@ -368,7 +368,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         Matrix<ElemType>& AssignSignOf(const Matrix<ElemType>& a);
         Matrix<ElemType>& AddSignOf(const Matrix<ElemType>& a);
         void VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise) const;
-        void VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise, int topK, Matrix<ElemType>& workspace) const;
+        void VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise, int topK) const;
         void VectorMin(Matrix<ElemType>& minIndexes, Matrix<ElemType>& minValues, const bool isColWise) const;
 
         Matrix<ElemType>&  AssignNumOfDiff(const Matrix<ElemType>& a, const Matrix<ElemType>& b, bool searchInCol = false);