diff --git a/Math/Math/CPUMatrix.cpp b/Math/Math/CPUMatrix.cpp
new file mode 100644
index 000000000000..35bfa8bb0e61
--- /dev/null
+++ b/Math/Math/CPUMatrix.cpp
@@ -0,0 +1,5473 @@
+//
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//
+//
+// CPUMatrix.cpp : full implementation of all matrix functions on the CPU side
+//
+
+#include "stdafx.h"
+#include "Basics.h"
+#include "fileutil.h"
+
+#include
+#include
+#include
+#include
+#include "CPUMatrix.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef _WIN32
+#include
+#else
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+#include
+#endif
+
+#ifdef LEAKDETECT
+#include
+#endif
+
+#pragma warning (disable: 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
+#pragma warning (disable: 4702) // unreachable code; triggered for unknown reasons
+
+#ifndef USE_MKL
+// use ACML as default.
+// Download ACML 5.3.1 (e.g., acml5.3.1-ifort64.exe) or above
+// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
+// Install the ifort64_mp variant (compiled with intel compiler) of the library
+// Set Environment variable ACML_PATH to C:\AMD\acml5.3.1\ifort64_mp or the folder you installed acml
+// to point to your folder for the include file and link library
+#include // requires ACML 5.3.1 and above
+#else
+// requires MKL 10.0 and above
+#include
+#endif
+
+#ifndef USE_MKL //MKL has one additional parameter for different matrix order
+#define BLAS_COLMAJOR
+#else
+#define BLAS_COLMAJOR (int)MatrixOrder::ColMajor,
+#endif
+
+#define SWAP(a,b) {(a) ^= (b); (b) ^= (a); (a) ^= (b);}
+#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0 based indexing
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+#pragma region Helpful Enum Definitions
+ // Storage order of a matrix buffer; values match the CBLAS order constants.
+ enum class MatrixOrder
+ {
+ RowMajor = 101, // row-major arrays
+ ColMajor = 102 // column-major arrays
+ };
+
+ // Transpose flag passed to BLAS routines; values are the character codes BLAS expects.
+ enum class MatrixTranspose : char
+ {
+ NoTrans = 'N', // trans='N'
+ Trans = 'T', // trans='T'
+ ConjTrans = 'C' // trans='C'
+ };
+
+ // How a symmetric matrix is stored (BLAS 'uplo'-style character codes).
+ enum class SymMatrixType : char
+ {
+ Up = 'U', // symmetric matrix is stored in the upper part
+ Low = 'L', // symmetric matrix is stored in the lower part
+ Full = 'F', //full populated
+ NotSymmetric = 'N' //not a symmetric matrix
+ };
+
+ // Which side a matrix is applied on in a product (BLAS 'side' character codes).
+ enum class MatrixOpSide : char
+ {
+ Left = 'L', // left multiply
+ Right = 'R', // right multiply
+ };
+#pragma endregion Helpful Enum Definitions
+
+#pragma region Constructors and Destructor
+
+ // Reset all members to the empty/default state.
+ // Should only be used by constructors (and by move operations to strip the
+ // moved-from object). It does NOT free m_pArray — callers must have released it first.
+ template<class ElemType>
+ void CPUMatrix<ElemType>::ZeroInit()
+ {
+     m_computeDevice = CPUDEVICE;
+     m_pArray = nullptr;
+     m_numRows = 0;
+     m_numCols = 0;
+     m_elemSizeAllocated = 0;
+     m_matrixName = nullptr; // was NULL; nullptr for consistency with the rest of the file
+     m_format = matrixFormatDense;
+     m_externalBuffer = false;
+ }
+
+ // Default constructor: an empty 0x0 matrix with no allocation.
+ template<class ElemType>
+ CPUMatrix<ElemType>::CPUMatrix()
+ {
+     ZeroInit();
+ }
+
+ // Construct by reading from a file; matrixName is used to verify that the correct matrix is read.
+ template<class ElemType>
+ CPUMatrix<ElemType>::CPUMatrix(FILE* f, const char* matrixName)
+ {
+     ZeroInit();
+     ReadFromFile(f, matrixName);
+ }
+
+ // Helper to allocate a zero-initialized array of n ElemType values.
+ // Use this instead of new[] so debug builds can be switched to NaN initialization
+ // (flip the '#if 0' below) to flush out reads of uninitialized elements.
+ template<class ElemType>
+ static ElemType* NewArray(size_t n)
+ {
+     ElemType* p = new ElemType[n](); // () value-initializes (zeros) the array
+ #if 0 //_DEBUG
+     ElemType nan = Matrix<ElemType>::MakeNan(__LINE__);
+     for (size_t i = 0; i < n; i++)
+         p[i] = nan;
+ #endif
+     return p;
+ }
+
+ // Construct a numRows x numCols matrix; the buffer is allocated zero-initialized.
+ template<class ElemType>
+ CPUMatrix<ElemType>::CPUMatrix(const size_t numRows, const size_t numCols)
+ {
+     ZeroInit();
+
+     m_numRows = numRows;
+     m_numCols = numCols;
+     m_elemSizeAllocated = GetNumElements();
+
+     if (m_elemSizeAllocated != 0)
+         m_pArray = NewArray<ElemType>(m_elemSizeAllocated);
+ }
+
+ // Construct over an existing buffer. matrixFlags controls ownership
+ // (matrixFlagDontOwnBuffer wraps the caller's buffer without copying) and layout
+ // (matrixFormatRowMajor triggers a transpose copy inside SetValue).
+ template<class ElemType>
+ CPUMatrix<ElemType>::CPUMatrix(const size_t numRows, const size_t numCols, ElemType* pArray, const size_t matrixFlags)
+ {
+     ZeroInit();
+     SetValue(numRows, numCols, pArray, matrixFlags);
+ }
+
+ // copy constructor, deep copy
+ template<class ElemType>
+ CPUMatrix<ElemType>::CPUMatrix(const CPUMatrix<ElemType>& deepCopyFrom)
+ {
+     ZeroInit();
+     if (!deepCopyFrom.IsEmpty())
+         SetValue(deepCopyFrom);
+     SetMatrixName(deepCopyFrom.m_matrixName);
+ }
+
+ // assignment operator, deep copy
+ template<class ElemType>
+ CPUMatrix<ElemType>& CPUMatrix<ElemType>::operator=(const CPUMatrix<ElemType>& deepCopyFrom)
+ {
+     // self-assignment guard: without it, Clear() below would free our buffer and
+     // deepCopyFrom (== *this) would then appear empty, silently losing all data
+     if (this == &deepCopyFrom)
+         return *this;
+
+     Clear();
+     if (!deepCopyFrom.IsEmpty())
+         SetValue(deepCopyFrom);
+     SetMatrixName(deepCopyFrom.m_matrixName);
+     return *this;
+ }
+
+
+ // move constructor, shallow copy
+ template<class ElemType>
+ CPUMatrix<ElemType>::CPUMatrix(CPUMatrix<ElemType>&& moveFrom)
+ {
+     m_computeDevice = moveFrom.m_computeDevice;
+     m_numRows = moveFrom.m_numRows;
+     m_numCols = moveFrom.m_numCols;
+     m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
+     m_pArray = moveFrom.m_pArray; // shallow copy the pointer
+     m_matrixName = moveFrom.m_matrixName;
+     m_format = moveFrom.m_format;
+     m_externalBuffer = moveFrom.m_externalBuffer;
+     // release the pointer from the source object so that the destructor won't release it twice
+     moveFrom.ZeroInit();
+ }
+
+ // move assignment operator, shallow copy
+ template<class ElemType>
+ CPUMatrix<ElemType>& CPUMatrix<ElemType>::operator=(CPUMatrix<ElemType>&& moveFrom)
+ {
+     if (this != &moveFrom)
+     {
+         if (OwnBuffer() && m_pArray != nullptr)
+             delete[] m_pArray; // always delete the data pointer since we will use the pointer from moveFrom
+
+         m_computeDevice = moveFrom.m_computeDevice;
+         m_numRows = moveFrom.m_numRows;
+         m_numCols = moveFrom.m_numCols;
+         m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
+         m_pArray = moveFrom.m_pArray;
+         m_format = moveFrom.m_format;
+         m_externalBuffer = moveFrom.m_externalBuffer;
+         // NOTE(review): unlike the move constructor, m_matrixName is neither taken over
+         // nor freed here — looks like a leak of the target's name and a loss of the
+         // source's; confirm against BaseMatrix's ownership rules before changing.
+
+         // release the pointer from the source object so that the destructor won't release it twice
+         moveFrom.ZeroInit();
+     }
+     return *this;
+ }
+
+ template<class ElemType>
+ CPUMatrix<ElemType>::~CPUMatrix()
+ {
+     Clear();
+ }
+
+ // Release the owned buffer (if any) and reset the object to the empty state.
+ // External (non-owned) buffers are left untouched.
+ template<class ElemType>
+ void CPUMatrix<ElemType>::Clear()
+ {
+     if (m_pArray != nullptr && OwnBuffer())
+     {
+         delete[] m_pArray;
+         m_pArray = nullptr;
+         m_elemSizeAllocated = 0;
+     }
+     BaseMatrix<ElemType>::Clear(); // releases base-class state (e.g. the matrix name)
+
+     ZeroInit();
+ }
+
+#pragma endregion Constructors and Destructor
+
+#pragma region Basic Operators
+
+ // Return a view of columns [startColumn, startColumn+numCols) of this matrix.
+ // The returned matrix aliases this object's memory (m_externalBuffer = true), so it
+ // must not outlive *this and must not be resized.
+ template<class ElemType>
+ CPUMatrix<ElemType> CPUMatrix<ElemType>::ColumnSlice(size_t startColumn, size_t numCols) const
+ {
+     //if (numCols == 0)
+     //    LogicError("The slice cannot have 0 columns.");
+
+     if (startColumn + numCols > m_numCols)
+         InvalidArgument("The slice (%d+%d) is out of range of the source matrix (%d).", (int)startColumn, (int)numCols, (int)m_numCols);
+
+     CPUMatrix<ElemType> slice;
+
+     slice.m_externalBuffer = true; // memory of a slice is managed externally
+     slice.m_numRows = m_numRows;
+     slice.m_numCols = numCols;
+     slice.m_elemSizeAllocated = slice.GetNumElements();
+     slice.m_pArray = m_pArray + startColumn * m_numRows; // column-major: columns are contiguous
+     slice.m_format = m_format;
+
+     return slice;
+ }
+
+ // set this(:, 0:numCols-1) = fromMatrix(:, startColumn : startColumn+numCols-1)
+ // Turns *this into a non-owning view over fromMatrix's memory.
+ // TODO: why not say *this = ColumnSlice()?
+ template<class ElemType>
+ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignColumnSlice(const CPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols)
+ {
+     //if (numCols == 0)
+     //    LogicError("The slice cannot have 0 columns.");
+
+     if (startColumn + numCols > fromMatrix.m_numCols)
+         InvalidArgument("The slice (%d+%d) is out of range of the source matrix (%d).", (int)startColumn, (int)numCols, (int)fromMatrix.m_numCols);
+
+     Clear(); // release any owned buffer before aliasing
+
+     SetOwnBuffer(false); // memory of a slice is managed externally
+     m_numRows = fromMatrix.m_numRows;
+     m_numCols = numCols;
+     m_elemSizeAllocated = GetNumElements();
+     m_pArray = fromMatrix.m_pArray + startColumn * m_numRows;
+
+     return *this;
+ }
+
+ // set this(:, startColumn : startColumn+numCols-1) = fromMatrix (deep copy of the data)
+ template<class ElemType>
+ CPUMatrix<ElemType>& CPUMatrix<ElemType>::SetColumnSlice(const CPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols)
+ {
+     //if (numCols == 0)
+     //    LogicError("The slice cannot have 0 columns.");
+     if (startColumn + numCols > m_numCols)
+         LogicError("The slice is out of range of the destination matrix.");
+     if (numCols > fromMatrix.GetNumCols())
+         InvalidArgument("The slice (%d) is out of range of the source matrix (%d).", (int)numCols, (int)fromMatrix.GetNumCols());
+     if (m_numRows != fromMatrix.m_numRows)
+         LogicError("The number of rows in source and destination matrices do not match");
+
+     //SetOwnBuffer(false);
+     // column-major: the destination columns are contiguous, so one memcpy suffices
+     memcpy(m_pArray + startColumn*m_numRows, fromMatrix.m_pArray, numCols*m_numRows*sizeof(ElemType));
+
+     return *this;
+ }
+
+ // NOTE(review): the five functions below are corrupted in this patch — a tag-filter
+ // stripped every '<...>' span, which swallowed each OpenMP loop condition, loop body,
+ // and the following 'template <class ElemType>' header, fusing the functions together.
+ // Recover the bodies from upstream history before applying; do not hand-reconstruct.
+ template
+ void CPUMatrix::CopyColumnsStrided(const CPUMatrix& fromMatrix, size_t numCols, size_t srcNumColsStride, size_t destNumColsStride)
+ {
+ if ((((numCols - 1) * srcNumColsStride) + 1) > fromMatrix.m_numCols)
+ LogicError("The numCols to copy and srcNumColsStride specified is out of range of the source matrix.");
+ if ((((numCols - 1) * destNumColsStride) + 1) > m_numCols)
+ LogicError("The numCols to copy and srcNumColsStride specified is out of range of the destination matrix.");
+ if (m_numRows != fromMatrix.m_numRows)
+ LogicError("The number of rows in source and destination matrices do not match");
+
+ long n = (long)numCols, m = (long)m_numRows;
+
+ auto& us = *this;
+
+#pragma omp parallel for
+ // NOTE(review): loop condition/body truncated here (stripped '<...>' span)
+ for (long j = 0; j
+ CPUMatrix& CPUMatrix::AssignToRowSliceValuesOf(const CPUMatrix& a, const size_t startIndex, const size_t numRows)
+ {
+ if (a.GetNumRows() != numRows)
+ LogicError("AddToRowSliceValuesOf: a.GetNumRows() != numRows.");
+
+ if (startIndex + numRows > GetNumRows())
+ LogicError("AddToRowSliceValuesOf: startIndex + numRows exceeds GetNumRows().");
+
+ if (a.GetNumCols() != GetNumCols())
+ LogicError("AddToRowSliceValuesOf: columns does not match.");
+
+ long n = (long)a.GetNumCols(), m = (long)numRows;
+
+ auto& us = *this;
+
+#pragma omp parallel for
+ // NOTE(review): loop condition/body truncated here (stripped '<...>' span)
+ for (long j = 0; j
+ CPUMatrix& CPUMatrix::AssignRowSliceValuesOf(const CPUMatrix& a, const size_t startIndex, const size_t numRows)
+ {
+ //if (a.IsEmpty())
+ // LogicError("AssignRowSliceValuesOf: input matrix a is empty.");
+
+ if (startIndex + numRows > a.GetNumRows())
+ LogicError("AssignRowSliceValuesOf: startIndex + numRows exceeds a.GetNumRows().");
+
+ Resize(numRows, a.GetNumCols());
+
+ long n = (long)a.GetNumCols(); // note: OpenMP requires loop indices to be long, not size_t
+ long k = (long)a.GetNumRows();
+
+#pragma omp parallel for
+ // NOTE(review): loop condition/body truncated here (stripped '<...>' span)
+ for (long j=0; j
+ CPUMatrix& CPUMatrix::AddToRowSliceValuesOf(const CPUMatrix& a, const size_t startIndex, const size_t numRows)
+ {
+ if (a.IsEmpty())
+ LogicError("AddToRowSliceValuesOf: input matrix a is empty.");
+
+ if (a.GetNumRows() != numRows)
+ LogicError("AddToRowSliceValuesOf: a.GetNumRows() != numRows.");
+
+ if (startIndex + numRows > GetNumRows())
+ LogicError("AddToRowSliceValuesOf: startIndex + numRows exceeds GetNumRows().");
+
+ if (a.GetNumCols() != GetNumCols())
+ LogicError("AddToRowSliceValuesOf: columns does not match.");
+
+ long n=(long)a.GetNumCols(), m=(long)numRows;
+
+ auto& us = *this;
+
+#pragma omp parallel for
+ // NOTE(review): loop condition/body truncated here (stripped '<...>' span)
+ for (long j=0; j
+ CPUMatrix& CPUMatrix::AddWithRowSliceValuesOf(const CPUMatrix& a, const size_t startIndex, const size_t numRows)
+ {
+ if (a.IsEmpty())
+ LogicError("AddWithRowSliceValuesOf: input matrix a is empty.");
+
+ if (GetNumRows() != numRows)
+ LogicError("AddWithRowSliceValuesOf: GetNumRows() != numRows.");
+
+ if (startIndex + numRows > a.GetNumRows())
+ LogicError("AddWithRowSliceValuesOf: startIndex + numRows exceeds a.GetNumRows().");
+
+ if (a.GetNumCols() != GetNumCols())
+ LogicError("AddWithRowSliceValuesOf: columns does not match.");
+
+ long n = (long)a.GetNumCols(), m = (long)numRows;
+
+ auto& us = *this;
+
+#pragma omp parallel for
+ // NOTE(review): loop condition/body truncated here (stripped '<...>' span)
+ for (long j = 0; j
+ // Return the main diagonal of a square matrix as a 1 x numCols row vector.
+ template<class ElemType>
+ CPUMatrix<ElemType> CPUMatrix<ElemType>::Diagonal() const
+ {
+     if (m_numRows != m_numCols)
+         // cast to int: passing size_t through '...' for %d is undefined behavior
+         LogicError("Diagonal can be called only for square matrix. (rows=%d, cols=%d)", (int)m_numRows, (int)m_numCols);
+
+     CPUMatrix<ElemType> diag(1, m_numCols);
+
+     auto& us = *this;
+
+#pragma omp parallel for
+     for (long i = 0; i < (long)m_numRows; i++) // (long) cast: OpenMP needs a signed loop index
+     {
+         diag(0, (size_t)i) = us(i, i);
+     }
+
+     return diag;
+ }
+
+#if 0
+ // NOTE(review): this disabled block is also corrupted by the '<...>' stripping
+ // (template arguments and part of the copy loop at the line ending in "iGetNumRows()"
+ // are missing). It is compiled out, so it is left as-is; recover from upstream if re-enabled.
+ //stack the columns in inputMatrices (starting from sliceStartCol for sliceNumCols columns) and assign it to [this] object.
+ template
+ CPUMatrix& CPUMatrix::AssignRowStackValuesOf(const std::vector*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols)
+ {
+ if (sliceNumCols == 0)
+ LogicError("AssignRowStackValuesOf: sliceNumCols should > 0.");
+
+ size_t totalRows = 0;
+ size_t* startRowIndeces = new size_t[inputMatrices.size()];
+ startRowIndeces[0] = 0;
+ for (int i = 0; i < inputMatrices.size(); i++)
+ {
+ const CPUMatrix& a = *inputMatrices[i];
+ if (a.IsEmpty())
+ LogicError("AssignRowStackValuesOf: input matrix (%d) is empty.", i);
+
+ if (a.GetNumCols() < sliceStartCol + sliceNumCols)
+ LogicError("AssignRowStackValuesOf: input matrix (%d) GetNumCols() < sliceStartCol + sliceNumCols.", i);
+
+ totalRows += a.GetNumRows();
+ if (iGetNumRows() * sizeof(ElemType));
+ }
+ }
+
+ delete [] startRowIndeces;
+
+ return *this;
+ }
+#endif
+
+ // Subtract 1 from the single element at linear position 'position' of c.
+ // Used e.g. when forming (prediction - target) for a one-hot label.
+ template<class ElemType>
+ void CPUMatrix<ElemType>::MinusOneAt(CPUMatrix<ElemType>& c, const size_t position)
+ {
+     if (position < c.GetNumElements())
+         c.m_pArray[position] -= 1.0;
+     else
+         RuntimeError("MinusOneAt: position is out of CPU matrix size");
+ }
+
+ // Tile matrix a numRowRepeats times vertically and numColRepeats times horizontally
+ // into *this (result is (a.rows*numRowRepeats) x (a.cols*numColRepeats)).
+ template<class ElemType>
+ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignRepeatOf(const CPUMatrix<ElemType>& a, const size_t numRowRepeats, const size_t numColRepeats)
+ {
+     if (this == &a)
+         LogicError("AssignRepeatOf: a is the same as [this]. Does not support inplace repeat.");
+
+     if (a.IsEmpty())
+         LogicError("AssignRepeatOf: Matrix a is empty.");
+
+     Resize(a.GetNumRows() * numRowRepeats, a.GetNumCols() * numColRepeats);
+     long n = (long)a.GetNumCols(), m = (long)a.GetNumRows();
+     auto& us = *this;
+
+#pragma omp parallel for
+     for (long q = 0; q < (long)numColRepeats; q++) // (long) cast: OpenMP needs a signed index, avoids signed/unsigned compare
+     {
+         for (long p = 0; p < (long)numRowRepeats; p++)
+         {
+             long colOffset = q*n;
+
+             for (long j = 0; j < n; j++, colOffset++)
+             {
+                 long rowOffset = p*m;
+
+                 // four-way unrolling
+                 for (long i = 0; i < (m & ~3); i += 4, rowOffset += 4)
+                 {
+                     us(rowOffset, colOffset) = a(i, j);
+                     us(rowOffset + 1, colOffset) = a(i + 1, j);
+                     us(rowOffset + 2, colOffset) = a(i + 2, j);
+                     us(rowOffset + 3, colOffset) = a(i + 3, j);
+                 }
+                 // handle the remaining elements (m mod 4)
+                 for (long i = m & ~3; i < m; i++, rowOffset++)
+                 {
+                     us(rowOffset, colOffset) = a(i, j);
+                 }
+             }
+         }
+     }
+
+     return *this;
+ }
+
+ // NOTE(review): truncated in this patch — the stripped '<...>' span swallowed the
+ // OpenMP loop condition/body and the next 'template <class ElemType>' header.
+ // Recover the body from upstream history before applying.
+ template
+ CPUMatrix& CPUMatrix::AddToRowRepeatValuesOf(const CPUMatrix& a, const size_t numRepeats)
+ {
+ if (a.IsEmpty())
+ LogicError("AddToRowRepeatValuesOf: input matrix a is empty.");
+
+ if (a.GetNumRows() != GetNumRows() * numRepeats)
+ LogicError("AddToRowRepeatValuesOf: a.GetNumRows() != GetNumRows() * numRepeats.");
+
+ long n = (long)a.GetNumCols(), m = (long)GetNumRows();
+
+ auto& us = *this;
+
+#pragma omp parallel for
+ // NOTE(review): loop condition/body truncated here (stripped '<...>' span)
+ for (long j = 0; j
+ // Not implemented on the CPU; the statement expressions below silence
+ // unused-parameter warnings before NOT_IMPLEMENTED throws.
+ template<class ElemType>
+ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignPositiveAndShiftedNegSample(const CPUMatrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber)
+ {
+     a; posNumber; negNumber; shiftNumber;
+     NOT_IMPLEMENTED;
+ }
+
+ // Not implemented on the CPU (see GPU implementation).
+ template<class ElemType>
+ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AddFoldedPositiveAndShiftedNegSample(const CPUMatrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber)
+ {
+     a; posNumber; negNumber; shiftNumber;
+     NOT_IMPLEMENTED;
+ }
+
+ // Return a transposed copy of this matrix (out-of-place; *this is unchanged).
+ template<class ElemType>
+ CPUMatrix<ElemType> CPUMatrix<ElemType>::Transpose()
+ {
+     if (IsEmpty())
+         LogicError("Transpose: Matrix is empty.");
+
+     CPUMatrix<ElemType> c;
+     c.AssignTransposeOf(*this);
+     return c;
+ }
+
+ // NOTE(review): the six functions below are corrupted in this patch — each trailing
+ // loop's condition/body and the following 'template <class ElemType>' header were
+ // swallowed by the '<...>' stripping, fusing the functions together.
+ // Recover the bodies from upstream history before applying.
+ template
+ CPUMatrix& CPUMatrix::AssignTransposeOf (const CPUMatrix& a)
+ {
+ if (this == &a)
+ LogicError("AssignTransposeOf: a is the same as [this]. Does not support inplace transpose.");
+
+ if (a.IsEmpty())
+ LogicError("AssignTransposeOf: Matrix a is empty.");
+
+ Resize(a.GetNumCols(), a.GetNumRows());
+ long n=(long)a.GetNumCols(), m=(long)a.GetNumRows();
+
+ auto& us = *this;
+
+#pragma omp parallel for
+ // NOTE(review): loop condition/body truncated here (stripped '<...>' span)
+ for (long j=0; j
+ void CPUMatrix::SetValue(const ElemType v)
+ {
+ if (IsEmpty())
+ LogicError("SetValue: Matrix is empty.");
+ bool isFinite = std::numeric_limits::is_integer || std::isfinite((double)v);
+ if (isFinite && v == 0)
+ {
+ memset(m_pArray, 0, sizeof(ElemType) * GetNumElements());
+ }
+ else
+ {
+ long m=(long)GetNumElements();
+#pragma omp parallel for
+ //four-way unrolling
+ for (long i=0; i<(m & ~3); i+=4)
+ {
+ m_pArray[i] = v;
+ m_pArray[i+1] = v;
+ m_pArray[i+2] = v;
+ m_pArray[i+3] = v;
+ }
+ //handle remaining stuffs
+ // NOTE(review): remainder-loop condition/body truncated here (stripped '<...>' span)
+ for (long i=m & ~3; i
+ void CPUMatrix::MaskColumnsValue(const CPUMatrix& columnsMask, ElemType val)
+ {
+ if (GetNumCols() != columnsMask.GetNumCols())
+ RuntimeError("Matrix and column mask must have equal number of columns");
+
+ auto& us = *this;
+ long n = (long)GetNumCols(), m = (long)GetNumRows();
+#pragma omp parallel for
+ // NOTE(review): loop condition/body truncated here (stripped '<...>' span)
+ for (long j = 0; j
+ void CPUMatrix::SetColumn(const ElemType* colPointer, size_t j)
+ {
+ if (IsEmpty())
+ LogicError("SetColumn: Matrix is empty.");
+ if (colPointer==NULL)
+ return;
+
+ auto& us = *this;
+ long m=(long)GetNumRows();
+#pragma omp parallel for
+ //four-way unrolling
+ for (long i=0; i<(m & ~3); i+=4)
+ {
+ us(i,j) = colPointer[i];
+ us(i+1,j) = colPointer[i+1];
+ us(i+2,j) = colPointer[i+2];
+ us(i+3,j) = colPointer[i+3];
+ }
+ //handle remaining stuffs
+ // NOTE(review): remainder-loop condition/body truncated here (stripped '<...>' span)
+ for (long i=m & ~3; i
+ void CPUMatrix::SetColumn(const ElemType val, size_t j)
+ {
+ if (IsEmpty())
+ LogicError("SetColumn: Matrix is empty.");
+
+ auto& us = *this;
+ long m=(long)GetNumRows();
+#pragma omp parallel for
+ //four-way unrolling
+ for (long i=0; i<(m & ~3); i+=4)
+ {
+ us(i,j) = val;
+ us(i+1,j) = val;
+ us(i+2,j) = val;
+ us(i+3,j) = val;
+ }
+ //handle remaining stuffs
+ // NOTE(review): remainder-loop condition/body truncated here (stripped '<...>' span)
+ for (long i=m & ~3; i
+ void CPUMatrix::SetColumn(const CPUMatrix& valMat, size_t j)
+ {
+ if (IsEmpty())
+ LogicError("SetColumn: Matrix is empty.");
+ assert(valMat.GetNumRows() == GetNumRows() && valMat.GetNumCols() == 1) ;
+
+ auto& us = *this;
+ long m=(long)GetNumRows();
+#pragma omp parallel for
+ //four-way unrolling
+ for (long i=0; i<(m & ~3); i+=4)
+ {
+ us(i,j) = valMat(i,0);
+ us(i+1,j) = valMat(i+1,0);
+ us(i+2,j) = valMat(i+2,0);
+ us(i+3,j) = valMat(i+3,0);
+ }
+ //handle remaining stuffs
+ // NOTE(review): remainder-loop condition/body truncated here (stripped '<...>' span)
+ for (long i=m & ~3; i
+ // Deep-copy the contents of deepCopyFrom into *this (resizing as needed).
+ // Self-copy is a no-op.
+ template<class ElemType>
+ void CPUMatrix<ElemType>::SetValue(const CPUMatrix<ElemType>& deepCopyFrom)
+ {
+     if (this == &deepCopyFrom)
+         return;
+
+     Resize(deepCopyFrom.GetNumRows(), deepCopyFrom.GetNumCols());
+     memcpy(m_pArray, deepCopyFrom.m_pArray, deepCopyFrom.GetNumElements() * sizeof(ElemType));
+ }
+
+ // Populate *this from a raw array.
+ // - matrixFlagDontOwnBuffer: wrap pArray directly (no copy); *this becomes a non-owning view.
+ // - matrixFormatRowMajor: pArray is row-major, so copy with a stride to transpose into
+ //   our column-major layout (uses BLAS ?copy for speed).
+ // - otherwise: plain memcpy.
+ template<class ElemType>
+ void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, ElemType *pArray, const size_t matrixFlags)
+ {
+     if (pArray == nullptr)
+         InvalidArgument("Invalid pArray.");
+
+     m_format = matrixFormatDense;
+     m_computeDevice = CPUDEVICE;
+
+     // if it's externally managed, then populate the structure
+     if (matrixFlags & matrixFlagDontOwnBuffer)
+     {
+         // free previous array allocation if any before overwriting — but only if we
+         // actually own it; deleting an external buffer here would free memory that
+         // belongs to someone else
+         if (OwnBuffer() && m_pArray != nullptr)
+             delete[] m_pArray;
+
+         m_pArray = pArray;
+         m_numRows = numRows;
+         m_numCols = numCols;
+         m_elemSizeAllocated = GetNumElements();
+         m_externalBuffer = true;
+     }
+     else
+     {
+         Resize(numRows, numCols);
+
+         if (IsEmpty())
+         {
+             InvalidArgument("NumRows or NumCols is 0. Nothing to copy");
+         }
+         else
+         {
+             if (!(matrixFlags & matrixFormatRowMajor)) // compatible with internal structure
+             {
+                 memcpy(m_pArray, pArray, GetNumElements()*sizeof(ElemType));
+             }
+             else // need to transpose
+             {
+                 auto& us = *this;
+                 if (sizeof(ElemType) == sizeof(double))
+                 {
+ #pragma omp parallel for
+                     foreach_column(j, us)
+                     {
+                         // copy row j of the row-major source (stride numCols) into column j
+ #ifndef USE_MKL
+                         dcopy((int)numRows, reinterpret_cast<double*>(pArray+j), (int)numCols, reinterpret_cast<double*>(m_pArray + LocateColumn(j)), 1);
+ #else
+                         cblas_dcopy((int)numRows, reinterpret_cast<double*>(pArray+j), (int)numCols, reinterpret_cast<double*>(m_pArray + LocateColumn(j)), 1);
+ #endif
+                     }
+                 }
+                 else
+                 {
+ #pragma omp parallel for
+                     foreach_column(j, us)
+                     {
+                         {
+ #pragma warning (suppress: 4244)
+ #ifndef USE_MKL
+                             scopy((int)numRows, reinterpret_cast<float*>(pArray+j), (int)numCols, reinterpret_cast<float*>(m_pArray + LocateColumn(j)), 1);
+ #else
+                             cblas_scopy((int)numRows, reinterpret_cast<float*>(pArray+j), (int)numCols, reinterpret_cast<float*>(m_pArray + LocateColumn(j)), 1);
+ #endif
+                         }
+                     }
+                 }
+             }
+         }
+     }
+ }
+
+ // NOTE(review): the three functions below are corrupted in this patch — each trailing
+ // remainder-loop and the next 'template <class ElemType>' header were swallowed by the
+ // '<...>' stripping; distribution types also lost their <ElemType> arguments.
+ // Recover the bodies from upstream history before applying.
+ template
+ void CPUMatrix::SetDiagonalValue(const ElemType v)
+ {
+ if (IsEmpty())
+ LogicError("SetDiagonalValue: Matrix is empty.");
+
+ if (GetNumRows() != GetNumCols())
+ LogicError("SetDiagonalValue: NumRows and NumCols do not agree.");
+
+ auto& us = *this;
+ long m=(long)GetNumRows();
+#pragma omp parallel for
+ //four-way unrolling
+ for (long i=0; i<(m & ~3); i+=4)
+ {
+ us(i,i) = v;
+ us(i+1,i+1) = v;
+ us(i+2,i+2) = v;
+ us(i+3,i+3) = v;
+ }
+ //handle remaining stuffs
+ // NOTE(review): remainder-loop condition/body truncated here (stripped '<...>' span)
+ for (long i=m & ~3; i
+ void CPUMatrix::SetDiagonalValue(const CPUMatrix& vector)
+ {
+ if (IsEmpty() || vector.IsEmpty())
+ LogicError("SetDiagonalValue: Matrix is empty.");
+
+ if (GetNumRows() != GetNumCols())
+ LogicError("SetDiagonalValue: NumRows and NumCols do not agree.");
+
+ if (vector.GetNumRows() != 1 && vector.GetNumCols() != 1)
+ LogicError("SetDiagonalValue: input vector must be a vector.");
+
+ if (vector.GetNumElements() == 1) //reduce to simple form
+ SetDiagonalValue(vector(0,0));
+ else if (vector.GetNumRows() != GetNumRows())
+ LogicError("SetDiagonalValue: input vector's dimension does not agree with [this].");
+ else
+ {
+ auto& us = *this;
+
+ long m=(long)GetNumRows();
+ if (vector.GetNumRows() == 1) //row vector
+ {
+#pragma omp parallel for
+ //four-way unrolling
+ for (long i=0; i<(m & ~3); i+=4)
+ {
+ us(i,i) = vector(0, i);
+ us(i+1,i+1) = vector(0, i+1);
+ us(i+2,i+2) = vector(0, i+2);
+ us(i+3,i+3) = vector(0, i+3);
+ }
+ //handle remaining stuffs
+ // NOTE(review): remainder-loop and the column-vector branch truncated here (stripped '<...>' span)
+ for (long i=m & ~3; i
+ void CPUMatrix::SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed)
+ {
+ if (IsEmpty())
+ LogicError("SetUniformRandomValue: Matrix is empty.");
+
+#ifdef _MSC_VER // TODO: check if available under GCC/Linux
+ std::ranlux64_base_01 generator;
+ generator.seed(seed==USE_TIME_BASED_SEED ? (unsigned long) time(NULL) : seed);
+#else
+ std::default_random_engine generator (seed);
+#endif
+ std::uniform_real_distribution r(low, high);
+
+ long m=(long)GetNumElements();
+ //four-way unrolling
+ for (long i=0; i<(m & ~3); i+=4)
+ {
+ m_pArray[i] = r(generator);
+ m_pArray[i+1] = r(generator);
+ m_pArray[i+2] = r(generator);
+ m_pArray[i+3] = r(generator);
+ }
+ //handle remaining stuffs
+ // NOTE(review): remainder-loop condition/body truncated here (stripped '<...>' span)
+ for (long i=m & ~3; i
+ // Fill the matrix with N(mean, sigma) Gaussian random values.
+ // Not parallelized: sharing one generator across threads would race and break determinism.
+ template<class ElemType>
+ void CPUMatrix<ElemType>::SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed)
+ {
+     if (sigma <= 0)
+         InvalidArgument("SetGaussianRandomValue: sigma must be a positive value."); // was "SetUniformRandomValue" (copy-paste)
+
+     if (IsEmpty())
+         LogicError("SetGaussianRandomValue: Matrix is empty."); // was "SetUniformRandomValue" (copy-paste)
+
+     auto& us = *this;
+ #ifdef _MSC_VER // TODO: check if available under GCC/Linux
+     std::ranlux64_base_01 generator;
+     generator.seed(seed == USE_TIME_BASED_SEED ? (unsigned long) time(NULL) : seed);
+ #else
+     std::default_random_engine generator(seed);
+ #endif
+     std::normal_distribution<ElemType> r(mean, sigma);
+     //#pragma omp parallel for //is it thread safe?
+     foreach_coord(i, j, us)
+     {
+         us(i, j) = r(generator);
+     }
+ }
+
+ // NOTE(review): the two functions below are corrupted in this patch — each final loop
+ // and the following 'template <class ElemType>' header were swallowed by the '<...>'
+ // stripping; distribution types also lost their <ElemType> arguments. Additionally the
+ // error messages say "SetUniformRandomValue" (copy-paste). Recover from upstream.
+ template
+ void CPUMatrix::AddGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed)
+ {
+ if (sigma <= 0)
+ InvalidArgument("SetUniformRandomValue: sigma must be a positive value.");
+
+ if (IsEmpty())
+ LogicError("SetUniformRandomValue: Matrix is empty.");
+
+ auto& us = *this;
+#ifdef _MSC_VER // TODO: check if available under GCC/Linux
+ std::ranlux64_base_01 generator;
+ generator.seed(seed==USE_TIME_BASED_SEED ? (unsigned long) time(NULL) : seed);
+#else
+ std::default_random_engine generator (seed);
+#endif
+ std::normal_distribution r(mean, sigma);
+
+ long m=(long)GetNumRows(), n=(long)GetNumCols();
+ // NOTE(review): loop condition/body truncated here (stripped '<...>' span)
+ for (long j=0; j
+ void CPUMatrix::SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, unsigned long seed)
+ {
+ if (IsEmpty())
+ LogicError("SetUniformRandomValue: Matrix is empty.");
+
+ auto& us = *this;
+#ifdef _MSC_VER // TODO: check if available under GCC/Linux
+ std::ranlux64_base_01 generator;
+ generator.seed(seed==USE_TIME_BASED_SEED ? (unsigned long) time(NULL) : seed);
+#else
+ std::default_random_engine generator (seed==USE_TIME_BASED_SEED ? (unsigned long) time(NULL) : seed);
+#endif
+ std::uniform_real_distribution r(0, 1);
+
+ long m=(long)GetNumRows(), n=(long)GetNumCols();
+ ElemType v;
+ // NOTE(review): loop condition/body truncated here (stripped '<...>' span)
+ for (long j=0; j
+ // Adagrad update: *this accumulates squared gradients; 'gradients' is rescaled in place
+ // by 1/sqrt(accumulator + floor). Returns the average multiplier when requested.
+ // NOTE(review): the remainder loop near the end (line "for (long i = n & ~3; i 0)") is
+ // corrupted — the stripped '<...>' span swallowed its condition, body, and the
+ // "if (needAveMultiplier && n" guard. Recover from upstream history before applying.
+ ElemType CPUMatrix::Adagrad(CPUMatrix& gradients, const bool needAveMultiplier)
+ {
+ ElemType aveMultiplier = 0;
+
+ if (IsEmpty() || gradients.GetNumCols() != GetNumCols() || gradients.GetNumRows() != GetNumRows())
+ {
+ Resize(gradients.GetNumRows(), gradients.GetNumCols());
+ SetValue(0.0);
+ }
+
+ assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == gradients.GetNumCols());
+
+ ElemType *a=m_pArray, *d_v=gradients.m_pArray;
+ size_t n = GetNumElements();
+
+ const ElemType floor = 1e-16f;
+ ElemType a0, a1, a2, a3;
+
+ //disable omp here because aveMultiper needs to be added atomically. however, it seems the result is incorrect even if rmp atomic and amp critical are used.
+//#pragma omp parallel for
+ for (long i = 0; i<(n & ~3); i += 4) //four-way unrolling
+ {
+ a[i] += d_v[i] * d_v[i];
+ a[i+1] += d_v[i+1] * d_v[i+1];
+ a[i+2] += d_v[i+2] * d_v[i+2];
+ a[i+3] += d_v[i+3] * d_v[i+3];
+
+ a0 = sqrt(a[i] + floor);
+ a1 = sqrt(a[i + 1] + floor);
+ a2 = sqrt(a[i + 2] + floor);
+ a3 = sqrt(a[i + 3] + floor);
+
+ d_v[i] /= a0;
+ d_v[i+1] /= a1;
+ d_v[i+2] /= a2;
+ d_v[i+3] /= a3;
+
+ if (needAveMultiplier)
+ {
+ aveMultiplier += 1 / a0 + 1 / a1 + 1 / a2 + 1 / a3;
+ }
+ }
+
+ // get the last few elements if any
+ // NOTE(review): the line below is garbled (remainder loop fused with the return guard)
+ for (long i = n & ~3; i 0)
+ return aveMultiplier / n;
+ else
+ return 1;
+ }
+
+ // FSAdagrad update. *this holds the optimizer state as two stacked column blocks:
+ // [0, n) = smoothed squared gradients, [n, 2n) = smoothed momentum.
+ // 'gradients' is read (and effectively consumed); 'functionValues' (the parameters)
+ // is updated in place by -learnRatePerSample * adjusted gradient.
+ template<class ElemType>
+ void CPUMatrix<ElemType>::FSAdagrad(CPUMatrix<ElemType>& gradients,
+     CPUMatrix<ElemType>& functionValues,
+     ElemType learnRatePerSample,
+     ElemType momentum,
+     ElemType adaWeight,
+     ElemType adaMul)
+ {
+     size_t numColsNeeded = 2 * gradients.GetNumCols(); // two state blocks side by side
+
+     if (IsEmpty() || (GetNumCols() < numColsNeeded))
+     {
+         Resize(gradients.GetNumRows(), numColsNeeded);
+         SetValue(0.0); // fresh state starts at zero
+     }
+
+     assert((GetNumRows() == gradients.GetNumRows()) && (GetNumCols() == numColsNeeded));
+
+     size_t n = gradients.GetNumElements();
+     ElemType* grad = gradients.m_pArray;
+     ElemType* smoothAda = m_pArray;
+     ElemType* smoothMom = m_pArray + n;
+     ElemType* val = functionValues.m_pArray;
+ #pragma omp parallel for
+     // TODO: Unroll 4-times for better performance leveraging vectorization
+     for (long i = 0; i < (long)n; i++) // (long) cast avoids signed/unsigned compare under OMP
+     {
+         ElemType g = grad[i];
+         ElemType adaSqr = adaWeight * smoothAda[i] + (1.0f - adaWeight) * g * g;
+         smoothAda[i] = adaSqr;
+         if (adaSqr != 0.0f)
+         {
+             ElemType ada = sqrt(adaSqr);
+             ElemType w = adaMul * ((ElemType)1.0 / ada);
+
+             if (w > 10.0f) // clip the per-element scale to avoid blow-up on tiny accumulators
+                 w = 10.0f;
+             g *= w;
+         }
+
+         if (momentum > 0.0f)
+         {
+             g = momentum * smoothMom[i] + (1.0f - momentum) * g;
+             smoothMom[i] = g;
+         }
+
+         g *= learnRatePerSample;
+         val[i] -= g;
+     }
+ }
+
+ // RmsProp update. *this holds the optimizer state as three stacked blocks of n elements:
+ // [0,n) = moving average of squared gradients, [n,2n) = sign of the previous gradient,
+ // [2n,3n) = per-element step size. 'gradients' is rescaled in place.
+ // NOTE(review): the main update loop (line "for (long i=0; i 0 )") is corrupted — the
+ // stripped '<...>' span swallowed the loop header, the avars/grad_sign computation, and
+ // the step-size branch condition. Recover from upstream history before applying.
+ template
+ ElemType CPUMatrix::RmsProp(CPUMatrix& gradients,
+ ElemType RMS_GAMMA,
+ ElemType RMS_WGT_INC,
+ ElemType RMS_WGT_MAX,
+ ElemType RMS_WGT_DEC,
+ ElemType RMS_WGT_MIN,
+ const bool needAveMultiplier
+ )
+ {
+ const ElemType floor = 1e-6f;
+
+ size_t n = gradients.GetNumElements();
+ ElemType *curr_grad=gradients.m_pArray;
+
+ if (IsEmpty() || GetNumCols() < gradients.GetNumCols() * 3)
+ {
+ Resize(gradients.GetNumRows(), gradients.GetNumCols() * 3);
+ SetValue(0.0);
+
+ ElemType *avars=m_pArray; // accumulated variances for RMS scaling
+ ElemType *steps=m_pArray+2*n; // current step size
+
+ // initialize moving average of gradient-squared
+ for( long i = 0; i < n; i++ )
+ avars[i] = curr_grad[i]*curr_grad[i];
+
+ // initialize starting step size
+ for( long i = 0; i < n; i++ )
+ steps[i] = ElemType(0.02);
+ }
+
+ ElemType *avars=m_pArray; // accumulated variances for RMS scaling
+ ElemType *signs=m_pArray+n; // sign of previous gradient
+ ElemType *steps=m_pArray+2*n; // current step size
+
+ assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == gradients.GetNumCols() * 3);
+
+ ElemType ONE_MINUS_GAMMA = ElemType(1.0) - RMS_GAMMA;
+ //int upd[] = {
+ // 2,2,0,
+ // 2,2,0,
+ // 1,1,1,
+ // 2,2,0,
+ // 1,2,1,
+ // 0,2,2,
+ // 1,1,1,
+ // 0,2,2,
+ // 0,2,2,
+ //};
+
+ // for (long i=0; ineg, 1->zero, 2->pos
+ // const int grad_sign = 1 + (ElemType(0) < curr_grad[i]) - (curr_grad[i] < ElemType(0));
+
+ // // signs[i] contains three consecutive grad_sign
+ // signs[i] = 3*(int(signs[i]) % 9) + grad_sign;
+
+ // switch(upd[int(signs[i])])
+ // {
+ // case 0:
+ // steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
+ // break;
+ // case 2:
+ // steps[i] = min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
+ // break;
+ // }
+ // curr_grad[i] *= steps[i] / sqrt(avars[i] + floor);
+ // }
+
+ ElemType aveMultiplier = 0, a;
+ // NOTE(review): the line below is garbled (loop header and update computation fused)
+ for (long i=0; i 0 )
+ steps[i] = min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
+ else
+ steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
+
+ a = steps[i] / sqrt(avars[i] + floor);
+ curr_grad[i] *= a;
+ signs[i] = (ElemType)grad_sign;
+
+ if (needAveMultiplier)
+ aveMultiplier += a;
+ }
+
+ if (needAveMultiplier)
+ return aveMultiplier / n;
+ else
+ return 1;
+ }
+
+ // Reinterpret the buffer as numRows x numCols without moving any data.
+ // Valid only when the total element count is unchanged (column-major layout is preserved).
+ template<class ElemType>
+ void CPUMatrix<ElemType>::Reshape(const size_t numRows, const size_t numCols)
+ {
+     assert(numRows*numCols == GetNumElements());
+     if (numRows*numCols != GetNumElements())
+         InvalidArgument("Reshape: Total number of elements does not match.");
+
+     m_numRows = numRows;
+     m_numCols = numCols;
+ }
+
+ // Resize() -- change matrix size
+ // This function is cheap if the matrix size does not change.
+ // Current content is not preserved.
+ // BUGBUG: There is code that relies on zero initialization (without, we get subtle variations of output). That is wrong--we should initialize to QNaN and see where it fails.
+ // If growOnly is true, resize will not reallocate memory if the current memory is large enough (i.e., will not shrink).
+ // If this object does not own its memory then new memory cannot be allocated (one can still shrink and/or reshape).
+ template<class ElemType>
+ void CPUMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, bool growOnly /*=true*/)
+ {
+     if (m_numRows == numRows && m_numCols == numCols)
+         return;
+
+     size_t numElements = numRows * numCols;
+     if (numElements > m_elemSizeAllocated ||                    // grow allocation
+         (!growOnly && (numElements != m_elemSizeAllocated)))    // shrink allocation (not if 'growOnly')
+     {
+         // reallocate buffer
+         ElemType* pArray = nullptr;
+         if (numElements > 0)
+         {
+             if (!OwnBuffer())
+                 LogicError("Resize: Resizing an matrix you don't own is not supported.");
+             pArray = NewArray<ElemType>(numElements); // may throw; old buffer is untouched until this succeeds
+         }
+         // success: update the object
+         if (OwnBuffer())
+             delete[] m_pArray;
+         else
+             assert(pArray == nullptr); // (if !OwnBuffer we can still resize to 0)
+         m_pArray = pArray;
+         m_elemSizeAllocated = numElements;
+     }
+
+     // success
+     m_numRows = numRows;
+     m_numCols = numCols;
+ }
+
+ // Return a newly allocated copy of the matrix contents (nullptr when empty).
+ // Allocated by the callee but must be deleted (with delete[]) by the caller.
+ // TODO: change to use STL vector instead
+ template<class ElemType>
+ ElemType* CPUMatrix<ElemType>::CopyToArray() const
+ {
+     size_t numElements = GetNumElements();
+     if (numElements != 0)
+     {
+         ElemType* arrayCopyTo = NewArray<ElemType>(numElements);
+         memcpy(arrayCopyTo, m_pArray, sizeof(ElemType)*numElements);
+         return arrayCopyTo;
+     }
+     else
+     {
+         return nullptr;
+     }
+ }
+
+ // Copy the matrix contents into arrayCopyTo, reallocating it if too small.
+ // Memory is (re)allocated by the callee but must be deleted (with delete[]) by the
+ // caller when done. Returns the number of elements copied.
+ template<class ElemType>
+ size_t CPUMatrix<ElemType>::CopyToArray(ElemType*& arrayCopyTo, size_t& currentArraySize) const
+ {
+     size_t numElements = GetNumElements();
+
+     if (numElements > currentArraySize)
+     {
+         // was 'delete arrayCopyTo': the buffer comes from NewArray (new[]), so scalar
+         // delete is undefined behavior — must be array delete
+         delete[] arrayCopyTo;
+         arrayCopyTo = NewArray<ElemType>(numElements);
+         currentArraySize = numElements;
+     }
+
+     if (numElements != 0)
+     {
+         memcpy(arrayCopyTo, m_pArray, sizeof(ElemType)*numElements);
+     }
+
+     return numElements;
+ }
+
+ // Copy a sub-rectangle to an external buffer — not supported on the CPU.
+ template<class ElemType>
+ void CPUMatrix<ElemType>::CopySection(size_t /*numRows*/, size_t /*numCols*/, ElemType* /*dst*/, size_t /*colStride*/) const
+ {
+     // REVIEW alexeyk: currently not used by CPU, but implement when possible.
+     RuntimeError("Not implemented.");
+ }
+
+ // Linear offset of the first element of column 'col' (column-major storage).
+ template<class ElemType>
+ inline size_t CPUMatrix<ElemType>::LocateColumn(const size_t col) const
+ {
+     assert(col < m_numCols);
+     return col * m_numRows; // matrix in column-wise storage
+ }
+
+ // Linear offset of element (row, col) (column-major storage).
+ template<class ElemType>
+ inline size_t CPUMatrix<ElemType>::LocateElement(const size_t row, const size_t col) const
+ {
+     assert(row < m_numRows);
+     return LocateColumn(col) + row; // matrix in column-wise storage
+ }
+
+#pragma endregion Basic Operators
+
+#pragma region Member BLAS Functions
+
+ // *this += alpha (element-wise, in place).
+ template<class ElemType>
+ CPUMatrix<ElemType>& CPUMatrix<ElemType>::operator+= (ElemType alpha)
+ {
+     return AssignSumOf(alpha, *this);
+ }
+
+ // Return a copy of *this with alpha added to every element.
+ template<class ElemType>
+ CPUMatrix<ElemType> CPUMatrix<ElemType>::operator+ (ElemType alpha) const
+ {
+     CPUMatrix<ElemType> c(GetNumRows(), GetNumCols());
+     c.AssignSumOf(alpha, *this);
+     return c;
+ }
+
+ // NOTE(review): truncated in this patch — the stripped '<...>' span swallowed the
+ // OpenMP loop condition/body and the next 'template <class ElemType>' header.
+ // Recover the body from upstream history before applying.
+ template
+ CPUMatrix& CPUMatrix::AssignSumOf(const ElemType alpha, const CPUMatrix& a)
+ {
+ if (a.IsEmpty())
+ LogicError("AssignSumOf: Matrix a is empty.");
+
+ auto& us=*this;
+ if (this != &a)
+ Resize(a.GetNumRows(), a.GetNumCols());
+
+ long m=(long)GetNumRows(), n=(long)GetNumCols();
+#pragma omp parallel for
+ // NOTE(review): loop condition/body truncated here (stripped '<...>' span)
+ for (long j=0; j
+ // *this += a, delegating to the BLAS-backed static ScaleAndAdd helper.
+ template<class ElemType>
+ CPUMatrix<ElemType>& CPUMatrix<ElemType>::operator+= (const CPUMatrix<ElemType>& a)
+ {
+     //if (a.GetNumElements() == 1)
+     //    *this += a(0,0);
+     //else
+     ScaleAndAdd(1, a, *this);
+
+     return *this;
+ }
+
+ // if [this] and a have same dimension then OUTPUT = [this] + a
+ // if a is a column vector, add to all columns of [this]
+ // if a is a row vector, add to all rows of [this]
+ template<class ElemType>
+ CPUMatrix<ElemType> CPUMatrix<ElemType>::operator+ (const CPUMatrix<ElemType>& a) const
+ {
+     if (GetNumElements() == 1) // scalar + matrix
+     {
+         CPUMatrix<ElemType> c(a);
+         c += (*this)(0, 0);
+         return c;
+     }
+     else if (a.GetNumElements() == 1) // matrix + scalar
+     {
+         CPUMatrix<ElemType> c(*this);
+         c += a(0, 0);
+         return c;
+     }
+     else
+     {
+         CPUMatrix<ElemType> c(*this); // this implementation will introduce a copy overhead, but makes reuse of the code
+         c += a;
+         return c;
+     }
+ }
+
+ // *this = a + b, handling the 1x1-operand (scalar) cases.
+ template<class ElemType>
+ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignSumOf(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b)
+ {
+     if (a.GetNumElements() == 1)
+     {
+         SetValue(b);
+         (*this) += a;
+     }
+     else
+     {
+         SetValue(a);
+         (*this) += b;
+     }
+     return *this;
+ }
+
+ // *this -= alpha (element-wise, in place).
+ template<class ElemType>
+ CPUMatrix<ElemType>& CPUMatrix<ElemType>::operator-= (ElemType alpha)
+ {
+     return AssignDifferenceOf(*this, alpha);
+ }
+
+ // Return a copy of *this with alpha subtracted from every element.
+ template<class ElemType>
+ CPUMatrix<ElemType> CPUMatrix<ElemType>::operator- (ElemType alpha) const
+ {
+     CPUMatrix<ElemType> c(GetNumRows(), GetNumCols());
+     c.AssignDifferenceOf(*this, alpha);
+     return c;
+ }
+
+ // NOTE(review): the two functions below are truncated in this patch — each OpenMP loop
+ // condition/body and the following 'template <class ElemType>' header were swallowed by
+ // the '<...>' stripping. Recover the bodies from upstream history before applying.
+ template
+ CPUMatrix& CPUMatrix::AssignDifferenceOf(const ElemType alpha, const CPUMatrix& a)
+ {
+ auto& us=*this;
+ if (this != &a)
+ Resize(a.GetNumRows(), a.GetNumCols());
+
+ long m=(long)GetNumRows(), n=(long)GetNumCols();
+#pragma omp parallel for
+ // NOTE(review): loop condition/body truncated here (stripped '<...>' span)
+ for (long j=0; j
+ CPUMatrix& CPUMatrix::AssignDifferenceOf(const CPUMatrix& a, const ElemType alpha)
+ {
+ auto& us=*this;
+ if (this != &a)
+ Resize(a.GetNumRows(), a.GetNumCols());
+
+ long m=(long)GetNumRows(), n=(long)GetNumCols();
+#pragma omp parallel for
+ // NOTE(review): loop condition/body truncated here (stripped '<...>' span)
+ for (long j=0; j
+ // *this -= a, delegating to the BLAS-backed static ScaleAndAdd helper.
+ template<class ElemType>
+ CPUMatrix<ElemType>& CPUMatrix<ElemType>::operator-= (const CPUMatrix<ElemType>& a)
+ {
+     ScaleAndAdd(-1, a, *this);
+
+     return *this;
+ }
+
+ // if [this] and a have same dimension then output = [this] - a
+ // if a is a column vector, minus it from all columns of [this]
+ // if a is a row vector, minus it from all rows of [this]
+ template<class ElemType>
+ CPUMatrix<ElemType> CPUMatrix<ElemType>::operator- (const CPUMatrix<ElemType>& a) const
+ {
+     CPUMatrix<ElemType> c(*this); // this implementation will introduce a copy overhead, but makes reuse of the code
+     c -= a;
+     return c;
+ }
+
+ // *this = a - b.
+ // NOTE(review): when this == &b (and this != &a), SetValue(a) overwrites b before the
+ // subtraction — looks like an aliasing hazard; confirm callers never pass that pattern.
+ template<class ElemType>
+ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignDifferenceOf(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b)
+ {
+     if (this != &a)
+     {
+         Resize(a.GetNumRows(), a.GetNumCols());
+         SetValue(a);
+     }
+     (*this) -= b;
+     return *this;
+ }
+
+ // *this *= alpha (element-wise scale, in place).
+ template<class ElemType>
+ CPUMatrix<ElemType>& CPUMatrix<ElemType>::operator*= (ElemType alpha)
+ {
+     Scale(alpha, *this);
+     return *this;
+ }
+
+ // Return alpha * (*this) as a new matrix.
+ template<class ElemType>
+ CPUMatrix<ElemType> CPUMatrix<ElemType>::operator* (ElemType alpha) const
+ {
+     CPUMatrix<ElemType> c(GetNumRows(), GetNumCols());
+     Scale(alpha, *this, c);
+     return c;
+ }
+
+ template
+ CPUMatrix& CPUMatrix::AssignProductOf(const ElemType alpha, const CPUMatrix