diff --git a/Math/Math/CPUMatrix.cpp b/Math/Math/CPUMatrix.cpp
new file mode 100644
index 000000000000..35bfa8bb0e61
--- /dev/null
+++ b/Math/Math/CPUMatrix.cpp
@@ -0,0 +1,5473 @@
+//
+// <copyright file="CPUMatrix.cpp" company="Microsoft">
+//     Copyright (c) Microsoft Corporation. All rights reserved.
+// </copyright>
+//
+// CPUMatrix.cpp : full implementation of all matrix functions on the CPU side
+//
+
+#include "stdafx.h"
+#include "Basics.h"
+#include "fileutil.h"
+
+#include <assert.h>
+#include <stdexcept>
+#include <omp.h>
+#include <math.h>
+#include "CPUMatrix.h"
+#include <random>
+#include <chrono>
+#include <exception>
+#include <thread>
+#include <iostream>
+#include <algorithm>
+#ifdef _WIN32
+#include <Windows.h>
+#else
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+#include <cfloat>
+#endif
+
+#ifdef LEAKDETECT
+#include <vld.h>
+#endif
+
+#pragma warning (disable: 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
+#pragma warning (disable: 4702) // unreachable code; triggered for unknown reasons
+
+#ifndef USE_MKL
+// use ACML as default.
+// Download ACML 5.3.1 (e.g., acml5.3.1-ifort64.exe) or above
+// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
+// Install the ifort64_mp variant (compiled with the Intel compiler) of the library.
+// Set the environment variable ACML_PATH to C:\AMD\acml5.3.1\ifort64_mp, or to the folder
+// where you installed ACML, so the include file and link library can be found.
+#include <acml.h>  // requires ACML 5.3.1 and above
+#else
+// requires MKL 10.0 and above
+#include <mkl.h>
+#endif
+
+#ifndef USE_MKL  // MKL has one additional parameter for different matrix order
+#define BLAS_COLMAJOR
+#else
+#define BLAS_COLMAJOR (int)MatrixOrder::ColMajor,
+#endif
+
+#define SWAP(a,b) {(a) ^= (b); (b) ^= (a); (a) ^= (b);}
+#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0-based indexing
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+#pragma region Helpful Enum Definitions
+    enum class MatrixOrder
+    {
+        RowMajor = 101,  // row-major arrays
+        ColMajor = 102   // column-major arrays
+    };
+
+    enum class MatrixTranspose : char
+    {
+        NoTrans = 'N',   // trans='N'
+        Trans = 'T',     // trans='T'
+        ConjTrans = 'C'  // trans='C'
+    };
+
+    enum class SymMatrixType : char
+    {
+        Up = 'U',           // symmetric matrix is stored in the upper part
+        Low = 'L',          // symmetric matrix is stored in the lower part
+        Full = 'F',         // fully populated
+        NotSymmetric = 'N'  // not a symmetric matrix
+    };
+
+    enum class MatrixOpSide : char
+    {
+        Left = 'L',   // left multiply
+        Right = 'R',  // right multiply
+    };
+#pragma endregion Helpful Enum Definitions
+
+#pragma region Constructors and Destructor
+
+    // should only be used by constructors.
+    template <class ElemType>
+    void CPUMatrix<ElemType>::ZeroInit()
+    {
+        m_computeDevice = CPUDEVICE;
+        m_pArray = nullptr;
+        m_numRows = 0;
+        m_numCols = 0;
+        m_elemSizeAllocated = 0;
+        m_matrixName = NULL;
+        m_format = matrixFormatDense;
+        m_externalBuffer = false;
+    }
+
+    template <class ElemType>
+    CPUMatrix<ElemType>::CPUMatrix()
+    {
+        ZeroInit();
+    }
+
+    // matrixName is used to verify that the correct matrix is read.
+    template <class ElemType>
+    CPUMatrix<ElemType>::CPUMatrix(FILE* f, const char * matrixName)
+    {
+        ZeroInit();
+        ReadFromFile(f, matrixName);
+    }
+
+    // helper to allocate an array of ElemType
+    // Use this instead of new[] to get NaN initialization for debugging.
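+    // Hedged illustration (editorial note, not part of the original patch): if the
+    // debug NaN fill in the #if 0 block below were enabled, reading an element that
+    // was never written would poison every value derived from it, so a missed
+    // initialization surfaces immediately instead of silently reading zeros:
+    //
+    //   float* p = NewArray<float>(4);  // debug build: p[0..3] filled with NaN
+    //   float s = p[0] + 1;             // s is NaN -> the bug is visible here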
+    template <class ElemType>
+    static ElemType * NewArray(size_t n)
+    {
+        ElemType * p = new ElemType[n]();
+#if 0//_DEBUG
+        ElemType nan = Matrix<ElemType>::MakeNan(__LINE__);
+        for (size_t i = 0; i < n; i++)
+            p[i] = nan;
+#endif
+        return p;
+    }
+
+    template <class ElemType>
+    CPUMatrix<ElemType>::CPUMatrix(const size_t numRows, const size_t numCols)
+    {
+        ZeroInit();
+
+        m_numRows = numRows;
+        m_numCols = numCols;
+        m_elemSizeAllocated = GetNumElements();
+
+        if (m_elemSizeAllocated != 0)
+            m_pArray = NewArray<ElemType>(m_elemSizeAllocated);
+    }
+
+    template <class ElemType>
+    CPUMatrix<ElemType>::CPUMatrix(const size_t numRows, const size_t numCols, ElemType *pArray, const size_t matrixFlags)
+    {
+        ZeroInit();
+        SetValue(numRows, numCols, pArray, matrixFlags);
+    }
+
+    // copy constructor, deep copy
+    template <class ElemType>
+    CPUMatrix<ElemType>::CPUMatrix(const CPUMatrix<ElemType>& deepCopyFrom)
+    {
+        ZeroInit();
+        if (!deepCopyFrom.IsEmpty())
+            SetValue(deepCopyFrom);
+        SetMatrixName(deepCopyFrom.m_matrixName);
+    }
+
+    // assignment operator, deep copy
+    template <class ElemType>
+    CPUMatrix<ElemType>& CPUMatrix<ElemType>::operator=(const CPUMatrix<ElemType>& deepCopyFrom)
+    {
+        Clear();
+        if (!deepCopyFrom.IsEmpty())
+            SetValue(deepCopyFrom);
+        SetMatrixName(deepCopyFrom.m_matrixName);
+        return *this;
+    }
+
+    // move constructor, shallow copy
+    template <class ElemType>
+    CPUMatrix<ElemType>::CPUMatrix(CPUMatrix<ElemType>&& moveFrom)
+    {
+        m_computeDevice = moveFrom.m_computeDevice;
+        m_numRows = moveFrom.m_numRows;
+        m_numCols = moveFrom.m_numCols;
+        m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
+        m_pArray = moveFrom.m_pArray;  // shallow copy the pointer
+        m_matrixName = moveFrom.m_matrixName;
+        m_format = moveFrom.m_format;
+        m_externalBuffer = moveFrom.m_externalBuffer;
+        // release the pointer from the source object so that the destructor won't release it twice
+        moveFrom.ZeroInit();
+    }
+
+    // move assignment operator, shallow copy
+    template <class ElemType>
+    CPUMatrix<ElemType>& CPUMatrix<ElemType>::operator=(CPUMatrix<ElemType>&& moveFrom)
+    {
+        if (this != &moveFrom)
+        {
+            if (OwnBuffer() && m_pArray != nullptr)
+                delete[] m_pArray;  // always delete the data pointer since we will use the pointer from moveFrom
+
+            m_computeDevice = moveFrom.m_computeDevice;
+            m_numRows = moveFrom.m_numRows;
+            m_numCols = moveFrom.m_numCols;
+            m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
+            m_pArray = moveFrom.m_pArray;
+            m_format = moveFrom.m_format;
+            m_externalBuffer = moveFrom.m_externalBuffer;
+
+            // release the pointer from the source object so that the destructor won't release it twice
+            moveFrom.ZeroInit();
+        }
+        return *this;
+    }
+
+    template <class ElemType>
+    CPUMatrix<ElemType>::~CPUMatrix()
+    {
+        Clear();
+    }
+
+    template <class ElemType>
+    void CPUMatrix<ElemType>::Clear()
+    {
+        if (m_pArray != nullptr && OwnBuffer())
+        {
+            delete [] m_pArray;
+            m_pArray = nullptr;
+            m_elemSizeAllocated = 0;
+        }
+        BaseMatrix<ElemType>::Clear();
+
+        ZeroInit();
+    }
+
+#pragma endregion Constructors and Destructor
+
+#pragma region Basic Operators
+
+    template <class ElemType>
+    CPUMatrix<ElemType> CPUMatrix<ElemType>::ColumnSlice(size_t startColumn, size_t numCols) const
+    {
+        //if (numCols == 0)
+        //    LogicError("The slice cannot have 0 columns.");
+
+        if (startColumn + numCols > m_numCols)
+            InvalidArgument("The slice (%d+%d) is out of range of the source matrix (%d).", (int)startColumn, (int)numCols, (int)m_numCols);
+
+        CPUMatrix<ElemType> slice;
+
+        slice.m_externalBuffer = true;  // memory of a slice is managed externally.
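+        // Illustrative note: because the slice shares the parent's buffer (a
+        // zero-copy view into the column-major storage), writes through the slice
+        // alias the parent. A hedged usage sketch:
+        //
+        //   CPUMatrix<float> A(4, 10);
+        //   CPUMatrix<float> S = A.ColumnSlice(2, 3);  // view of columns 2..4
+        //   S.SetValue(1.0f);                          // also writes A(:, 2:4)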
+ slice.m_numRows = m_numRows; + slice.m_numCols = numCols; + slice.m_elemSizeAllocated = slice.GetNumElements(); + slice.m_pArray = m_pArray + startColumn * m_numRows; + slice.m_format = m_format; + + return slice; + } + + // set this(:, 0:numCols-1) = fromMatrix(:, startColumn : startColumn+numCols-1) + // TODO: why not say *this = ColumnSlice()? + template + CPUMatrix& CPUMatrix::AssignColumnSlice(const CPUMatrix& fromMatrix, size_t startColumn, size_t numCols) + { + //if (numCols == 0) + // LogicError("The slice cannot have 0 columns."); + + if (startColumn + numCols > fromMatrix.m_numCols) + InvalidArgument("The slice (%d+%d) is out of range of the source matrix (%d).", (int)startColumn, (int)numCols, (int)fromMatrix.m_numCols); + + Clear(); + + SetOwnBuffer(false); //memory of a slice is managed externally. + m_numRows = fromMatrix.m_numRows; + m_numCols = numCols; + m_elemSizeAllocated = GetNumElements(); + m_pArray = fromMatrix.m_pArray + startColumn *m_numRows; + + return *this; + } + + // set this(: , startColumn:startColumn+numCols-1)= fromMatrix; + template + CPUMatrix& CPUMatrix::SetColumnSlice(const CPUMatrix& fromMatrix, size_t startColumn, size_t numCols) + { + //if (numCols == 0) + // LogicError("The slice cannot have 0 columns."); + if (startColumn + numCols > m_numCols) + LogicError("The slice is out of range of the destination matrix."); + if (numCols > fromMatrix.GetNumCols()) + InvalidArgument("The slice (%d) is out of range of the source matrix (%d).", (int)numCols, (int)fromMatrix.GetNumCols()); + if (m_numRows != fromMatrix.m_numRows) + LogicError("The number of rows in source and destination matrices do not match"); + + //SetOwnBuffer(false); + memcpy(m_pArray + startColumn*m_numRows, fromMatrix.m_pArray, numCols*m_numRows*sizeof(ElemType)); + + return *this; + } + + template + void CPUMatrix::CopyColumnsStrided(const CPUMatrix& fromMatrix, size_t numCols, size_t srcNumColsStride, size_t destNumColsStride) + { + if ((((numCols - 1) * srcNumColsStride) + 1) > fromMatrix.m_numCols) + LogicError("The numCols to copy and srcNumColsStride specified is out of range of the source matrix."); + if ((((numCols - 1) * destNumColsStride) + 1) > m_numCols) + LogicError("The numCols to copy and srcNumColsStride specified is out of range of the destination matrix."); + if (m_numRows != fromMatrix.m_numRows) + LogicError("The number of rows in source and destination matrices do not match"); + + long n = (long)numCols, m = (long)m_numRows; + + auto& us = *this; + +#pragma omp parallel for + for (long j = 0; j + CPUMatrix& CPUMatrix::AssignToRowSliceValuesOf(const CPUMatrix& a, const size_t startIndex, const size_t numRows) + { + if (a.GetNumRows() != numRows) + LogicError("AddToRowSliceValuesOf: a.GetNumRows() != numRows."); + + if (startIndex + numRows > GetNumRows()) + LogicError("AddToRowSliceValuesOf: startIndex + numRows exceeds GetNumRows()."); + + if (a.GetNumCols() != GetNumCols()) + LogicError("AddToRowSliceValuesOf: columns does not match."); + + long n = (long)a.GetNumCols(), m = (long)numRows; + + auto& us = *this; + +#pragma omp parallel for + for (long j = 0; j + CPUMatrix& CPUMatrix::AssignRowSliceValuesOf(const CPUMatrix& a, const size_t startIndex, const size_t numRows) + { + //if (a.IsEmpty()) + // LogicError("AssignRowSliceValuesOf: input matrix a is empty."); + + if (startIndex + numRows > a.GetNumRows()) + LogicError("AssignRowSliceValuesOf: startIndex + numRows exceeds a.GetNumRows()."); + + Resize(numRows, a.GetNumCols()); + + long n = (long)a.GetNumCols(); 
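+        // The copy below follows the loop idiom used throughout this file: an
+        // OpenMP parallel-for over columns, typically with a four-way unrolled
+        // inner loop over rows. Hedged sketch of the general pattern:
+        //
+        //   #pragma omp parallel for
+        //   for (long j = 0; j < n; j++)
+        //   {
+        //       for (long i = 0; i < (m & ~3); i += 4)  // unrolled by four
+        //           { /* process i, i+1, i+2, i+3 */ }
+        //       for (long i = m & ~3; i < m; i++)       // remaining elements
+        //           { /* process i */ }
+        //   }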
// note: OpenMP requires loop indices to be long, not size_t + long k = (long)a.GetNumRows(); + +#pragma omp parallel for + for (long j=0; j + CPUMatrix& CPUMatrix::AddToRowSliceValuesOf(const CPUMatrix& a, const size_t startIndex, const size_t numRows) + { + if (a.IsEmpty()) + LogicError("AddToRowSliceValuesOf: input matrix a is empty."); + + if (a.GetNumRows() != numRows) + LogicError("AddToRowSliceValuesOf: a.GetNumRows() != numRows."); + + if (startIndex + numRows > GetNumRows()) + LogicError("AddToRowSliceValuesOf: startIndex + numRows exceeds GetNumRows()."); + + if (a.GetNumCols() != GetNumCols()) + LogicError("AddToRowSliceValuesOf: columns does not match."); + + long n=(long)a.GetNumCols(), m=(long)numRows; + + auto& us = *this; + +#pragma omp parallel for + for (long j=0; j + CPUMatrix& CPUMatrix::AddWithRowSliceValuesOf(const CPUMatrix& a, const size_t startIndex, const size_t numRows) + { + if (a.IsEmpty()) + LogicError("AddWithRowSliceValuesOf: input matrix a is empty."); + + if (GetNumRows() != numRows) + LogicError("AddWithRowSliceValuesOf: GetNumRows() != numRows."); + + if (startIndex + numRows > a.GetNumRows()) + LogicError("AddWithRowSliceValuesOf: startIndex + numRows exceeds a.GetNumRows()."); + + if (a.GetNumCols() != GetNumCols()) + LogicError("AddWithRowSliceValuesOf: columns does not match."); + + long n = (long)a.GetNumCols(), m = (long)numRows; + + auto& us = *this; + +#pragma omp parallel for + for (long j = 0; j + CPUMatrix CPUMatrix::Diagonal() const + { + if (m_numRows != m_numCols) + LogicError("Diagonal can be called only for square matrix. (rows=%d, cols=%d)", m_numRows, m_numCols); + + CPUMatrix diag(1, m_numCols); + + auto& us = *this; + +#pragma omp parallel for + for (long i = 0; i < m_numRows; i++) + { + diag(0, (size_t)i) = us(i, i); + } + + return diag; + } + +#if 0 + //stack the columns in inputMatrices (starting from sliceStartCol for sliceNumCols columns) and assign it to [this] object. + template + CPUMatrix& CPUMatrix::AssignRowStackValuesOf(const std::vector*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols) + { + if (sliceNumCols == 0) + LogicError("AssignRowStackValuesOf: sliceNumCols should > 0."); + + size_t totalRows = 0; + size_t* startRowIndeces = new size_t[inputMatrices.size()]; + startRowIndeces[0] = 0; + for (int i = 0; i < inputMatrices.size(); i++) + { + const CPUMatrix& a = *inputMatrices[i]; + if (a.IsEmpty()) + LogicError("AssignRowStackValuesOf: input matrix (%d) is empty.", i); + + if (a.GetNumCols() < sliceStartCol + sliceNumCols) + LogicError("AssignRowStackValuesOf: input matrix (%d) GetNumCols() < sliceStartCol + sliceNumCols.", i); + + totalRows += a.GetNumRows(); + if (iGetNumRows() * sizeof(ElemType)); + } + } + + delete [] startRowIndeces; + + return *this; + } +#endif + + template + void CPUMatrix::MinusOneAt(CPUMatrix& c, const size_t position) + { + if (position < c.GetNumElements()) + c.m_pArray[position] -= 1.0; + else + RuntimeError("MinusOneAt: position is out of CPU matrix size"); + } + + template + CPUMatrix& CPUMatrix::AssignRepeatOf(const CPUMatrix& a, const size_t numRowRepeats, const size_t numColRepeats) + { + if (this == &a) + LogicError("AssignRepeatOf: a is the same as [this]. 
Does not support inplace repeat."); + + if (a.IsEmpty()) + LogicError("AssignRepeatOf: Matrix a is empty."); + + Resize(a.GetNumRows() * numRowRepeats, a.GetNumCols() * numColRepeats); + long n = (long)a.GetNumCols(), m = (long)a.GetNumRows(); + auto& us = *this; + +#pragma omp parallel for + for (long q = 0; q < numColRepeats; q++) + { + for (long p = 0; p < numRowRepeats; p++) + { + long colOffset = q*n; + + for (long j = 0; j < n; j++, colOffset++) + { + long rowOffset = p*m; + + //four-way unrolling + for (long i = 0; i < (m & ~3); i += 4, rowOffset += 4) + { + us(rowOffset, colOffset) = a(i, j); + us(rowOffset + 1, colOffset) = a(i + 1, j); + us(rowOffset + 2, colOffset) = a(i + 2, j); + us(rowOffset + 3, colOffset) = a(i + 3, j); + } + //handle remaining stuffs + for (long i = m & ~3; i < m; i++, rowOffset++) + { + us(rowOffset, colOffset) = a(i, j); + } + } + } + } + + return *this; + } + + template + CPUMatrix& CPUMatrix::AddToRowRepeatValuesOf(const CPUMatrix& a, const size_t numRepeats) + { + if (a.IsEmpty()) + LogicError("AddToRowRepeatValuesOf: input matrix a is empty."); + + if (a.GetNumRows() != GetNumRows() * numRepeats) + LogicError("AddToRowRepeatValuesOf: a.GetNumRows() != GetNumRows() * numRepeats."); + + long n = (long)a.GetNumCols(), m = (long)GetNumRows(); + + auto& us = *this; + +#pragma omp parallel for + for (long j = 0; j + CPUMatrix& CPUMatrix::AssignPositiveAndShiftedNegSample(const CPUMatrix& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber) + { + a; posNumber; negNumber; shiftNumber; + NOT_IMPLEMENTED; + } + + template + CPUMatrix& CPUMatrix::AddFoldedPositiveAndShiftedNegSample(const CPUMatrix& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber) + { + a; posNumber; negNumber; shiftNumber; + NOT_IMPLEMENTED; + } + + template + CPUMatrix CPUMatrix::Transpose() + { + if (IsEmpty()) + LogicError("Transpose: Matrix is empty."); + + CPUMatrix c; + c.AssignTransposeOf(*this); + return c; + } + + template + CPUMatrix& CPUMatrix::AssignTransposeOf (const CPUMatrix& a) + { + if (this == &a) + LogicError("AssignTransposeOf: a is the same as [this]. 
Does not support inplace transpose."); + + if (a.IsEmpty()) + LogicError("AssignTransposeOf: Matrix a is empty."); + + Resize(a.GetNumCols(), a.GetNumRows()); + long n=(long)a.GetNumCols(), m=(long)a.GetNumRows(); + + auto& us = *this; + +#pragma omp parallel for + for (long j=0; j + void CPUMatrix::SetValue(const ElemType v) + { + if (IsEmpty()) + LogicError("SetValue: Matrix is empty."); + bool isFinite = std::numeric_limits::is_integer || std::isfinite((double)v); + if (isFinite && v == 0) + { + memset(m_pArray, 0, sizeof(ElemType) * GetNumElements()); + } + else + { + long m=(long)GetNumElements(); +#pragma omp parallel for + //four-way unrolling + for (long i=0; i<(m & ~3); i+=4) + { + m_pArray[i] = v; + m_pArray[i+1] = v; + m_pArray[i+2] = v; + m_pArray[i+3] = v; + } + //handle remaining stuffs + for (long i=m & ~3; i + void CPUMatrix::MaskColumnsValue(const CPUMatrix& columnsMask, ElemType val) + { + if (GetNumCols() != columnsMask.GetNumCols()) + RuntimeError("Matrix and column mask must have equal number of columns"); + + auto& us = *this; + long n = (long)GetNumCols(), m = (long)GetNumRows(); +#pragma omp parallel for + for (long j = 0; j + void CPUMatrix::SetColumn(const ElemType* colPointer, size_t j) + { + if (IsEmpty()) + LogicError("SetColumn: Matrix is empty."); + if (colPointer==NULL) + return; + + auto& us = *this; + long m=(long)GetNumRows(); +#pragma omp parallel for + //four-way unrolling + for (long i=0; i<(m & ~3); i+=4) + { + us(i,j) = colPointer[i]; + us(i+1,j) = colPointer[i+1]; + us(i+2,j) = colPointer[i+2]; + us(i+3,j) = colPointer[i+3]; + } + //handle remaining stuffs + for (long i=m & ~3; i + void CPUMatrix::SetColumn(const ElemType val, size_t j) + { + if (IsEmpty()) + LogicError("SetColumn: Matrix is empty."); + + auto& us = *this; + long m=(long)GetNumRows(); +#pragma omp parallel for + //four-way unrolling + for (long i=0; i<(m & ~3); i+=4) + { + us(i,j) = val; + us(i+1,j) = val; + us(i+2,j) = val; + us(i+3,j) = val; + } + //handle remaining stuffs + for (long i=m & ~3; i + void CPUMatrix::SetColumn(const CPUMatrix& valMat, size_t j) + { + if (IsEmpty()) + LogicError("SetColumn: Matrix is empty."); + assert(valMat.GetNumRows() == GetNumRows() && valMat.GetNumCols() == 1) ; + + auto& us = *this; + long m=(long)GetNumRows(); +#pragma omp parallel for + //four-way unrolling + for (long i=0; i<(m & ~3); i+=4) + { + us(i,j) = valMat(i,0); + us(i+1,j) = valMat(i+1,0); + us(i+2,j) = valMat(i+2,0); + us(i+3,j) = valMat(i+3,0); + } + //handle remaining stuffs + for (long i=m & ~3; i + void CPUMatrix::SetValue(const CPUMatrix& deepCopyFrom) + { + if (this == &deepCopyFrom) + return; + + Resize(deepCopyFrom.GetNumRows(), deepCopyFrom.GetNumCols()); + memcpy(m_pArray, deepCopyFrom.m_pArray, deepCopyFrom.GetNumElements() * sizeof(ElemType)); + } + + template + void CPUMatrix::SetValue(const size_t numRows, const size_t numCols, ElemType *pArray, const size_t matrixFlags) + { + if (pArray == nullptr) + InvalidArgument("Invalid pArray."); + + m_format = matrixFormatDense; + m_computeDevice = CPUDEVICE; + + // if it's externally managed, then populate the structure + if (matrixFlags&matrixFlagDontOwnBuffer) + { + // free previous array allocation if any before overwriting + if (m_pArray != nullptr) + delete [] m_pArray; + + m_pArray = pArray; + m_numRows = numRows; + m_numCols = numCols; + m_elemSizeAllocated = GetNumElements(); + m_externalBuffer = true; + } + else + { + Resize(numRows, numCols); + + if (IsEmpty()) + { + InvalidArgument("NumRows or NumCols is 0. 
Nothing to copy"); + } + else + { + if (!(matrixFlags&matrixFormatRowMajor)) //compatible to internal structure + { + memcpy(m_pArray, pArray, GetNumElements()*sizeof(ElemType)); + } + else //need to transpose + { + auto& us = *this; + if (sizeof(ElemType) == sizeof(double)) + { + #pragma omp parallel for + foreach_column(j, us) + { + #ifndef USE_MKL + dcopy((int)numRows, reinterpret_cast (pArray+j), (int)numCols, reinterpret_cast (m_pArray + LocateColumn(j)), 1); + #else + cblas_dcopy ((int)numRows, reinterpret_cast (pArray+j), (int)numCols, reinterpret_cast (m_pArray + LocateColumn(j)), 1); + #endif + } + } + else + { + #pragma omp parallel for + foreach_column(j, us) + { + { + #pragma warning (suppress: 4244) + #ifndef USE_MKL + scopy((int)numRows, reinterpret_cast (pArray+j), (int)numCols, reinterpret_cast (m_pArray + LocateColumn(j)), 1); + #else + cblas_scopy ((int)numRows, reinterpret_cast (pArray+j), (int)numCols, reinterpret_cast (m_pArray + LocateColumn(j)), 1); + #endif + } + } + } + } + } + } + } + + template + void CPUMatrix::SetDiagonalValue(const ElemType v) + { + if (IsEmpty()) + LogicError("SetDiagonalValue: Matrix is empty."); + + if (GetNumRows() != GetNumCols()) + LogicError("SetDiagonalValue: NumRows and NumCols do not agree."); + + auto& us = *this; + long m=(long)GetNumRows(); +#pragma omp parallel for + //four-way unrolling + for (long i=0; i<(m & ~3); i+=4) + { + us(i,i) = v; + us(i+1,i+1) = v; + us(i+2,i+2) = v; + us(i+3,i+3) = v; + } + //handle remaining stuffs + for (long i=m & ~3; i + void CPUMatrix::SetDiagonalValue(const CPUMatrix& vector) + { + if (IsEmpty() || vector.IsEmpty()) + LogicError("SetDiagonalValue: Matrix is empty."); + + if (GetNumRows() != GetNumCols()) + LogicError("SetDiagonalValue: NumRows and NumCols do not agree."); + + if (vector.GetNumRows() != 1 && vector.GetNumCols() != 1) + LogicError("SetDiagonalValue: input vector must be a vector."); + + if (vector.GetNumElements() == 1) //reduce to simple form + SetDiagonalValue(vector(0,0)); + else if (vector.GetNumRows() != GetNumRows()) + LogicError("SetDiagonalValue: input vector's dimension does not agree with [this]."); + else + { + auto& us = *this; + + long m=(long)GetNumRows(); + if (vector.GetNumRows() == 1) //row vector + { +#pragma omp parallel for + //four-way unrolling + for (long i=0; i<(m & ~3); i+=4) + { + us(i,i) = vector(0, i); + us(i+1,i+1) = vector(0, i+1); + us(i+2,i+2) = vector(0, i+2); + us(i+3,i+3) = vector(0, i+3); + } + //handle remaining stuffs + for (long i=m & ~3; i + void CPUMatrix::SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed) + { + if (IsEmpty()) + LogicError("SetUniformRandomValue: Matrix is empty."); + +#ifdef _MSC_VER // TODO: check if available under GCC/Linux + std::ranlux64_base_01 generator; + generator.seed(seed==USE_TIME_BASED_SEED ? 
(unsigned long) time(NULL) : seed); +#else + std::default_random_engine generator (seed); +#endif + std::uniform_real_distribution r(low, high); + + long m=(long)GetNumElements(); + //four-way unrolling + for (long i=0; i<(m & ~3); i+=4) + { + m_pArray[i] = r(generator); + m_pArray[i+1] = r(generator); + m_pArray[i+2] = r(generator); + m_pArray[i+3] = r(generator); + } + //handle remaining stuffs + for (long i=m & ~3; i + void CPUMatrix::SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed) + { + if (sigma <= 0) + InvalidArgument("SetUniformRandomValue: sigma must be a positive value."); + + if (IsEmpty()) + LogicError("SetUniformRandomValue: Matrix is empty."); + + auto& us = *this; +#ifdef _MSC_VER // TODO: check if available under GCC/Linux + std::ranlux64_base_01 generator; + generator.seed(seed==USE_TIME_BASED_SEED ? (unsigned long) time(NULL) : seed); +#else + std::default_random_engine generator (seed); +#endif + std::normal_distribution r(mean, sigma); + //#pragma omp parallel for //is it thread safe? + foreach_coord(i,j,us) + { + us(i,j) = r(generator); + } + } + + template + void CPUMatrix::AddGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed) + { + if (sigma <= 0) + InvalidArgument("SetUniformRandomValue: sigma must be a positive value."); + + if (IsEmpty()) + LogicError("SetUniformRandomValue: Matrix is empty."); + + auto& us = *this; +#ifdef _MSC_VER // TODO: check if available under GCC/Linux + std::ranlux64_base_01 generator; + generator.seed(seed==USE_TIME_BASED_SEED ? (unsigned long) time(NULL) : seed); +#else + std::default_random_engine generator (seed); +#endif + std::normal_distribution r(mean, sigma); + + long m=(long)GetNumRows(), n=(long)GetNumCols(); + for (long j=0; j + void CPUMatrix::SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, unsigned long seed) + { + if (IsEmpty()) + LogicError("SetUniformRandomValue: Matrix is empty."); + + auto& us = *this; +#ifdef _MSC_VER // TODO: check if available under GCC/Linux + std::ranlux64_base_01 generator; + generator.seed(seed==USE_TIME_BASED_SEED ? (unsigned long) time(NULL) : seed); +#else + std::default_random_engine generator (seed==USE_TIME_BASED_SEED ? (unsigned long) time(NULL) : seed); +#endif + std::uniform_real_distribution r(0, 1); + + long m=(long)GetNumRows(), n=(long)GetNumCols(); + ElemType v; + for (long j=0; j + ElemType CPUMatrix::Adagrad(CPUMatrix& gradients, const bool needAveMultiplier) + { + ElemType aveMultiplier = 0; + + if (IsEmpty() || gradients.GetNumCols() != GetNumCols() || gradients.GetNumRows() != GetNumRows()) + { + Resize(gradients.GetNumRows(), gradients.GetNumCols()); + SetValue(0.0); + } + + assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == gradients.GetNumCols()); + + ElemType *a=m_pArray, *d_v=gradients.m_pArray; + size_t n = GetNumElements(); + + const ElemType floor = 1e-16f; + ElemType a0, a1, a2, a3; + + //disable omp here because aveMultiper needs to be added atomically. however, it seems the result is incorrect even if rmp atomic and amp critical are used. 
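+        // Adagrad recap (hedged summary of the loop below): each parameter keeps a
+        // running sum of squared gradients, and every incoming gradient is rescaled
+        // by the root of that sum; 'floor' (1e-16) guards the division:
+        //
+        //   a[i] += d_v[i] * d_v[i];                  // accumulate squared gradient
+        //   d_v[i] /= sqrt(a[i] + floor);             // per-parameter learning rate
+        //   aveMultiplier += 1 / sqrt(a[i] + floor);  // only if needAveMultiplier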
+//#pragma omp parallel for + for (long i = 0; i<(n & ~3); i += 4) //four-way unrolling + { + a[i] += d_v[i] * d_v[i]; + a[i+1] += d_v[i+1] * d_v[i+1]; + a[i+2] += d_v[i+2] * d_v[i+2]; + a[i+3] += d_v[i+3] * d_v[i+3]; + + a0 = sqrt(a[i] + floor); + a1 = sqrt(a[i + 1] + floor); + a2 = sqrt(a[i + 2] + floor); + a3 = sqrt(a[i + 3] + floor); + + d_v[i] /= a0; + d_v[i+1] /= a1; + d_v[i+2] /= a2; + d_v[i+3] /= a3; + + if (needAveMultiplier) + { + aveMultiplier += 1 / a0 + 1 / a1 + 1 / a2 + 1 / a3; + } + } + + // get the last few elements if any + for (long i = n & ~3; i 0) + return aveMultiplier / n; + else + return 1; + } + + template + void CPUMatrix::FSAdagrad(CPUMatrix& gradients, + CPUMatrix& functionValues, + ElemType learnRatePerSample, + ElemType momentum, + ElemType adaWeight, + ElemType adaMul) + { + size_t numColsNeeded = 2 * gradients.GetNumCols(); + + if (IsEmpty() || (GetNumCols() < numColsNeeded)) + { + Resize(gradients.GetNumRows(), numColsNeeded); + SetValue(0.0); + } + + assert((GetNumRows() == gradients.GetNumRows()) && (GetNumCols() == numColsNeeded)); + + size_t n = gradients.GetNumElements(); + ElemType* grad = gradients.m_pArray; + ElemType* smoothAda = m_pArray; + ElemType* smoothMom = m_pArray + n; + ElemType* val = functionValues.m_pArray; +#pragma omp parallel for + // TODO: Unroll 4-times for better performance leveraging vectorization + for (long i = 0; i < n; i++) + { + ElemType g = grad[i]; + ElemType adaSqr = adaWeight * smoothAda[i] + (1.0f - adaWeight) * g * g; + smoothAda[i] = adaSqr; + if (adaSqr != 0.0f) + { + ElemType ada = sqrt(adaSqr); + ElemType w = adaMul * ((ElemType)1.0 / ada); + + if (w > 10.0f) + w = 10.0f; + g *= w; + } + + if (momentum > 0.0f) + { + g = momentum * smoothMom[i] + (1.0f - momentum) * g; + smoothMom[i] = g; + } + + g *= learnRatePerSample; + val[i] -= g; + } + } + + template + ElemType CPUMatrix::RmsProp(CPUMatrix& gradients, + ElemType RMS_GAMMA, + ElemType RMS_WGT_INC, + ElemType RMS_WGT_MAX, + ElemType RMS_WGT_DEC, + ElemType RMS_WGT_MIN, + const bool needAveMultiplier + ) + { + const ElemType floor = 1e-6f; + + size_t n = gradients.GetNumElements(); + ElemType *curr_grad=gradients.m_pArray; + + if (IsEmpty() || GetNumCols() < gradients.GetNumCols() * 3) + { + Resize(gradients.GetNumRows(), gradients.GetNumCols() * 3); + SetValue(0.0); + + ElemType *avars=m_pArray; // accumulated variances for RMS scaling + ElemType *steps=m_pArray+2*n; // current step size + + // initialize moving average of gradient-squared + for( long i = 0; i < n; i++ ) + avars[i] = curr_grad[i]*curr_grad[i]; + + // initialize starting step size + for( long i = 0; i < n; i++ ) + steps[i] = ElemType(0.02); + } + + ElemType *avars=m_pArray; // accumulated variances for RMS scaling + ElemType *signs=m_pArray+n; // sign of previous gradient + ElemType *steps=m_pArray+2*n; // current step size + + assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == gradients.GetNumCols() * 3); + + ElemType ONE_MINUS_GAMMA = ElemType(1.0) - RMS_GAMMA; + //int upd[] = { + // 2,2,0, + // 2,2,0, + // 1,1,1, + // 2,2,0, + // 1,2,1, + // 0,2,2, + // 1,1,1, + // 0,2,2, + // 0,2,2, + //}; + + // for (long i=0; ineg, 1->zero, 2->pos + // const int grad_sign = 1 + (ElemType(0) < curr_grad[i]) - (curr_grad[i] < ElemType(0)); + + // // signs[i] contains three consecutive grad_sign + // signs[i] = 3*(int(signs[i]) % 9) + grad_sign; + + // switch(upd[int(signs[i])]) + // { + // case 0: + // steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN); + // break; + // case 2: + // steps[i] = 
min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX); + // break; + // } + // curr_grad[i] *= steps[i] / sqrt(avars[i] + floor); + // } + + ElemType aveMultiplier = 0, a; + for (long i=0; i 0 ) + steps[i] = min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX); + else + steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN); + + a = steps[i] / sqrt(avars[i] + floor); + curr_grad[i] *= a; + signs[i] = (ElemType)grad_sign; + + if (needAveMultiplier) + aveMultiplier += a; + } + + if (needAveMultiplier) + return aveMultiplier / n; + else + return 1; + } + + template + void CPUMatrix::Reshape(const size_t numRows, const size_t numCols) + { + assert (numRows*numCols == GetNumElements()); + if (numRows*numCols != GetNumElements()) + InvalidArgument("Reshape: Total number of elements does not match."); + + m_numRows = numRows; + m_numCols = numCols; + } + + // Resize() -- change matrix size + // This function is cheap if the matrix size does not change. + // Current content is not preserved. + // BUGBUG: There is code that relies on zero initialization (without, we get subtle variations of output). That is wrong--we should initialize to QNaN and see where it fails. + // If growOnly is true, resize will not reallocate memory if the current memory is large enough (i.e., will not shrink). + // If this object does not own its memory then new memory cannot be allocated (one can still shrink and/or reshape). + template + void CPUMatrix::Resize(const size_t numRows, const size_t numCols, bool growOnly /*=true*/) + { + if (m_numRows == numRows && m_numCols == numCols) + return; + + size_t numElements = numRows * numCols; + if (numElements > m_elemSizeAllocated || // grow allocation + (!growOnly && (numElements != m_elemSizeAllocated))) // shrink allocation (not if 'growOnly') + { + // reallocate buffer + ElemType * pArray = nullptr; + if (numElements > 0) + { + if (!OwnBuffer()) + LogicError("Resize: Resizing an matrix you don't own is not supported."); + pArray = NewArray(numElements); + } + // success: update the object + if (OwnBuffer()) + delete[] m_pArray; + else + assert(pArray == nullptr); // (if !OwnBuffer we can still resize to 0) + m_pArray = pArray; + m_elemSizeAllocated = numElements; + } + + // success + m_numRows = numRows; + m_numCols = numCols; + } + + // allocated by the callee but should be deleted by the caller + // TODO: change to use STL vector instead + template + ElemType* CPUMatrix::CopyToArray() const + { + size_t numElements = GetNumElements(); + if (numElements != 0) + { + ElemType* arrayCopyTo = NewArray(numElements); + memcpy(arrayCopyTo, m_pArray, sizeof(ElemType)*numElements); + return arrayCopyTo; + } + else + { + return nullptr; + } + } + + //memory will be allocated by the callee if not enough but need to be deleted by the caller after it's done + //return number of elements copied + template + size_t CPUMatrix::CopyToArray(ElemType*& arrayCopyTo, size_t& currentArraySize) const + { + size_t numElements = GetNumElements(); + + if (numElements > currentArraySize) + { + delete arrayCopyTo; + arrayCopyTo = NewArray(numElements); + currentArraySize = numElements; + } + + if (numElements != 0) + { + memcpy(arrayCopyTo, m_pArray, sizeof(ElemType)*numElements); + } + + return numElements; + } + + template + void CPUMatrix::CopySection(size_t /*numRows*/, size_t /*numCols*/, ElemType* /*dst*/, size_t /*colStride*/) const + { + // REVIEW alexeyk: currently not used by CPU, but implement when possible. 
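+        // Hedged sketch of one possible implementation (untested; assumes 'dst' is
+        // column-major with leading dimension 'colStride', matching the parameter
+        // names in the currently unused signature above):
+        //
+        //   for (size_t j = 0; j < numCols; j++)
+        //       memcpy(dst + j * colStride, m_pArray + LocateColumn(j),
+        //              numRows * sizeof(ElemType));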
+ RuntimeError("Not implemented."); + } + + template + inline size_t CPUMatrix::LocateColumn(const size_t col) const + { + assert(col < m_numCols); + return col * m_numRows; // matrix in column-wise storage + } + + template + inline size_t CPUMatrix::LocateElement (const size_t row, const size_t col) const + { + assert (row < m_numRows); + return LocateColumn(col) + row; // matrix in column-wise storage + } + +#pragma endregion Basic Operators + +#pragma region Member BLAS Functions + + template + CPUMatrix& CPUMatrix::operator+= (ElemType alpha) + { + return AssignSumOf(alpha, *this); + } + + template + CPUMatrix CPUMatrix::operator+ (ElemType alpha) const + { + CPUMatrix c(GetNumRows(), GetNumCols()); + c.AssignSumOf(alpha, *this); + return c; + } + + template + CPUMatrix& CPUMatrix::AssignSumOf(const ElemType alpha, const CPUMatrix& a) + { + if (a.IsEmpty()) + LogicError("AssignSumOf: Matrix a is empty."); + + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + + long m=(long)GetNumRows(), n=(long)GetNumCols(); +#pragma omp parallel for + for (long j=0; j + CPUMatrix& CPUMatrix::operator+= (const CPUMatrix& a) + { + //if (a.GetNumElements() == 1) + // *this += a(0,0); + //else + ScaleAndAdd(1, a, *this); + + return *this; + } + + //if [this] and a have same dimension then OUTPUT=[this]+a + //if a is a column vector, add to all columns of [this] + //if a is a row vector, add to all rows of [this] + template + CPUMatrix CPUMatrix::operator+ (const CPUMatrix& a) const + { + if (GetNumElements() == 1) + { + CPUMatrix c(a); + c += (*this)(0,0); + return c; + } + else if (a.GetNumElements() == 1) + { + CPUMatrix c(*this); + c += a(0,0); + return c; + } + else + { + CPUMatrix c(*this); //this implementation will introduce a copy overhead. but make resue of the code + c += a; + return c; + } + } + + template + CPUMatrix& CPUMatrix::AssignSumOf(const CPUMatrix& a, const CPUMatrix& b) + { + if (a.GetNumElements() == 1) + { + SetValue(b); + (*this) += a; + } + else + { + SetValue(a); + (*this) += b; + } + return *this; + } + + template + CPUMatrix& CPUMatrix::operator-= (ElemType alpha) + { + return AssignDifferenceOf(*this, alpha); + } + + template + CPUMatrix CPUMatrix::operator- (ElemType alpha) const + { + CPUMatrix c(GetNumRows(), GetNumCols()); + c.AssignDifferenceOf(*this, alpha); + return c; + } + + template + CPUMatrix& CPUMatrix::AssignDifferenceOf(const ElemType alpha, const CPUMatrix& a) + { + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + + long m=(long)GetNumRows(), n=(long)GetNumCols(); +#pragma omp parallel for + for (long j=0; j + CPUMatrix& CPUMatrix::AssignDifferenceOf(const CPUMatrix& a, const ElemType alpha) + { + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + + long m=(long)GetNumRows(), n=(long)GetNumCols(); +#pragma omp parallel for + for (long j=0; j + CPUMatrix& CPUMatrix::operator-= (const CPUMatrix& a) + { + ScaleAndAdd(-1, a, *this); + + return *this; + } + + //if [this] and a have same dimension then output=[this]-a + //if a is a column vector, minus it from all columns of [this] + //if a is a row vector, minus it from all rows of [this] + template + CPUMatrix CPUMatrix::operator- (const CPUMatrix& a) const + { + CPUMatrix c(*this); //this implementation will introduce a copy overhead. 
but make resue of the code + c -= a; + return c; + } + + template + CPUMatrix& CPUMatrix::AssignDifferenceOf(const CPUMatrix& a, const CPUMatrix& b) + { + if (this != &a) + { + Resize(a.GetNumRows(), a.GetNumCols()); + SetValue(a); + } + (*this) -= b; + return *this; + } + + template + CPUMatrix& CPUMatrix::operator*= (ElemType alpha) + { + Scale(alpha, *this); + return *this; + } + + template + CPUMatrix CPUMatrix::operator* (ElemType alpha) const + { + CPUMatrix c(GetNumRows(), GetNumCols()); + Scale(alpha, *this, c); + return c; + } + + template + CPUMatrix& CPUMatrix::AssignProductOf(const ElemType alpha, const CPUMatrix& a) + { + Scale(alpha, a, *this); + return *this; + } + + // [this]=a*b + template + CPUMatrix& CPUMatrix::AssignProductOf (const CPUMatrix& a, const bool transposeA, const CPUMatrix& b, const bool transposeB) + { + if (a.GetNumElements() == 1) + { + if (transposeB) + AssignTransposeOf(b); + (*this) *= a(0,0); + } + else if (b.GetNumElements() == 1) + { + if (transposeA) + AssignTransposeOf(a); + (*this) *= b(0,0); + } + else + Multiply(a, transposeA, b, transposeB, *this); + + return *this; + } + + template + CPUMatrix CPUMatrix::operator* (const CPUMatrix& a) const + { + auto& us = *this; + if (GetNumElements() == 1) + { + CPUMatrix c; + c.AssignProductOf(us(0,0), a); + return c; + } + else if (a.GetNumElements() == 1) + { + CPUMatrix c; + c.AssignProductOf(a(0,0), us); + return c; + } + else + { + CPUMatrix c; + Multiply(*this, a, c); + return c; + } + } + + template + CPUMatrix& CPUMatrix::operator/= (ElemType alpha) + { + (*this) *= 1/alpha; + return (*this); + } + + template + CPUMatrix CPUMatrix::operator/ (ElemType alpha) const + { + return ((*this) * (1/alpha)); + } + + //element-wise power + template + CPUMatrix& CPUMatrix::operator^= (ElemType alpha) + { + auto& us = *this; + ElementWisePower(alpha, us, us); + return us; + } + + //element-wise power + template + CPUMatrix CPUMatrix::operator^ (ElemType alpha) const + { + CPUMatrix c(GetNumRows(), GetNumCols()); + ElementWisePower(alpha, *this, c); + return c; + } + + + template + CPUMatrix& CPUMatrix::AssignElementPowerOf(const CPUMatrix& a, const ElemType power) + { + ElementWisePower(power, a, *this); + return *this; + } + + //[this]=[this] .* a (we cannot override operator .* in c++) + template + CPUMatrix& CPUMatrix::ElementMultiplyWith (const CPUMatrix& a) + { + return AssignElementProductOf(*this, a); + } + + //[this]=[this] .* a (we cannot override operator .* in c++) + template + CPUMatrix& CPUMatrix::ElementDivideBy(const CPUMatrix& a) + { + return AssignElementDivisionOf(*this, a); + } + + //[this]=a .* b + template + CPUMatrix& CPUMatrix::AssignElementProductOf (const CPUMatrix& a, const CPUMatrix& b) + { + if (a.IsEmpty() || b.IsEmpty()) + LogicError("AssignElementProductOf: Matrix is empty."); + + assert (a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) + InvalidArgument("AssignElementProductOf: The input matrix dimensions do not match."); + + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + + long m=(long)GetNumRows(), n=(long)GetNumCols(); +#pragma omp parallel for + for (long j=0; j + CPUMatrix& CPUMatrix::AddElementProductOf (const CPUMatrix& a, const CPUMatrix& b) + { + if (a.IsEmpty() || b.IsEmpty()) + LogicError("AddElementProductOf: Matrix is empty."); + + assert (a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); + if (!(a.GetNumRows() == 
b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) + InvalidArgument("AddElementProductOf : The input matrix dimensions do not match."); + + if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == GetNumCols())) + InvalidArgument("AddElementProductOf : The input matrix dimensions do not match [this]."); + + auto& us=*this; + + long m=(long)GetNumRows(), n=(long)GetNumCols(); +#pragma omp parallel for + for (long j=0; j + CPUMatrix& CPUMatrix::AssignElementDivisionOf (const CPUMatrix& a, const CPUMatrix& b) + { + if (a.IsEmpty() || b.IsEmpty()) + LogicError("AssignElementDivisionOf: Matrix is empty."); + + assert (a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) + InvalidArgument("AssignElementDivisionOf : The input matrix dimensions do not match."); + + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + + ElemType smallValue = EPS_IN_INVERSE; + +#pragma omp parallel for + foreach_coord(i,j,us) + { + ElemType v = b(i,j); + if (v >= 0 && v < smallValue) + us(i,j) = a(i,j) / smallValue; + else if (v < 0 && v > -smallValue) + us(i,j) = a(i,j) / (-smallValue); + else + us(i,j) = a(i,j) / v; + } + + return *this; + } + + template + CPUMatrix& CPUMatrix::ColumnElementMultiplyWith(const CPUMatrix& a) + { + if (a.IsEmpty() || IsEmpty()) + LogicError("ColumnElementMultiplyWith: Matrix is empty."); + + assert (a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1); + if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1)) + InvalidArgument("ColumnElementMultiplyWith: The input matrix should be a col vector and match [this]'s rows."); + + auto& us=*this; + + long m=(long)GetNumRows(), n=(long)GetNumCols(); +#pragma omp parallel for + for (long j=0; j + CPUMatrix& CPUMatrix::RowElementMultiplyWith(const CPUMatrix& a) + { + if (a.IsEmpty() || IsEmpty()) + LogicError("RowElementMultiplyWith: Matrix is empty."); + + assert (a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols()); + if (!(a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols())) + InvalidArgument("RowElementMultiplyWith: The input matrix should be a row vector and match [this]'s columns."); + + auto& us=*this; + + long m=(long)GetNumRows(), n=(long)GetNumCols(); +#pragma omp parallel for + for (long j=0; j + CPUMatrix& CPUMatrix::RowElementDivideBy(const CPUMatrix& a) + { + if (a.IsEmpty() || IsEmpty()) + LogicError("RowElementDivideBy: Matrix is empty."); + + assert(a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols()); + if (!(a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols())) + InvalidArgument("RowElementDivideBy: The input matrix should be a row vector and match [this]'s columns."); + + auto& us = *this; + + long m = (long)GetNumRows(), n = (long)GetNumCols(); +#pragma omp parallel for + for (long j = 0; j= 0 && v < EPS_IN_INVERSE) + v = EPS_IN_INVERSE; + else if (v < 0 && v > -EPS_IN_INVERSE) + v = (-EPS_IN_INVERSE); + + //four-way unrolling + for (long i = 0; i<(m & ~3); i += 4) + { + us(i, j) /= v; + us(i + 1, j) /= v; + us(i + 2, j) /= v; + us(i + 3, j) /= v; + } + //handle remaining stuffs + for (long i = m & ~3; i + CPUMatrix& CPUMatrix::ColumnElementDivideBy(const CPUMatrix& a) + { + if (a.IsEmpty() || IsEmpty()) + LogicError("ColumnElementDivideBy: Matrix is empty."); + + assert (a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1); + if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1)) + InvalidArgument("ColumnElementDivideBy: The input matrix should be a col vector and match 
[this]'s rows."); + + auto& us=*this; + + long m=(long)GetNumRows(), n=(long)GetNumCols(); + + ElemType smallValue = EPS_IN_INVERSE; +#pragma omp parallel for + for (long j=0; j= 0 && v < smallValue) + us(i,j) /= smallValue; + else if (v < 0 && v > -smallValue) + us(i,j) /= (-smallValue); + else + us(i,j) /= v; + } + } + + return *this; + } + + + //[this]=1 ./ a + template + CPUMatrix& CPUMatrix::ElementInverse () + { + return AssignElementInverseOf(*this); + } + + template + CPUMatrix& CPUMatrix::AssignElementInverseOf (const CPUMatrix& a) + { + ElemType smallValue = EPS_IN_INVERSE; + + if (a.IsEmpty()) + LogicError("AssignElementInverseOf: Matrix a is empty."); + + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + +#pragma omp parallel for + foreach_coord(i,j,us) + { + if (a(i,j) <0 && a(i,j) > -smallValue) + us(i,j) = 1/(-smallValue); + else if (a(i,j) >=0 && a(i,j) < smallValue) + us(i,j) = 1/smallValue; + else + us(i,j) = 1 / a(i,j); + } + + return *this; + } + + //[this]=sigmoid([this]) element wise + template + CPUMatrix& CPUMatrix::InplaceSigmoid () + { + return AssignSigmoidOf(*this); + } + + template + CPUMatrix& CPUMatrix::AssignSigmoidOf (const CPUMatrix& a) + { + if (a.IsEmpty()) + LogicError("AssignSigmoidOf: Matrix a is empty."); + + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + +#pragma omp parallel for + foreach_coord(i,j,us) + { + if (a(i,j) >= 0) + us(i,j) = 1 / (1+exp(-a(i,j))); + else + { + ElemType v = exp(a(i,j)); + us(i,j) = v / (1+v); + } + } + + return *this; + } + + template + CPUMatrix& CPUMatrix::InplaceLinearRectifierDerivative () + { + return AssignLinearRectifierDerivativeOf(*this); + } + + template + CPUMatrix& CPUMatrix::AssignLinearRectifierDerivativeOf (const CPUMatrix& a) + { + if (a.IsEmpty()) + LogicError("AssignLinearRectifierDerivativeOf: Matrix a is empty."); + + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + + long m=(long)GetNumRows(), n=(long)GetNumCols(); +#pragma omp parallel for + for (long j=0; j 0.0f ? 1.0f : 0.0f; + us(i+1,j) = a(i+1,j) > 0.0f ? 1.0f : 0.0f; + us(i+2,j) = a(i+2,j) > 0.0f ? 1.0f : 0.0f; + us(i+3,j) = a(i+3,j) > 0.0f ? 1.0f : 0.0f; + } + //handle remaining stuffs + for (long i=m & ~3; i 0.0f ? 
1.0f : 0.0f; + } + } + + + return *this; + } + + template + CPUMatrix& CPUMatrix::InplaceSigmoidDerivative () + { + return AssignSigmoidDerivativeOf(*this); + } + + template + CPUMatrix& CPUMatrix::AssignSigmoidDerivativeOf (const CPUMatrix& a) + { + if (a.IsEmpty()) + LogicError("AssignSigmoidDerivativeOf: Matrix a is empty."); + + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + + long m=(long)GetNumRows(), n=(long)GetNumCols(); +#pragma omp parallel for + for (long j=0; j + CPUMatrix& CPUMatrix::InplaceTanh () + { + return AssignTanhOf(*this); + } + + + template + CPUMatrix& CPUMatrix::AssignTanhOf (const CPUMatrix& a) + { + if (a.IsEmpty()) + LogicError("AssignTanhOf: Matrix a is empty."); + + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + + long m=(long)GetNumRows(), n=(long)GetNumCols(); +#pragma omp parallel for + for (long j=0; j + CPUMatrix& CPUMatrix::InplaceLogSoftmax (const bool isColWise) + { + return AssignLogSoftmaxOf(*this, isColWise); + } + + template + CPUMatrix& CPUMatrix::AssignLogSoftmaxOf (const CPUMatrix& a, const bool isColWise) + { + if (a.IsEmpty()) + LogicError("AssignLogSoftmaxOf: Matrix a is empty."); + + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + + if (isColWise) + { +#pragma omp parallel for + foreach_column(j,a) + { + //we need to extract max before applying exp to avoid overflow + ElemType maxV = a(0,j); + foreach_row(i, a) + maxV = max(maxV, a(i,j)); + + ElemType sum = 0; + foreach_row(i, a) + sum += exp(us(i,j) = a(i,j) - maxV); + sum = log(sum); + foreach_row(i, us) + us(i,j) -= sum; + } + + } + else + { +#pragma omp parallel for + foreach_row(i, a) + { + //we need to extract max before applying exp to avoid overflow + ElemType maxV = a(i,0); + foreach_column(j,a) + maxV = max(maxV, a(i,j)); + + ElemType sum = 0; + foreach_column(j,a) + sum += exp(us(i,j) = a(i,j) - maxV); + sum = log(sum); + foreach_column(j,us) + us(i,j) -= sum; + } + + } + + return *this; + } + + //[this]=sqrt([this]) element wise + template + CPUMatrix& CPUMatrix::InplaceSqrt () + { + return AssignSqrtOf(*this); + } + + //to prevent negative values caused by floating operations, we force inputs to be >=0 + //this may, however, hide problems in the caller. 
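+    // Hedged illustration of the clamp described above: a tiny negative input
+    // produced by rounding (e.g. a*a - b*b evaluating to -1e-9 when a == b) would
+    // make sqrt() return NaN; clamping to zero first yields 0 instead:
+    //
+    //   ElemType v = a(i,j);             // may be -1e-9 due to cancellation
+    //   us(i,j) = sqrt(v >= 0 ? v : 0);  // 0, not NaN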
+ template + CPUMatrix& CPUMatrix::AssignSqrtOf (const CPUMatrix& a) + { + if (a.IsEmpty()) + LogicError("AssignSqrtOf: Matrix a is empty."); + + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + + long m=(long)GetNumRows(), n=(long)GetNumCols(); +#pragma omp parallel for + for (long j=0; j + CPUMatrix& CPUMatrix::InplaceExp () + { + return AssignExpOf(*this); + } + + + template + CPUMatrix& CPUMatrix::AssignExpOf (const CPUMatrix& a) + { + if (a.IsEmpty()) + LogicError("AssignExpOf: Matrix a is empty."); + + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + + long m=(long)GetNumRows(), n=(long)GetNumCols(); +#pragma omp parallel for + for (long j=0; j + CPUMatrix& CPUMatrix::InplaceAbs () + { + return AssignAbsOf(*this); + } + + template + CPUMatrix& CPUMatrix::AssignAbsOf (const CPUMatrix& a) + { + if (a.IsEmpty()) + LogicError("AssignAbsOf: Matrix a is empty."); + + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + + long m=(long)GetNumRows(), n=(long)GetNumCols(); +#pragma omp parallel for + for (long j=0; j + CPUMatrix& CPUMatrix::InplaceLog () + { + return AssignLogOf(*this); + } + + //[this]=log([this]) element wise + template + CPUMatrix& CPUMatrix::InplaceLog10 () + { + return AssignLog10Of(*this); + } + + template + CPUMatrix& CPUMatrix::AssignLogOf (const CPUMatrix& a) + { + if (a.IsEmpty()) + LogicError("AssignLogOf: Matrix a is empty."); + + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + +#pragma omp parallel for + foreach_coord(i,j,a) + { + const ElemType v = a(i,j); + if (v < EPS_IN_LOG) + { + us(i,j) = LOG_OF_EPS_IN_LOG; + } + else + us(i,j) = log(v); + } + + return *this; + } + + template + CPUMatrix& CPUMatrix::AssignLog10Of (const CPUMatrix& a) + { + if (a.IsEmpty()) + LogicError("AssignLogOf: Matrix a is empty."); + + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + +#pragma omp parallel for + foreach_coord(i,j,a) + { + const ElemType v = a(i,j); + if (v <= 0) + LogicError("AssignLogOf: Log can only applied to numbers larger than 0."); + else if (v < EPS_IN_LOG) + { + us(i,j) = LOG10_OF_EPS_IN_LOG; + } + else + us(i,j) = log10(v); + } + + + return *this; + } + + //[this]=cos([this]) element wise + template + CPUMatrix& CPUMatrix::InplaceCosine () + { + return AssignCosineOf(*this); + } + + template + CPUMatrix& CPUMatrix::AssignCosineOf (const CPUMatrix& a) + { + if (a.IsEmpty()) + LogicError("AssignCosineOf: Matrix a is empty."); + + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + +#pragma omp parallel for + foreach_coord(i,j,a) + { + const ElemType v = a(i,j); + us(i,j) = cos(v); + } + + return *this; + } + + //[this]=-sin([this]) element wise + template + CPUMatrix& CPUMatrix::InplaceNegativeSine () + { + return AssignNegativeSineOf(*this); + } + + template + CPUMatrix& CPUMatrix::AssignNegativeSineOf (const CPUMatrix& a) + { + if (a.IsEmpty()) + LogicError("AssignCosineOf: Matrix a is empty."); + + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + +#pragma omp parallel for + foreach_coord(i,j,a) + { + const ElemType v = a(i,j); + us(i,j) = -sin(v); + } + + return *this; + } + + + //Threshold truncating: this[i] = max( this[i], threshold ) + template + CPUMatrix& CPUMatrix::InplaceTruncateBottom (const ElemType threshold) + { + if (IsEmpty()) + LogicError("InplaceTruncateBottom: Matrix is empty."); + + auto& us=*this; + + long m=(long)GetNumRows(), 
n=(long)GetNumCols(); +#pragma omp parallel for + for (long j=0; j + CPUMatrix& CPUMatrix::InplaceTruncate (const ElemType threshold) + { + if (IsEmpty()) + LogicError("InplaceTruncate: Matrix is empty."); + + auto& us=*this; + ElemType locThresholdPos = abs(threshold); + ElemType locTHresholdNeg = -locThresholdPos; + + long m=(long)GetNumRows(), n=(long)GetNumCols(); +#pragma omp parallel for + for (long j=0; j locThresholdPos ) + us(i,j) = locThresholdPos ; + else if (us(i,j) < locTHresholdNeg ) + us(i,j) = locTHresholdNeg ; + + if (us(i+1,j) > locThresholdPos ) + us(i+1,j) = locThresholdPos ; + else if (us(i+1,j) < locTHresholdNeg ) + us(i+1,j) = locTHresholdNeg ; + + if (us(i+2,j) > locThresholdPos ) + us(i+2,j) = locThresholdPos ; + else if (us(i+2,j) < locTHresholdNeg ) + us(i+2,j) = locTHresholdNeg ; + + if (us(i+3,j) > locThresholdPos ) + us(i+3,j) = locThresholdPos ; + else if (us(i+3,j) < locTHresholdNeg ) + us(i+3,j) = locTHresholdNeg ; + } + //handle remaining stuffs + for (long i=m & ~3; i locThresholdPos ) + us(i,j) = locThresholdPos ; + else if (us(i,j) < locTHresholdNeg ) + us(i,j) = locTHresholdNeg ; + } + } + + return *this; + } + + //x= x-threshold if x>threshold, x+threshold if x<-threshold, 0 otherwise + template + CPUMatrix& CPUMatrix::InplaceSoftThreshold(const ElemType threshold) + { + if (IsEmpty()) + LogicError("InplaceTruncate: Matrix is empty."); + + long m = (long)GetNumElements(); + +#pragma omp parallel for + for (long i = 0; i<(m & ~3); i += 4) //four-way unrolling + { + if (m_pArray[i] > threshold) + m_pArray[i] -= threshold; + else if (m_pArray[i] < -threshold) + m_pArray[i] += threshold; + else + m_pArray[i] = 0; + + if (m_pArray[i+1] > threshold) + m_pArray[i+1] -= threshold; + else if (m_pArray[i+1] < -threshold) + m_pArray[i+1] += threshold; + else + m_pArray[i+1] = 0; + + if (m_pArray[i+2] > threshold) + m_pArray[i+2] -= threshold; + else if (m_pArray[i+2] < -threshold) + m_pArray[i+2] += threshold; + else + m_pArray[i+2] = 0; + + if (m_pArray[i+3] > threshold) + m_pArray[i+3] -= threshold; + else if (m_pArray[i+3] < -threshold) + m_pArray[i+3] += threshold; + else + m_pArray[i+3] = 0; + } + //handle remaining stuffs + for (long i = m & ~3; i threshold) + m_pArray[i] -= threshold; + else if (m_pArray[i] < -threshold) + m_pArray[i] += threshold; + else + m_pArray[i] = 0; + } + + return *this; + } + + //Threshold truncating: this[i] = max( a[i], threshold ) + template + CPUMatrix& CPUMatrix::AssignTruncateBottomOf (const CPUMatrix& a, const ElemType threshold) + { + if (a.IsEmpty()) + LogicError("AssignTruncateBottomOf: Matrix a is empty."); + + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + +#pragma omp parallel for + foreach_coord(i,j,a) + { + if (a(i,j) < threshold) + us(i,j) = threshold; + else + us(i,j) = a(i,j); + } + + return *this; + } + + //Threshold truncating: this[i] = min( this[i], threshold ) + template + CPUMatrix& CPUMatrix::InplaceTruncateTop (const ElemType threshold) + { + if (IsEmpty()) + LogicError("InplaceTruncateTop: Matrix is empty."); + + auto& us=*this; + +#pragma omp parallel for + foreach_coord(i,j,us) + { + if (us(i,j) > threshold) + us(i,j) = threshold; + } + + return *this; + } + + //Threshold truncating: this[i] = min( a[i], threshold ) + template + CPUMatrix& CPUMatrix::AssignTruncateTopOf (const CPUMatrix& a, const ElemType threshold) + { + if (a.IsEmpty()) + LogicError("AssignTruncateTopOf: Matrix a is empty."); + + auto& us=*this; + if (this != &a) + Resize(a.GetNumRows(), 
a.GetNumCols()); + +#pragma omp parallel for + foreach_coord(i,j,a) + { + if (a(i,j) > threshold) + us(i,j) = threshold; + else + us(i,j) = a(i,j); + } + + return *this; + } + //Threshold truncating: this[i] = 0 if abs(this[i] + CPUMatrix& CPUMatrix::SetToZeroIfAbsLessThan (const ElemType threshold) + { + if (IsEmpty()) + LogicError("SetToZeroIfAbsLessThan: Matrix is empty."); + + auto& us=*this; + +#pragma omp parallel for + foreach_coord(i,j,us) + { + if (abs(us(i,j)) < threshold) + us(i,j) = 0; + } + + return *this; + } + + //sum of all abs(elements) + template + ElemType CPUMatrix::SumOfAbsElements () const + { + if (IsEmpty()) + LogicError("SumOfAbsElements: Matrix is empty."); + + if (sizeof(ElemType) == sizeof(double)) + { +#ifndef USE_MKL + return (ElemType)dasum((int)GetNumElements(), reinterpret_cast (m_pArray), 1); +#else + return (ElemType)cblas_dasum((int)GetNumElements(), reinterpret_cast (m_pArray), 1); +#endif + } + else + { +#pragma warning (suppress: 4244) +#ifndef USE_MKL + return sasum((int)GetNumElements(), reinterpret_cast (m_pArray), 1); +#else + return cblas_sasum ((int)GetNumElements(), reinterpret_cast (m_pArray), 1); +#endif + } + } + + //sum of all elements + template + ElemType CPUMatrix::SumOfElements () const + { + if (IsEmpty()) + LogicError("SumOfElements: Matrix is empty."); + + ElemType sum=0; + long m=(long)GetNumElements(); // note: OpenMP requires loop indices to be long, not size_t + + //four-way unrolling +#pragma omp parallel for reduction(+:sum) + for (long i=0; i<(m & ~3); i+=4) + { + sum += m_pArray[i] + m_pArray[i+1] + m_pArray[i+2] + m_pArray[i+3] ; + } + //handle remaining stuffs + for (long i=m & ~3; i + CPUMatrix& CPUMatrix::AssignSumOfElements(const CPUMatrix& a) + { + if (a.IsEmpty()) + LogicError("AssignSumOfElements: Matrix a is empty."); + + auto& us=*this; + us.Resize(1,1); + us(0,0) =a.SumOfElements(); + + return *this; + } + + template + bool CPUMatrix::IsEqualTo(const CPUMatrix& a, const ElemType threshold /*= 1e-8*/) const + { + return AreEqual(*this, a, threshold); + } + + + template + void CPUMatrix::VectorSum(const CPUMatrix& a, CPUMatrix& c, const bool isColWise) + { + if (a.IsEmpty()) + LogicError("VectorSum: Input matrix a is empty."); + + const int m = (int)a.GetNumRows(); + const int n = (int)a.GetNumCols(); + + assert(m>0 && n>0); //converting from size_t to int may cause overflow + + if (isColWise) //col-wise + { + c.Resize(1, n); + +#pragma omp parallel for + foreach_column(j, a) + { + ElemType v = 0; + foreach_row(i, a) + { +#pragma omp atomic + v += a(i, j); + } + c(0, j) = v; + } + } + else + { + c.Resize(m, 1); + +#pragma omp parallel for + foreach_row(i, a) + { + ElemType v = 0; + foreach_column(j, a) + { +#pragma omp atomic + v += a(i, j); + } + c(i, 0) = v; + } + } + } + + template + void CPUMatrix::VectorNorm1(CPUMatrix& c, const bool isColWise) const + { + if (IsEmpty()) + LogicError("VectorNorm1: Matrix is empty."); + + auto& us=*this; + + const int m = (int)us.GetNumRows(); + const int n = (int)us.GetNumCols(); + + assert (m>0 && n>0); //converting from size_t to int may cause overflow + + if (isColWise) //col-wise + { + c.Resize(1, n); + +#pragma omp parallel for + foreach_column(j,us) + { + ElemType v = 0; + foreach_row(i,us) + { +#pragma omp atomic + v += abs(us(i,j)); + } + c(0,j) = v; + } + + + } + else + { + c.Resize(m, 1); + +#pragma omp parallel for + foreach_row(i,us) + { + ElemType v = 0; + foreach_column(j,us) + { +#pragma omp atomic + v += abs(us(i,j)); + } + c(i,0) = v; + } + + + } + } + + 
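+    // Hedged usage sketch: AssignVectorNorm1Of below simply forwards to
+    // VectorNorm1 on the source matrix, so the two calls produce the same result:
+    //
+    //   CPUMatrix<float> X(4, 3), norms;
+    //   X.VectorNorm1(norms, true /*isColWise*/);  // norms: 1 x 3, sum_i |X(i,j)|
+    //   norms.AssignVectorNorm1Of(X, true);        // equivalent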
template + CPUMatrix& CPUMatrix::AssignVectorNorm1Of(CPUMatrix& a, const bool isColWise) + { + a.VectorNorm1(*this, isColWise); + return *this; + } + + template + void CPUMatrix::VectorNorm2(CPUMatrix& c, const bool isColWise) const + { + if (IsEmpty()) + LogicError("VectorNorm2: Matrix is empty."); + + auto& us=*this; + + const int m = (int)us.GetNumRows(); + const int n = (int)us.GetNumCols(); + + assert (m>0 && n>0); //converting from size_t to int may cause overflow + + if (isColWise) //col-wise + { + c.Resize(1, n); + + if (sizeof(ElemType) == sizeof(double)) + { +#pragma omp parallel for + foreach_column(j,c) + { +#ifndef USE_MKL + c(0,j) = (ElemType) dnrm2(m, reinterpret_cast (us.m_pArray+us.LocateColumn(j)), 1); +#else + c(0,j) = (ElemType) cblas_dnrm2 (m, reinterpret_cast (us.m_pArray+us.LocateColumn(j)), 1); +#endif + } + } + else + { +#pragma omp parallel for + foreach_column(j,c) + { +#pragma warning (suppress: 4244) +#ifndef USE_MKL + c(0,j) = snrm2(m, reinterpret_cast (us.m_pArray+us.LocateColumn(j)), 1); +#else + c(0,j) = cblas_snrm2 (m, reinterpret_cast (us.m_pArray+us.LocateColumn(j)), 1); +#endif + } + } + } + else + { + c.Resize(m, 1); + + if (sizeof(ElemType) == sizeof(double)) + { +#pragma omp parallel for + foreach_row(i,c) + { +#ifndef USE_MKL + c(i,0) = dnrm2(n, reinterpret_cast (us.m_pArray+i), m); +#else + c(i,0) = cblas_dnrm2 (n, reinterpret_cast (us.m_pArray+i), m); +#endif + } + } + else + { +#pragma omp parallel for + foreach_row(i,c) + { +#pragma warning (suppress: 4244) +#ifndef USE_MKL + c(i,0) = snrm2(n, reinterpret_cast (us.m_pArray+i), m); +#else + c(i,0) = cblas_snrm2 (n, reinterpret_cast (us.m_pArray+i), m); +#endif + } + + } + } + } + + template + CPUMatrix& CPUMatrix::AssignVectorNorm2Of(CPUMatrix& a, const bool isColWise) + { + a.VectorNorm2(*this, isColWise); + return *this; + } + + template + void CPUMatrix::VectorNormInf(CPUMatrix& c, const bool isColWise) const + { + if (IsEmpty()) + LogicError("VectorNormInf: Matrix is empty."); + + auto& us=*this; + + const int m = (int)us.GetNumRows(); + const int n = (int)us.GetNumCols(); + + assert (m>0 && n>0); //converting from size_t to int may cause overflow + + if (isColWise) //col-wise + { + c.Resize(1, n); + + //#pragma omp parallel for + foreach_column(j,us) + { + ElemType v = 0; + foreach_row(i,us) + { + v = max( v, abs(us(i,j))); + } + c(0,j) = v; + } + + + } + else + { + c.Resize(m, 1); + + //#pragma omp parallel for + foreach_row(i,us) + { + ElemType v = 0; + foreach_column(j,us) + { + v = max( v, abs(us(i,j))); + } + c(i,0) = v; + } + + } + } + + template + CPUMatrix& CPUMatrix::AssignVectorNormInfOf(CPUMatrix& a, const bool isColWise) + { + a.VectorNormInf(*this, isColWise); + return *this; + } + + template + CPUMatrix& CPUMatrix::AssignInnerProductOf(const CPUMatrix& a, const CPUMatrix& b, const bool isColWise) + { + InnerProduct (a, b, *this,isColWise); + return *this; + } + + //column-wise crossproduct + template + CPUMatrix& CPUMatrix::AssignKhatriRaoProductOf(const CPUMatrix& a, const CPUMatrix& b) + { + if (a.IsEmpty() || b.IsEmpty()) + LogicError("AssignKhatriRaoProductOf: Matrix is empty."); + + long cols = (long) a.GetNumCols(); + assert(cols == b.GetNumCols()); + if (cols != b.GetNumCols()) + InvalidArgument("a.GetNumCols() != b.GetNumCols()"); + + long rowsA = (long) a.GetNumRows(); + long rowsB = (long) b.GetNumRows(); + Resize(rowsA * rowsB, cols); + +#ifdef __INTEL_COMPILER // TODO: check this +#pragma simd statement +#endif +#pragma omp parallel for + for (long k=0; k + 
CPUMatrix& CPUMatrix::AddColumnReshapeProductOf(const CPUMatrix& a, const CPUMatrix& b, const bool transposeAColumn) + { + if (a.IsEmpty() || b.IsEmpty()) + LogicError("AddColumnReshapeProductOf: Matrix is empty."); + + long cols = (long) a.GetNumCols(); + assert(cols == b.GetNumCols()); + if (cols != b.GetNumCols()) + InvalidArgument("AddColumnReshapeProductOf: a.GetNumCols() != b.GetNumCols()"); + + long rowsA = (long) a.GetNumRows(); + long rowsB = (long) b.GetNumRows(); + + if (rowsA % rowsB != 0) + InvalidArgument("AddColumnReshapeProductOf: number of rows in a should be multiples of that in b."); + + long rowsC = rowsA / rowsB; + if (rowsC != GetNumRows() || cols != GetNumCols()) + InvalidArgument("AddColumnReshapeProductOf: This matrix does not have the right size."); + + auto & us = *this; + + if (transposeAColumn) + { + //find nrows and ncols of tbe reshaped a + long nrows = rowsB; + long ncols = rowsC; + +#ifdef __INTEL_COMPILER // TODO: check this +#pragma simd statement +#endif +#pragma omp parallel for + foreach_column(t, a) + { + size_t k=0; + for (size_t j=0; j + CPUMatrix& CPUMatrix::AddWithScaleOf(ElemType alpha, const CPUMatrix& a) + { + ScaleAndAdd(alpha, a, *this); + return *this; + } + + template + ElemType CPUMatrix::FrobeniusNorm() const + { + if (IsEmpty()) + LogicError("FrobeniusNorm: Matrix is empty."); + + ElemType v = 0; + + long m=(long)GetNumElements(); + + //four-way unrolling +#pragma omp parallel for reduction(+:v) + for (long i=0; i<(m & ~3); i+=4) + { + v += m_pArray[i] * m_pArray[i] + m_pArray[i+1] * m_pArray[i+1] + m_pArray[i+2] * m_pArray[i+2] + m_pArray[i+3] * m_pArray[i+3]; + } + //handle remaining stuffs + for (long i=m & ~3; i + CPUMatrix& CPUMatrix::AssignFrobeniusNormOf(const CPUMatrix& a) + { + if (a.IsEmpty()) + LogicError("AssignFrobeniusNormOf: Matrix a is empty."); + + auto& us=*this; + us.Resize(1,1); + us(0,0) = a.FrobeniusNorm(); + + return us; + } + + template + ElemType CPUMatrix::MatrixNormInf() const + { + if (IsEmpty()) + LogicError("MatrixNormInf: Matrix is empty."); + + auto& us=*this; + + ElemType v = 0; +#pragma omp parallel for + foreach_coord(i,j,us) + { +#pragma omp critical + { + v = max( v, abs(us(i,j))); + } + } + return v; + } + + template + ElemType CPUMatrix::MatrixNorm0() const + { + if (IsEmpty()) + LogicError("MatrixNorm0: Matrix is empty."); + + auto& us=*this; + + ElemType v = 0; +#pragma omp parallel for + foreach_coord(i,j,us) + { + if (us(i,j)!=0) + { +#pragma omp critical + { + ++v; + } + } + } + return v; + } + + template + ElemType CPUMatrix::MatrixNorm1() const + { + if (IsEmpty()) + LogicError("MatrixNorm1: Matrix is empty."); + + auto& us=*this; + + ElemType sum = 0; +#pragma omp parallel for reduction(+:sum) + foreach_coord(i,j,us) + { + sum += abs(us(i,j)); + } + return sum; + } + + template + CPUMatrix& CPUMatrix::AssignSignOf(const CPUMatrix& a) + { + if (a.IsEmpty()) + LogicError("AssignSignOf: Matrix a is empty."); + + auto& us = *this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + +#pragma omp parallel for + foreach_column(j, us) + { + foreach_row(i, us) + { + ElemType v = a(i, j); + if (!std::isnan(v)) + us(i, j) = (v == (ElemType)0 ? (ElemType)0 : (v > 0 ? 
(ElemType)1 : (ElemType)(-1))); + else + us(i, j) = v; + } + } + + return us; + } + + template + CPUMatrix& CPUMatrix::AddSignOf(const CPUMatrix& a) + { + if (a.IsEmpty()) + LogicError("AddSignOf: Matrix a is empty."); + + auto& us = *this; + if (this != &a) + Resize(a.GetNumRows(), a.GetNumCols()); + +#pragma omp parallel for + foreach_column(j, us) + { + foreach_row(i, us) + { + ElemType v = a(i, j); + if (!std::isnan(v)) + us(i, j) += (v == (ElemType)0 ? (ElemType)0 : (v > 0 ? (ElemType)1 : (ElemType)(-1))); + else + us(i, j) = v; + } + } + + return us; + } + //I decided to use CPUMatrix& maxIndexes instead of integer vector because the result may be used to do additional calculation + template + void CPUMatrix::VectorMax(CPUMatrix& maxIndexes, CPUMatrix& maxValues, const bool isColWise, int topK) const + { + if (IsEmpty()) + LogicError("VectorMax: Matrix is empty."); + + auto& us=*this; + const int m = (int)GetNumRows(); + const int n = (int)GetNumCols(); + assert(topK <= m); + + assert (m>0 && n>0); //converting from size_t to int may cause overflow + + if (isColWise) //col-wise + { + maxValues.Resize(topK, n); + maxIndexes.Resize(topK, n); + + if (topK == 1) + { +#pragma omp parallel for + for (int j = 0; j < n; j++) + { + ElemType v = us(0, j); + size_t index = 0; + foreach_row(i, us) + { + if (v < us(i, j)) + { + index = i; + v = us(i, j); + } + } + maxValues(0, j) = v; + maxIndexes(0, j) = (ElemType)index; + } + } + else + { + std::vector indices(m); + int i = 0; + std::generate(indices.begin(), indices.end(), [&i] { return i++; }); + + const ElemType* curVal = m_pArray; + ElemType* curIdx = maxIndexes.m_pArray; + ElemType* curMax = maxValues.m_pArray; + for (int icol = 0; icol < n; icol++, curVal += m, curIdx += topK, curMax += topK) + { + // Partial sort, descending order. + std::nth_element(indices.begin(), indices.begin() + topK, indices.end(), + [curVal](const int& a, const int& b) { return curVal[a] > curVal[b]; }); + // REVIEW alexeyk: the following produces warning (see SCL_SECURE_NO_WARNINGS) so use loop instead. 
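+                    // Note (illustrative, not from the original patch): std::nth_element only
+                    // partitions, so after the call indices[0..topK) hold the topK largest scores
+                    // in unspecified order, at O(m) average cost. If callers also needed the values
+                    // sorted, std::partial_sort would deliver that at O(m log topK), e.g.:
+                    //     std::partial_sort(indices.begin(), indices.begin() + topK, indices.end(),
+                    //         [curVal](const int& a, const int& b) { return curVal[a] > curVal[b]; });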
+ //std::transform(indices.begin(), indices.begin() + topK, curIdx, [](const int& a) { return static_cast(a); }); + for (int i = 0; i < topK; i++) + { + curIdx[i] = static_cast(indices[i]); + curMax[i] = curVal[indices[i]]; + } + } + } + } + else + { + if (topK > 1) + RuntimeError("Row-wise TopK max is not supported."); + + maxValues.Resize(m,1); + maxIndexes.Resize(m, 1); + +#pragma omp parallel for + for (int i=0; i + void CPUMatrix::VectorMin(CPUMatrix& minIndexes, CPUMatrix& minValues, const bool isColWise) const + { + if (IsEmpty()) + LogicError("VectorMin: Matrix is empty."); + + auto& us=*this; + const int m = (int)GetNumRows(); + const int n = (int)GetNumCols(); + + assert (m>0 && n>0); //converting from size_t to int may cause overflow + + if (isColWise) //col-wise + { + minValues.Resize(1, n); + minIndexes.Resize(1, n); + +#pragma omp parallel for + for (int j=0; j us(i,j)) + { + index = i; + v = us(i,j); + } + + } + minValues(0,j) = v; + minIndexes(0,j) = (ElemType)index; + } + } + else + { + minValues.Resize(m,1); + minIndexes.Resize(m, 1); + +#pragma omp parallel for + for (int i=0; i us(i,j)) + { + index = j; + v = us(i,j); + } + + } + minValues(i,0) = v; + minIndexes(i,0) = (ElemType)index; + } + } + } + + template + CPUMatrix& CPUMatrix::AssignNumOfDiff(const CPUMatrix& a, const CPUMatrix& b, bool searchInCol) + { + if (a.GetNumCols() != b.GetNumCols()) + throw std::invalid_argument("AssignNumOfDiff: a and b must have the same number of columns."); + if (!searchInCol && a.GetNumRows() != b.GetNumRows()) + throw std::invalid_argument("AssignNumOfDiff: a and b must have the same number of rows."); + + ElemType n = 0; + if (!searchInCol) + { + foreach_coord(i, j, a) + { + n += (a(i, j) != b(i, j)); + } + } + else + { + size_t crow = b.GetNumRows(); + const ElemType* curCol = b.m_pArray; + for (size_t icol = 0; icol < a.GetNumCols(); icol++, curCol += crow) + { + auto res = std::find(curCol, curCol + crow, a(0, icol)); + if (res == curCol + crow) + n++; + } + } + + Resize(1, 1); //result should be one element + (*this)(0, 0) = n; + + return *this; + } + + +#pragma endregion Member BLAS Functions + +#pragma region Other helper Functions + + template + void CPUMatrix::Print(const char* matrixName, size_t rowStart, size_t rowEnd, size_t colStart, size_t colEnd) const + { + if (IsEmpty()) + LogicError("Print: Matrix is empty."); + + if (rowEnd >= GetNumRows() || colEnd >= GetNumCols()) + InvalidArgument("Index out of range."); + + if (matrixName != nullptr) + fprintf (stderr, "\n###### %s (%lu, %lu) ######\n", matrixName, GetNumRows(), GetNumCols()); + else + fprintf (stderr, "\n###### Unnamed Matrix (%lu, %lu) ######\n", GetNumRows(), GetNumCols()); + + fprintf (stderr, "\n------ Print Range (%lu:%lu, %lu:%lu) ------\n", rowStart, rowEnd, colStart, colEnd); + + const auto& us = *this; + for (size_t i = rowStart; i <= rowEnd; i++) + { + for (size_t j = colStart; j <= colEnd; j++) + fprintf(stderr, "%.10f\t", us(i, j)); + fprintf (stderr, "\n"); + } + } + + template + void CPUMatrix::Print(const char* matrixName /*=nullptr*/) const + { + Print(matrixName, 0, GetNumRows()-1, 0, GetNumCols()-1); + } + + // file I/O + //matrixName is used to verify that correct matrix is read. + template + void CPUMatrix::ReadFromFile(FILE*, const char * /*matrixName*/) + { + RuntimeError("not implemented."); + } + + //matrixName is used to verify that correct matrix is read. 
+ template + void CPUMatrix::WriteToFile(FILE*, const char * /*matrixName*/) + { + RuntimeError("not implemented."); + } + + //assume each column is an input sample. Each sample is stored in [channel, row, col] (r00, g00, b00, r01, g01, b01, r10, g10, b10, r11, g11, b11) + template + CPUMatrix& CPUMatrix::AssignPackedConvolutionInput(const CPUMatrix& inputSubBatch, + const size_t inputWidth, const size_t inputHeight, const size_t inputChannels, + const size_t outputWidth, const size_t outputHeight, const size_t /*outputChannels*/, + const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample, + const bool zeroPadding) + { + assert (verticalSubsample <= kernelHeight && horizontalSubsample <= kernelWidth); + + const size_t packedInputRows = kernelWidth * kernelHeight * inputChannels; + const size_t packedInputColsPerSample = outputWidth * outputHeight; //output size per channel + const size_t inputDim = inputWidth*inputHeight*inputChannels; + const size_t smallBatchSize = inputSubBatch.GetNumCols(); + const long inputHeightTimesChannel = (long) (inputHeight * inputChannels); + Resize(packedInputRows, packedInputColsPerSample * smallBatchSize); + if (zeroPadding) + SetValue((ElemType)0); + + const long halfKernelWidth = (long) kernelWidth/2; + const long halfKernelHeight = (long) kernelHeight/2; + + +#pragma omp parallel for //each input element is copied to many places + for (long sample = 0; sample =0 && x1=0 && y1=0; wcol++, posyInKernel -= (long) horizontalSubsample) + { + long packRowBase = (long) (c * kernelWidth * kernelHeight + posyInKernel * kernelHeight); + for (long wrow = x0, posxInKernel = x1; wrow < (long) outputHeight && posxInKernel>=0; wrow++, posxInKernel -= (long) verticalSubsample) + { + const long packRow = packRowBase + posxInKernel; + const long packCol = packColBase + wrow; + (*this)(packRow, packCol) = currentInputValue; + } + packColBase += (long) outputHeight; + } + } + } + + return *this; + } + //assume each column is an input sample. 
Each sample is stored in [channel, row, col] (r00, g00, b00, r01, g01, b01, r10, g10, b10, r11, g11, b11) + template + CPUMatrix& CPUMatrix::UnpackConvolutionInput(CPUMatrix& inputSubBatch, + const size_t inputWidth, const size_t inputHeight, const size_t inputChannels, + const size_t outputWidth, const size_t outputHeight, const size_t /*outputChannels*/, + const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample, + const bool zeroPadding) const + { + assert (verticalSubsample <= kernelHeight && horizontalSubsample <= kernelWidth); + + const size_t packedInputColsPerSample = outputWidth * outputHeight; //output size per channel + const size_t inputDim = inputWidth*inputHeight*inputChannels; + const size_t smallBatchSize = inputSubBatch.GetNumCols(); + const long inputHeightTimesChannel = (long) (inputHeight * inputChannels); + + const long halfKernelWidth = (long) kernelWidth/2; + const long halfKernelHeight = (long) kernelHeight/2; + +#pragma omp parallel for //each input element is copied to many places + for (long sample = 0; sample =0 && x1=0 && y1=0; wcol++, posyInKernel -= (long) horizontalSubsample) + { + long packRowBase = (long) (c * kernelWidth * kernelHeight + posyInKernel * kernelHeight); + for (long wrow = x0, posxInKernel = x1; wrow < (long) outputHeight && posxInKernel>=0; wrow++, posxInKernel -= (long) verticalSubsample) + { + const long packRow = packRowBase + posxInKernel; + const long packCol = packColBase + wrow; + currentInputValue += (*this)(packRow, packCol); + } + packColBase += (long) outputHeight; + } + inputSubBatch(id, sample) = currentInputValue; + } + } + + return inputSubBatch; + } + + //assume each column is an input sample. Each sample is stored in (r00, g00, b00, r01, g01, b01, r10, g10, b10, r11, g11, b11) + template + CPUMatrix& CPUMatrix::AssignMaxPoolingResult(const CPUMatrix& inputBatch, const size_t channels, + const size_t /*inputWidth*/, const size_t inputHeight, const size_t /*inputSizePerSample*/, + const size_t /*outputWidth*/, const size_t outputHeight, const size_t outputSizePerSample, + const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + const long inputHeightTimesChannel = (long) (inputHeight * channels); + const long outputHeightTimesChannel = (long) (outputHeight * channels); + const size_t batchSize = inputBatch.GetNumCols(); + Resize(outputSizePerSample, batchSize); + + // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels) + // IN_ELEM_COLPOS = sample + + // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels) + // OUT_ELEM_COLPOS = sample + +#pragma omp parallel for + for (long sample = 0; sample < (long) batchSize; sample ++) + { + for (long outputIndexWithinSample=0; outputIndexWithinSample + CPUMatrix& CPUMatrix::AddMaxPoolingGradient(const CPUMatrix& outputGradientBatch, const CPUMatrix& inputBatch, const CPUMatrix& outputBatch, + const size_t channels, + const size_t /*inputWidth*/, const size_t inputHeight, const size_t inputSizePerSample, + const size_t outputWidth, const size_t outputHeight, const size_t /*outputSizePerSample*/, + const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + size_t batchSize = inputBatch.GetNumCols(); + const long inputHeightTimesChannel = (long) (inputHeight * channels); + const long outputHeightTimesChannel = (long) 
(outputHeight * channels); + + // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels) + // IN_ELEM_COLPOS = sample + + // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels) + // OUT_ELEM_COLPOS = sample + +#pragma omp parallel for + for (long sample = 0; sample < batchSize; sample ++) + { + for (long inputIndexWithinSample=0; inputIndexWithinSample + CPUMatrix& CPUMatrix::AssignAveragePoolingResult(const CPUMatrix& inputBatch, const size_t channels, + const size_t /*inputWidth*/, const size_t inputHeight, const size_t /*inputSizePerSample*/, + const size_t /*outputWidth*/, const size_t outputHeight, const size_t outputSizePerSample, + const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + const long inputHeightTimesChannel = (long) (inputHeight * channels); + const long outputHeightTimesChannel = (long) (outputHeight * channels); + const size_t batchSize = inputBatch.GetNumCols(); + const size_t windowSize = windowWidth * windowHeight; + Resize(outputSizePerSample, batchSize); + + // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels) + // IN_ELEM_COLPOS = sample + + // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels) + // OUT_ELEM_COLPOS = sample + +#pragma omp parallel for + for (long sample = 0; sample < batchSize; sample ++) + { + for (long outputIndexWithinSample=0; outputIndexWithinSample + CPUMatrix& CPUMatrix::AddAveragePoolingGradient(const CPUMatrix& outputGradientBatch, + const size_t channels, + const size_t /*inputWidth*/, const size_t inputHeight, const size_t inputSizePerSample, + const size_t outputWidth, const size_t outputHeight, const size_t /*outputSizePerSample*/, + const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) + { + size_t batchSize = outputGradientBatch.GetNumCols(); + const long inputHeightTimesChannel = (long)(inputHeight * channels); + const long outputHeightTimesChannel = (long)(outputHeight * channels); + const long windowSize = (long) (windowWidth * windowHeight); + + // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels) + // IN_ELEM_COLPOS = sample + + // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels) + // OUT_ELEM_COLPOS = sample + +#pragma omp parallel for + for (long sample = 0; sample < batchSize; sample ++) + { + for (long inputIndexWithinSample=0; inputIndexWithinSampleMatrix-matrix multiply with col-major matrices (a and b may be transposed): c = alpha * op(a) * op(b) + beta*c + /// Scalar + /// Input matrix + /// Whether matrix a is transposed + /// Input matrix + /// Whether matrix b is transposed + /// Scalar + /// Resulting matrix, user is responsible for allocating this + template + void CPUMatrix::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix& a, const bool transposeA, const CPUMatrix& b, const bool transposeB, + ElemType beta, CPUMatrix& c) + { + if (a.IsEmpty() || b.IsEmpty()) + LogicError("MultiplyAndWeightedAdd: one of the input matrix is empty."); + + int m, n, k, l; + int lda, ldb, ldc; +#ifndef USE_MKL + char transA, transB; +#else + CBLAS_TRANSPOSE mklTransA; + CBLAS_TRANSPOSE mklTransB; +#endif + + if (transposeA) + { + m = (int)a.GetNumCols(); + k = (int)a.GetNumRows(); + lda = k; +#ifndef USE_MKL + transA = (char)MatrixTranspose::Trans; +#else + mklTransA = 
CBLAS_TRANSPOSE::CblasTrans; +#endif + } + else + { + m = (int)a.GetNumRows(); + k = (int)a.GetNumCols(); + lda = m; +#ifndef USE_MKL + transA = (char)MatrixTranspose::NoTrans; +#else + mklTransA = CBLAS_TRANSPOSE::CblasNoTrans; +#endif + } + + if (transposeB) + { + l = (int)b.GetNumCols(); + n = (int)b.GetNumRows(); + ldb = n; +#ifndef USE_MKL + transB = (char)MatrixTranspose::Trans; +#else + mklTransB = CBLAS_TRANSPOSE::CblasTrans; +#endif + } + else + { + l = (int)b.GetNumRows(); + n = (int)b.GetNumCols(); + ldb = l; +#ifndef USE_MKL + transB = (char)MatrixTranspose::NoTrans; +#else + mklTransB = CBLAS_TRANSPOSE::CblasNoTrans; +#endif + } + + assert (m>0 && k>0 && l>0 && n>0); //converting from size_t to int may cause overflow + assert (k == l); + if (k != l) + InvalidArgument("CPUMatrix::MultiplyAndWeightedAdd : The inner dimensions of a and b must match."); + + c.Resize(m,n); + ldc = (int)c.GetNumRows(); + + if (sizeof(ElemType) == sizeof(double)) + { +#ifndef USE_MKL + dgemm(transA, transB, m, n, k, alpha, reinterpret_cast (a.m_pArray), lda, reinterpret_cast (b.m_pArray), ldb, beta, reinterpret_cast (c.m_pArray), ldc); +#else + cblas_dgemm ((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast (a.m_pArray), lda, reinterpret_cast (b.m_pArray), ldb, beta, reinterpret_cast (c.m_pArray), ldc); +#endif + } + else + { +#pragma warning (suppress: 4244) +#ifndef USE_MKL + sgemm(BLAS_COLMAJOR transA, transB, m, n, k, alpha, reinterpret_cast (a.m_pArray), lda, reinterpret_cast (b.m_pArray), ldb, beta, reinterpret_cast (c.m_pArray), ldc); +#else + cblas_sgemm ((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast (a.m_pArray), lda, reinterpret_cast (b.m_pArray), ldb, beta, reinterpret_cast (c.m_pArray), ldc); +#endif + } + } + + /* compute singular value decomposition as + A = U*SIGMA*VT + W is used as temp working memory + */ + template + void CPUMatrix::SVD(const CPUMatrix& A, CPUMatrix& SIGMA, CPUMatrix& U, CPUMatrix& VT, CPUMatrix& W) + { + if (A.IsEmpty()) + LogicError("SVD: input matrix is empty."); + + int info; + int m, n, lda, ldu, ldvt; + m = (int)A.GetNumRows(); + n = (int)A.GetNumCols(); + W.GetNumRows(); //W is used as temp working memory + lda = m; + ldu = m; + ldvt = n; + U.Resize(m, m); + SIGMA.Resize(min(m, n), 1); + VT.Resize(n, n); + + if (sizeof(ElemType) == sizeof(double)) + { +#ifndef USE_MKL + dgesvd('A', 'A', (int)m, (int)n, reinterpret_cast (A.m_pArray), (int)lda, reinterpret_cast (SIGMA.m_pArray), reinterpret_cast (U.m_pArray), (int)ldu, reinterpret_cast (VT.m_pArray), (int)ldvt, &info); +#else + double wkopt; + int lwork = -1; + dgesvd("All", "All", &m, &n, reinterpret_cast (A.m_pArray), &lda, reinterpret_cast (SIGMA.m_pArray), reinterpret_cast (U.m_pArray), &ldu, reinterpret_cast (VT.m_pArray), &ldvt, &wkopt, &lwork, &info); + lwork = (int)wkopt; + W.Resize(lwork, 1); + dgesvd("All", "All", &m, &n, reinterpret_cast (A.m_pArray), &lda, reinterpret_cast (SIGMA.m_pArray), reinterpret_cast (U.m_pArray), &ldu, reinterpret_cast (VT.m_pArray), &ldvt, reinterpret_cast (W.m_pArray), &lwork, &info); +#endif + } + else + { +#ifndef USE_MKL +#pragma warning (suppress: 4244) + sgesvd('A', 'A', (int)m, (int)n, reinterpret_cast (A.m_pArray), (int)lda, reinterpret_cast (SIGMA.m_pArray), reinterpret_cast (U.m_pArray), (int)ldu, reinterpret_cast (VT.m_pArray), (int)ldvt, &info); +#else + float wkopt; + int lwork = -1; + sgesvd("All", "All", &m, &n, reinterpret_cast (A.m_pArray), &lda, reinterpret_cast (SIGMA.m_pArray), 
reinterpret_cast (U.m_pArray), &ldu, reinterpret_cast (VT.m_pArray), &ldvt, &wkopt, &lwork, &info); + lwork = (int)wkopt; + W.Resize(lwork, 1); + sgesvd("All", "All", &m, &n, reinterpret_cast (A.m_pArray), &lda, reinterpret_cast (SIGMA.m_pArray), reinterpret_cast (U.m_pArray), &ldu, reinterpret_cast (VT.m_pArray), &ldvt, reinterpret_cast (W.m_pArray), &lwork, &info); +#endif + } + + if (info > 0) + { + RuntimeError("The algorithm computing SVD failed to converge.\n"); + } + } + + /// Matrix-matrix multiply with col-major matrices (a and b may be transposed): c = op(a) * op(b) + c + /// Input matrix + /// Whether matrix a is transposed + /// Input matrix + /// Whether matrix b is transposed + /// Resulting matrix, user is responsible for allocating this + template + void CPUMatrix::MultiplyAndAdd(const CPUMatrix& a, const bool transposeA, const CPUMatrix& b, const bool transposeB, + CPUMatrix& c) + { + return CPUMatrix::MultiplyAndWeightedAdd(1.0, a, transposeA, b, transposeB, 1.0, c); + } + template + void CPUMatrix::AssignSoftmaxSum(const CPUMatrix& softmax, CPUMatrix& c) + { + ElemType log_likelihood = 0.0; + size_t batch_size = this->GetNumCols(); +#pragma omp parallel for reduction(+:log_likelihood) + for (int instance_id = 0; instance_id < batch_size; instance_id++) + { + int sample = (int)(*this)(0, instance_id); + log_likelihood += softmax(instance_id, sample); + } + c(0, 0) = -log_likelihood; + } + + template + void CPUMatrix::AssignNCEUnnormalizedEval(const CPUMatrix& a, + const CPUMatrix& b, const CPUMatrix& bias, CPUMatrix& c) + //this: samples+probs + // a: hidden + // b: embedding + // tmp: softmax + // c: loglikelihood + { + ElemType log_likelihood = 0.0; + size_t batch_size = this->GetNumCols(); +#pragma omp parallel for reduction(+:log_likelihood) + for (int instance_id = 0; instance_id < batch_size; instance_id++) + { + int sample = -(int)(*this)(0, instance_id); + ElemType score = bias(sample, 0); + for (int dim = 0; dim < b.GetNumRows(); dim++) + score += b(dim, sample)* a(dim, instance_id); + log_likelihood += score; + } + c(0, 0) = -log_likelihood; + } + + //samples+prob gradient hidden embedding embedding/hidden + //a.m_CPUMatrix->AssignNCEDerivative(*tmp.m_CPUMatrix, *a.m_CPUMatrix, *b.m_CPUMatrix, inputIndex, *c.m_CPUMatrix); + template + CPUMatrix& CPUMatrix::AssignNCEDerivative(const CPUMatrix& tmp, const CPUMatrix& a, const CPUMatrix& b, size_t inputIndex, CPUMatrix& c) + { + size_t sample_size = this->GetNumRows() / 2; + size_t batch_size = this->GetNumCols(); + if (inputIndex == 1) + { +#pragma omp parallel for + for (int instance_id = 0; instance_id < batch_size; instance_id++) + for (int sample_id = 0; sample_id < sample_size; sample_id++) + { + int sample = (int)(*this)(2 * sample_id, instance_id); + for (int dim = 0; dim < b.GetNumRows(); dim++) + c(dim, instance_id) -= b(dim, sample)* tmp(sample_id, instance_id); + } + } + else if (inputIndex == 2) + { + int i_blocks = omp_get_num_threads() * 16; + // Assume only one block in k direction. + // We don't need to explicitly block in the j direction. 
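+            // Note (illustrative, not from the original patch): omp_get_num_threads() returns 1
+            // outside a parallel region, so i_blocks evaluates to 16 here; if one block group per
+            // thread is intended, omp_get_max_threads() * 16 would express that. The residue-class
+            // blocking is what makes the parallel update race-free: iteration ib only writes
+            // columns with sample % i_blocks == ib, so no two threads update the same column of c.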
+#pragma omp parallel for + for (int ib = 0; ib < i_blocks; ib++) + for (int instance_id = 0; instance_id < batch_size; instance_id++) + for (int sample_id = 0; sample_id < sample_size; sample_id++) + { + int sample =(int) (*this)(2 * sample_id, instance_id); + if (sample % i_blocks == ib) + for (int dim = 0; dim < b.GetNumRows(); dim++) + c(dim, sample) -= a(dim, instance_id)* tmp(sample_id, instance_id); + } + } + else + { + assert(inputIndex == 3); + // Assume only one block in k direction. + // We don't need to explicitly block in the j direction. + for (int instance_id = 0; instance_id < batch_size; instance_id++) + for (int sample_id = 0; sample_id < sample_size; sample_id++) + { + int sample =(int) (*this)(2 * sample_id, instance_id); + c(0, sample) -= tmp(sample_id, instance_id); + } + } + return *this; + } + + template + void CPUMatrix::AssignNoiseContrastiveEstimation(const CPUMatrix& a, + const CPUMatrix& b, const CPUMatrix& bias, CPUMatrix& tmp, CPUMatrix& c) + //this: samples+probs + // a: hidden + // b: embedding + // tmp: softmax + // c: loglikelihood + { + double log_likelihood = 0.0; + size_t sample_size = this->GetNumRows() / 2; + size_t batch_size = this->GetNumCols(); + size_t num_noise_samples = sample_size - 1; + double log_num_noise_samples = std::log(num_noise_samples); +#pragma omp parallel for reduction(+:log_likelihood) + for (int instance_id = 0; instance_id < batch_size; instance_id++) + for (int sample_id = 0; sample_id < sample_size; sample_id++) + { + int sample = (int)(*this)(2 * sample_id, instance_id); + double score = bias(0, sample); + for (int dim = 0; dim < b.GetNumRows(); dim++) + score += a(dim, instance_id)* b(dim, sample); + double sample_prob = -(*this)(2 * sample_id + 1, instance_id); + if (sample_id == 0) + sample_prob = -sample_prob; + double score_noise = log_num_noise_samples + sample_prob; + double z = logadd(score, score_noise); + double logprob = score - z; + double logprob_noise = score_noise - z; + tmp(sample_id, instance_id) = (ElemType)-std::exp(logprob); + if (sample_id == 0) + tmp(sample_id, instance_id) += 1; + log_likelihood += sample_id == 0 ? 
logprob : logprob_noise; + } + c(0, 0) = (ElemType)-log_likelihood; + } + + /// Matrix-matrix multiply with col-major matrices (a and b may be transposed): c = op(a) * op(b) + /// Input matrix + /// Whether matrix a is transposed + /// Input matrix + /// Whether matrix b is transposed + /// Resulting matrix, user is responsible for allocating this + template + void CPUMatrix::Multiply(const CPUMatrix& a, const bool transposeA, const CPUMatrix& b, const bool transposeB, + CPUMatrix& c) + { + return CPUMatrix::MultiplyAndWeightedAdd(1.0, a, transposeA, b, transposeB, 0.0, c); + } + + /// Matrix-matrix multiply with col-major matrices (a and b are not transposed): c = a * b + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void CPUMatrix::Multiply(const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c) + { + return CPUMatrix::MultiplyAndWeightedAdd(1.0, a, false, b, false, 0.0, c); + } + + + /// Matrix-scalar multiply with col-major matrices: c = alpha * a + c + /// if a is a column vector, add to all columns of c + /// if a is a row vector, add to all rows of c + /// if a is a scalar, add to all rows of c + /// Scalar + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void CPUMatrix::ScaleAndAdd(ElemType alpha, const CPUMatrix& a, CPUMatrix& c) + { + if (a.IsEmpty() || c.IsEmpty()) + LogicError("ScaleAndAdd: one of the input matrices is empty."); + + if (a.GetNumRows() != 1 && a.GetNumCols() != 1) // a is not a col or row vector + { + const int m = (int)a.GetNumRows(); + const int n = (int)a.GetNumCols(); + const int len = m * n; + const int incx = 1; + const int incy = 1; + + assert (m>0 && n>0 && len>0); //converting from size_t to int may cause overflow + assert ((int)c.GetNumRows() == m && (int)c.GetNumCols() == n); + if ((int)c.GetNumRows() != m || (int)c.GetNumCols() != n) + InvalidArgument("Dimension of matrix c does not match dimension of matrix a."); + + if (sizeof(ElemType) == sizeof(double)) + { +#ifndef USE_MKL + daxpy(len, alpha, reinterpret_cast (a.m_pArray), incx, reinterpret_cast (c.m_pArray), incy); +#else + cblas_daxpy(len, alpha, reinterpret_cast (a.m_pArray), incx, reinterpret_cast (c.m_pArray), incy); +#endif + } + else + { +#pragma warning (suppress: 4244) +#ifndef USE_MKL + saxpy(len, alpha, reinterpret_cast (a.m_pArray), incx, reinterpret_cast (c.m_pArray), incy); +#else + cblas_saxpy(len, alpha, reinterpret_cast (a.m_pArray), incx, reinterpret_cast (c.m_pArray), incy); +#endif + } + } + else if (a.GetNumElements() == 1) //scalar, add to all elements + { + ElemType v = alpha*a(0,0); + long m=(long)c.GetNumRows(), n=(long)c.GetNumCols(); +#pragma omp parallel for + for (long j=0; j(a.m_pArray), 1, reinterpret_cast (c.m_pArray+c.LocateColumn(j)), 1); +#else + cblas_daxpy (m, alpha, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (c.m_pArray+c.LocateColumn(j)), 1); +#endif + } + } + else + { +#pragma omp parallel for + foreach_column(j,c) + { +#pragma warning (suppress: 4244) +#ifndef USE_MKL + saxpy(m, alpha, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (c.m_pArray+c.LocateColumn(j)), 1); +#else + cblas_saxpy (m, alpha, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (c.m_pArray+c.LocateColumn(j)), 1); +#endif + } + } + } + else //row vector, add it to all rows + { + int m = (int)c.GetNumRows(); + int n = (int)c.GetNumCols(); + assert (n == (int)a.GetNumCols()); + if (n != (int)a.GetNumCols()) + InvalidArgument("To add row vector, cols should match."); 
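+        // Note (sketch, not from the original patch): the calls below lean on BLAS strides.
+        // In a col-major matrix with leading dimension m, row i is the vector starting at
+        // c.m_pArray + i with stride m, so adding the row vector a to every row is one axpy per row:
+        //     c[i + j*m] += alpha * a[j], j = 0..n-1   ==   axpy(n, alpha, a, /*incx*/ 1, c + i, /*incy*/ m)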
+ + if (sizeof(ElemType) == sizeof(double)) + { +#pragma omp parallel for + foreach_row(i,c) + { +#ifndef USE_MKL + daxpy(n, alpha, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (c.m_pArray+i), m); +#else + cblas_daxpy (n, alpha, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (c.m_pArray+i), m); +#endif + } + } + else + { +#pragma omp parallel for + foreach_row(i,c) + { +#pragma warning (suppress: 4244) +#ifndef USE_MKL + saxpy(n, alpha, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (c.m_pArray+i), m); +#else + cblas_saxpy (n, alpha, reinterpret_cast (a.m_pArray), 1, reinterpret_cast (c.m_pArray+i), m); +#endif + + } + } + } + } + /// c += alpha * (a-b) + /// if a, b, c must have same dim + /// Scalar + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void CPUMatrix::AddScaledDifference(const ElemType alpha, const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c) + { + assert(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && + a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols()); + + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && + a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols())) + { + InvalidArgument("AddScaledDifference: a, b, and c must have same dimension."); + } + + if (a.IsEmpty()) + LogicError("AddScaledDifference: Input matrix a is empty."); + + long m=(long)c.GetNumElements(); +#pragma omp parallel for + //four-way unrolling + for (long i=0; i<(m & ~3); i+=4) + { + c.m_pArray[i] += alpha * (a.m_pArray[i]-b.m_pArray[i]); + c.m_pArray[i+1] += alpha * (a.m_pArray[i+1]-b.m_pArray[i+1]); + c.m_pArray[i+2] += alpha * (a.m_pArray[i+2]-b.m_pArray[i+2]); + c.m_pArray[i+3] += alpha * (a.m_pArray[i+3]-b.m_pArray[i+3]); + } + //handle remaining stuffs + for (long i=m & ~3; i c = alpha * (a-b) + /// if a, b, c must have same dim + /// Scalar + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void CPUMatrix::AssignScaledDifference(const ElemType alpha, const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c) + { + assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols() ); + + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) + { + InvalidArgument("AssignScaledDifference: a, b must have same dimension."); + } + + if (a.IsEmpty()) + LogicError("AssignScaledDifference: Input matrix a is empty."); + + if (&c != &a && &c != &b) + c.Resize(a.GetNumRows(), a.GetNumCols()); + + long m=(long)c.GetNumElements(); +#pragma omp parallel for + //four-way unrolling + for (long i=0; i<(m & ~3); i+=4) + { + c.m_pArray[i] = alpha * (a.m_pArray[i]-b.m_pArray[i]); + c.m_pArray[i+1] = alpha * (a.m_pArray[i+1]-b.m_pArray[i+1]); + c.m_pArray[i+2] = alpha * (a.m_pArray[i+2]-b.m_pArray[i+2]); + c.m_pArray[i+3] = alpha * (a.m_pArray[i+3]-b.m_pArray[i+3]); + } + //handle remaining stuffs + for (long i=m & ~3; i + void CPUMatrix::AddElementToElement(const CPUMatrix& a, const size_t ai, const size_t aj, CPUMatrix& c, const size_t ci, const size_t cj) + { + if (ai >= a.GetNumRows() || aj >=a.GetNumCols() || + ci >= c.GetNumRows() || cj >=c.GetNumCols()) + InvalidArgument("AddElementToElement: index out of range."); + + c(ci, cj) += a(ai, aj); + } + + ////c[ci,cj] += a[ai,aj] + //template + //void CPUMatrix::AddLogElementToElement(const CPUMatrix& a, const size_t ai, const size_t aj, CPUMatrix& c, const size_t ci, const size_t cj) + //{ + // if (ai >= 
a.GetNumRows() || aj >=a.GetNumCols() || + // ci >= c.GetNumRows() || cj >=c.GetNumCols()) + // InvalidArgument("AddElementToElement: index out of range."); + // + // ElemType v = a(ai,aj); + // c(ci, cj) += ((v < EPS_IN_LOG) ? LOG_OF_EPS_IN_LOG : log(v)); + //} + + //c[ci,cj] = a[ai,aj] + template + void CPUMatrix::AssignElementToElement(const CPUMatrix& a, const size_t ai, const size_t aj, CPUMatrix& c, const size_t ci, const size_t cj) + { + if (ai >= a.GetNumRows() || aj >=a.GetNumCols() || + ci >= c.GetNumRows() || cj >=c.GetNumCols()) + InvalidArgument("AssignElementToElement: index out of range."); + + c(ci, cj) = a(ai, aj); + } + + /// c += alpha * (a-b) + /// if a, b, c must have same dim + /// 1X1 matrix + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void CPUMatrix::AddScaledDifference(const CPUMatrix& alpha, const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c) + { + assert(alpha.GetNumElements() == 1); + if (!(alpha.GetNumElements() == 1)) + InvalidArgument("AddScaledDifference: alpha must be a 1X1 matrix."); + + AddScaledDifference(alpha(0,0), a, b, c); + } + + /// c = alpha * (a-b) + /// if a, b, c must have same dim + /// 1X1 matrix + /// Input matrix + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void CPUMatrix::AssignScaledDifference(const CPUMatrix& alpha, const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c) + { + assert(alpha.GetNumElements() == 1); + if (!(alpha.GetNumElements() == 1)) + InvalidArgument("AddScaledDifference: alpha must be a 1X1 matrix."); + + AssignScaledDifference(alpha(0,0), a, b, c); + } + /// Matrix-scalar multiply with col-major matrices: c = alpha * a + /// Scalar + /// Input matrix + /// Resulting matrix, user is responsible for allocating this + template + void CPUMatrix::Scale(ElemType alpha, const CPUMatrix& a, CPUMatrix& c) + { + if (a.IsEmpty()) + LogicError("Scale: Input matrix a is empty."); + + const int m = (int)a.GetNumRows(); + const int n = (int)a.GetNumCols(); + + assert (m>0 && n>0); //converting from size_t to int may cause overflow + c.Resize(m,n); + + long size=(long)c.GetNumElements(); +#pragma omp parallel for + //four-way unrolling + for (long i=0; i<(size & ~3); i+=4) + { + c.m_pArray[i] = alpha * a.m_pArray[i]; + c.m_pArray[i+1] = alpha * a.m_pArray[i+1]; + c.m_pArray[i+2] = alpha * a.m_pArray[i+2]; + c.m_pArray[i+3] = alpha * a.m_pArray[i+3]; + } + //handle remaining stuffs + for (long i=size & ~3; iMatrix-scalar multiply with col-major matrices: a = alpha * a + /// Scalar + /// Input matrix + template + void CPUMatrix::Scale(ElemType alpha, CPUMatrix& a) + { + if (a.IsEmpty()) + LogicError("Scale: Input matrix a is empty."); + + const int m = (int)a.GetNumRows(); + const int n = (int)a.GetNumCols(); + const int len = m * n; + const int incx = 1; + + assert (m>0 && n>0 && len>0); //converting from size_t to int may cause overflow + + if (sizeof(ElemType) == sizeof(double)) + { +#ifndef USE_MKL + dscal(len, alpha, reinterpret_cast (a.m_pArray), incx); +#else + cblas_dscal(len, alpha, reinterpret_cast (a.m_pArray), incx); +#endif + } + else + { +#pragma warning (suppress: 4244) +#ifndef USE_MKL + sscal(len, alpha, reinterpret_cast (a.m_pArray), incx); +#else + cblas_sscal (len, alpha, reinterpret_cast (a.m_pArray), incx); +#endif + } + } + + /// Matrix multiply with col-major matrices: a = alpha[1,1] * a + /// 1x1 matrix + /// Input matrix + template + void CPUMatrix::Scale(CPUMatrix alpha, CPUMatrix& a) + { + 
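+        // Note (not from the original patch): alpha is passed by value, so every call
+        // deep-copies the 1x1 matrix; taking it by const reference would avoid the allocation.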
if (a.IsEmpty()) + LogicError("Scale: Input matrix a is empty."); + if (alpha.GetNumElements()!=1) + LogicError("Matrix alpha must be 1x1"); + CPUMatrix::Scale(alpha(0,0),a); + } + + template + void CPUMatrix::InnerProduct (const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c, const bool isColWise) + { + if (a.IsEmpty() || b.IsEmpty()) + LogicError("InnerProduct: one of the input matrices is empty."); + + const int m = (int)a.GetNumRows(); + const int n = (int)a.GetNumCols(); + const int k = (int)b.GetNumRows(); + const int l = (int)b.GetNumCols(); + + assert (m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow + assert (m==k && n==l); //converting from size_t to int may cause overflow + if (m!=k || n!=l) + InvalidArgument("InnerProduct: Matrices a and b should have same dimension."); + + if ((isColWise && m == 1) || !isColWise && n == 1) //in this case it's equivalent to element-wise product + { + c.AssignElementProductOf(a, b); + } + else if (isColWise) //col-wise + { + c.Resize(1,n); + + if (sizeof(ElemType) == sizeof(double)) + { +#pragma omp parallel for + foreach_column(j,c) + { +#ifndef USE_MKL + c(0,j) = (ElemType)ddot(m, reinterpret_cast (a.m_pArray+a.LocateColumn(j)), 1, reinterpret_cast (b.m_pArray+b.LocateColumn(j)), 1); +#else + c(0,j) = (ElemType)cblas_ddot(m, reinterpret_cast (a.m_pArray+a.LocateColumn(j)), 1, reinterpret_cast (b.m_pArray+b.LocateColumn(j)), 1); +#endif + } + } + else + { +#pragma omp parallel for + foreach_column(j,c) + { +#pragma warning (suppress: 4244) +#ifndef USE_MKL + c(0,j) = (ElemType)sdot(m, reinterpret_cast (a.m_pArray+a.LocateColumn(j)), 1, reinterpret_cast (b.m_pArray+b.LocateColumn(j)), 1); +#else + c(0,j) = (ElemType)cblas_sdot(m, reinterpret_cast (a.m_pArray+a.LocateColumn(j)), 1, reinterpret_cast (b.m_pArray+b.LocateColumn(j)), 1); +#endif + } + } + } + else + { + c.Resize(m, 1); + + if (sizeof(ElemType) == sizeof(double)) + { +#pragma omp parallel for + foreach_row(i,c) + { +#ifndef USE_MKL + c(i,0) = ddot(n, reinterpret_cast (a.m_pArray+i), m, reinterpret_cast (b.m_pArray+i), m); +#else + c(i,0) = cblas_ddot (n, reinterpret_cast (a.m_pArray+i), m, reinterpret_cast (b.m_pArray+i), m); +#endif + } + } + else + { +#pragma omp parallel for + foreach_row(i,c) + { +#pragma warning (suppress: 4244) +#ifndef USE_MKL + c(i,0) = sdot(n, reinterpret_cast (a.m_pArray+i), m, reinterpret_cast (b.m_pArray+i), m); +#else + c(i,0) = cblas_sdot (n, reinterpret_cast (a.m_pArray+i), m, reinterpret_cast (b.m_pArray+i), m); +#endif + } + } + } + } + + // treat matrices as vectors. 
do vec(a)^T vec(b)
+    template<class ElemType>
+    ElemType CPUMatrix<ElemType>::InnerProductOfMatrices(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b)
+    {
+        if (a.IsEmpty() || b.IsEmpty())
+            LogicError("InnerProductOfMatrices: one of the input matrices is empty.");
+
+        const int m = (int)a.GetNumRows();
+        const int n = (int)a.GetNumCols();
+        const int k = (int)b.GetNumRows();
+        const int l = (int)b.GetNumCols();
+
+        assert (m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow
+        assert (m==k && n==l);
+        if (m!=k || n!=l)
+            InvalidArgument("InnerProductOfMatrices: Matrices a and b should have same dimension.");
+
+        if (sizeof(ElemType) == sizeof(double))
+        {
+#ifndef USE_MKL
+            return (ElemType)ddot((int)a.GetNumElements(), reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(b.m_pArray), 1);
+#else
+            return (ElemType)cblas_ddot((int)a.GetNumElements(), reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(b.m_pArray), 1);
+#endif
+        }
+        else
+        {
+#pragma warning (suppress: 4244)
+#ifndef USE_MKL
+            return (ElemType)sdot((int)a.GetNumElements(), reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(b.m_pArray), 1);
+#else
+            return (ElemType)cblas_sdot((int)a.GetNumElements(), reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(b.m_pArray), 1);
+#endif
+        }
+    }
+
+    template<class ElemType>
+    void CPUMatrix<ElemType>::ElementWisePower (ElemType alpha, const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& c)
+    {
+        if (a.IsEmpty())
+            LogicError("ElementWisePower: The input matrix a is empty.");
+
+        c.Resize(a.GetNumRows(), a.GetNumCols());
+
+        if (alpha == 2)
+        {
+#pragma omp parallel for
+            foreach_coord(i,j,c)
+            {
+                c(i,j) = a(i,j) * a(i,j);
+            }
+        }
+        else if (alpha == 3)
+        {
+#pragma omp parallel for
+            foreach_coord(i,j,c)
+            {
+                c(i,j) = a(i,j) * a(i,j) * a(i,j);
+            }
+        }
+        else
+        {
+#pragma omp parallel for
+            foreach_coord(i,j,c)
+            {
+                c(i,j) = pow(a(i,j), alpha);
+            }
+        }
+    }
+
+    template<class ElemType>
+    bool CPUMatrix<ElemType>::AreEqual(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const ElemType threshold /*= 1e-8*/)
+    {
+        if (a.IsEmpty() || b.IsEmpty())
+            LogicError("AreEqual: one of the input matrices is empty.");
+
+        if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
+            return false;
+
+        bool result=true;
+#pragma omp parallel for
+        foreach_coord(i, j, a)
+        {
+            if (abs(a(i,j)-b(i,j)) > threshold)
+            {
+                result = false;
+                break; //exits only the inner row loop of foreach_coord; the parallel column loop continues
+            }
+        }
+
+        return result;
+    }
+
+    // see Matrix<ElemType>::TensorShuffleScaleAndAdd() for comments
+    template<class ElemType>
+    void CPUMatrix<ElemType>::TensorShuffleScaleAndAdd(ElemType keepWeight, const CPUMatrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c)
+    {
+        size_t N = D * S * M * K * T;
+        const ElemType * pa = a.m_pArray;
+        const ElemType * pb = b.m_pArray;
+        ElemType * pc = c.m_pArray;
+        // Note: This code is written to match a GPU implementation. It is not super-efficient on the CPU.
+        for (size_t na = 0; na < N; na++)  // loop over all elements
+        {
+            // recover the 5 indices from the loop counter
+            size_t d =  na                  % D;
+            size_t s = (na / D            ) % S;
+            size_t m = (na / D / S        ) % M;
+            size_t k = (na / D / S / M    ) % K;
+            size_t t = (na / D / S / M / K) % T;
+            // compute index for the a and b/c tensors
+            size_t na1 = (((t * K + k) * M + m) * S + s) * D + d;  // tensor of dimension (D x S x M x K x T)
+            size_t nb  = (((t * S + s) * M + m) * K + k) * D + d;  // tensor of dimension (D x K x M x S x T) --note: k/K and s/S swapped
+            assert(na == na1); na1;  // referencing na1 avoids an unreferenced-variable warning in release builds
+            assert(nb < N);
+            // perform the computation
+            ElemType cval = keepWeight ?
keepWeight * pb[nb] : 0; // if weight is 0 then don't bother to read memory (efficiency) or to multiply (NaN-safe) + cval += scaleFactor * pa[na]; + pc[nb] = cval; + } + } + + template + CPUMatrix CPUMatrix::Ones(const size_t rows, const size_t cols) + { + CPUMatrix c(rows, cols); //will initialize to 0 + c.SetValue(1); + return c; + } + + template + CPUMatrix CPUMatrix::Zeros(const size_t rows, const size_t cols) + { + CPUMatrix c(rows, cols); //will initialize to 0 + c.SetValue(0); + return c; + } + + template + CPUMatrix CPUMatrix::Eye(const size_t rows) + { + CPUMatrix c(rows, rows); //will initialize to 0 + c.SetDiagonalValue(1); + return c; + } + + template + CPUMatrix CPUMatrix::RandomUniform(const size_t rows, const size_t cols, const ElemType low, const ElemType high, unsigned long seed) + { + CPUMatrix c(rows, cols); //will initialize to 0 + c.SetUniformRandomValue(low, high, seed); + return c; + } + + template + CPUMatrix CPUMatrix::RandomGaussian(const size_t rows, const size_t cols, const ElemType mean, const ElemType sigma, unsigned long seed) + { + CPUMatrix c(rows, cols); //will initialize to 0 + c.SetGaussianRandomValue(mean, sigma, seed); + return c; + } + + template + bool CPUMatrix::HasElement(const CPUMatrix& mat, const ElemType v) + { + bool bHas = false; + + bool isvFinite = std::isfinite(v); +#pragma omp parallel for + for (long j = 0; j < mat.GetNumElements(); j++) + { +#pragma omp flush(bHas) + if (!bHas) + { + ElemType cur = mat.m_pArray[j]; + if (isvFinite && std::isfinite(cur)) + { + if (cur == v) + bHas = true; + } + else if (std::isnan(v) && std::isnan(cur)) + bHas = true; + else if (std::isinf(v) && std::isinf(cur) && std::signbit(v) == std::signbit(cur)) + bHas = true; + } + } + + return bHas; + } + + // CPUMatrix& AssignElementProductOfWithShiftNeg(const CPUMatrix& a, const CPUMatrix& b, size_t shift, size_t negnumber); + //[this]=a .* b + // here, a and b must be two row vectors of the same size, i.e. 
[1,m] + // the inputs are two rwo vectors + // the output is a matrix of size(neg+1, col) + template + CPUMatrix& CPUMatrix::AssignElementProductOfWithShiftNeg(const CPUMatrix& a, const CPUMatrix& b, size_t shift, size_t negnumber) + { + if (a.IsEmpty() || b.IsEmpty()) + LogicError("AssignElementProductOfWithShiftNeg: Matrix is empty."); + + assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) + InvalidArgument("AssignElementProductOfWithShiftNeg: The input matrix dimensions do not match."); + + if (a.GetNumRows() != 1) + InvalidArgument("AssignElementProductOfWithShiftNeg: The input matrix must be a row vector."); + + auto& us = *this; + if (this != &a) + { + Resize(negnumber + 1, a.GetNumCols()); + // Resize(a.GetNumRows(), a.GetNumCols()); + } + + long m = (long)GetNumRows(), n = (long)GetNumCols(); // a and b are of size (1,n) + //#pragma omp parallel for + + for (long j = 0; j < n; j++) + { + us(0, j) = a(0, j) * b(0, j); + } + for (long j = 0; j + void CPUMatrix::InnerProductWithShiftNeg(const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c, const bool isColWise, size_t shift, size_t negnumber) + { + if (a.IsEmpty() || b.IsEmpty()) + LogicError("InnerProduct: one of the input matrices is empty."); + + const int m = (int)a.GetNumRows(); + const int n = (int)a.GetNumCols(); + const int k = (int)b.GetNumRows(); + const int l = (int)b.GetNumCols(); + + assert(m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow + assert(m == k && n == l); //converting from size_t to int may cause overflow + if (m != k || n != l) + InvalidArgument("InnerProduct: Matrices a and b should have same dimension."); + + if ((isColWise && m == 1) || !isColWise && n == 1) //in this case it's equivalent to element-wise product + { + InvalidArgument("InnerProduct: Both matrices should be normal ones, not vectors"); + // c.AssignElementProductOf(a, b); + } + else if (isColWise) //col-wise + { + c.Resize(negnumber + 1, n); // this line ischanged + + if (sizeof(ElemType) == sizeof(double)) + { + for (long j = 0; j < n; j++) + { +#ifndef USE_MKL + c(0, j) = (ElemType)ddot(m, reinterpret_cast (a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast (b.m_pArray + b.LocateColumn(j)), 1); +#else + c(0, j) = (ElemType)cblas_ddot(m, reinterpret_cast (a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast (b.m_pArray + b.LocateColumn(j)), 1); +#endif + } + for (long j = 0; j < n; j++) + { + for (long i = 1; i < negnumber + 1; i++) + { +#ifndef USE_MKL + c(i, j) = (ElemType)ddot(m, reinterpret_cast (a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast (b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1); +#else + c(i, j) = (ElemType)cblas_ddot(m, reinterpret_cast (a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast (b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1); +#endif + } + } + + } + else + { + for (long j = 0; j < n; j++) + { +#ifndef USE_MKL + c(0, j) = (ElemType)sdot(m, reinterpret_cast (a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast (b.m_pArray + b.LocateColumn(j)), 1); +#else + c(0, j) = (ElemType)cblas_sdot(m, reinterpret_cast (a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast (b.m_pArray + b.LocateColumn(j)), 1); +#endif + } + for (long j = 0; j < n; j++) + { + for (long i = 1; i < negnumber + 1; i++) + { +#ifndef USE_MKL + c(i, j) = (ElemType)sdot(m, reinterpret_cast (a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast (b.m_pArray + b.LocateColumn((j + shift + i - 1) % 
n)), 1); +#else + c(i, j) = (ElemType)cblas_sdot(m, reinterpret_cast (a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast (b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1); +#endif + } + } + } + } + else + { + InvalidArgument("InnerProduct: Rowwise is not supported yet"); + + c.Resize(m, 1); + + if (sizeof(ElemType) == sizeof(double)) + { +#pragma omp parallel for + foreach_row(i, c) + { +#ifndef USE_MKL + c(i, 0) = (ElemType)ddot(n, reinterpret_cast (a.m_pArray + i), m, reinterpret_cast (b.m_pArray + i), m); +#else + c(i, 0) = (ElemType)cblas_ddot(n, reinterpret_cast (a.m_pArray + i), m, reinterpret_cast (b.m_pArray + i), m); +#endif + } + } + else + { +#pragma omp parallel for + foreach_row(i, c) + { +#pragma warning (suppress: 4244) +#ifndef USE_MKL + c(i, 0) = sdot(n, reinterpret_cast (a.m_pArray + i), m, reinterpret_cast (b.m_pArray + i), m); +#else + c(i, 0) = cblas_sdot(n, reinterpret_cast (a.m_pArray + i), m, reinterpret_cast (b.m_pArray + i), m); +#endif + } + } + } + } + + + template + CPUMatrix& CPUMatrix::GetARowByIndex(const CPUMatrix& a, size_t index) + { + if (a.IsEmpty()) + LogicError("GetARowByIndex: the input matrices is empty."); + + const int m = (int)a.GetNumRows(); + const int n = (int)a.GetNumCols(); + + if (index <0 || index >= m) + LogicError("GetARowByIndex: the row index is out of range."); + + assert(m>0 && n>0); //converting from size_t to int may cause overflow + + auto& us = *this; + this->Resize(1, n); + for (long j = 0; j < n; j++) + { + us(0, j) = a(index, j); + } + + return *this; + + } + + + // input: a, a row vector + // input: b, a matrix. b.col == a.col + // input firstmatrixfixed: If true, keep a's order. Otherwise, keep b's order + // output: c, a matrix. c.size == b.size + /* + Example, a = [a1 a2 a3] + b = [b11 b12 b13; + b21 b22 b23 ] + + if true: + shift = 1 + + then c = [a1*b12 a2*b13 a3*b11 + a1*b22 a2*b23 a3*b21] + + if shift = 2 + then c = [ a1*b13 a2*b11 a3*b12 + a1*b23 a2*b21 a3*b22] + i.e. we do column-wise shift + + if false: + shift = 1 + + then c = [a2*b11 a3*b12 a1*b13 + a2*b21 a3*b22 a1*b23] + + shift = 2 + + then c = [ a3*b11 a1*b12 a2*b13 + a3*b21 a1*b22 a2*b23] + + + */ + template + void CPUMatrix::ConductRowElementMultiplyWithShift(const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c, size_t shift, bool bFirstmatrixfixed) + { + if (a.IsEmpty() || b.IsEmpty()) + LogicError("InnerProduct: one of the input matrices is empty."); + + const int m = (int)a.GetNumRows(); + const int n = (int)a.GetNumCols(); + const int k = (int)b.GetNumRows(); + const int l = (int)b.GetNumCols(); + + assert(m>0 && n>0 && k>0 && l>0); //converting from size_t to int may cause overflow + assert(m == 1 && n == l); //converting from size_t to int may cause overflow + if (m != 1 || n != l) + InvalidArgument("InnerProduct: Matrices a and b should have same dimension."); + + c.Resize(k, l); // c must the the same size of b + + if (bFirstmatrixfixed) + { + for (long j = 0; j < l; j++) + { + for (long i = 0; i < k; i++) + { + c(i, j) = a(0, j) * b(i, (j + shift) % l); + } + } + } + else + { + for (long j = 0; j < l; j++) + { + for (long i = 0; i < k; i++) + { + c(i, j) = a(0, (j + shift) % l) * b(i, j); + } + } + } + + + + } + + + // CPUMatrix& AssignElementProductOfWithShift(const CPUMatrix& a, const CPUMatrix& b, size_t shift); + //[this]=a .* b + // here, a and b must be two row vectors of the same size, i.e. [1,m]. We will do element product with shift. 
+ // inputs are 2 row vectors + // output is a row vector + template + CPUMatrix& CPUMatrix::AssignElementProductOfWithShift(const CPUMatrix& a, const CPUMatrix& b, size_t shift) + { + if (a.IsEmpty() || b.IsEmpty()) + LogicError("AssignElementProductOfWithShiftNeg: Matrix is empty."); + + assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); + if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) + InvalidArgument("AssignElementProductOfWithShiftNeg: The input matrix dimensions do not match."); + + if (a.GetNumRows() != 1) + InvalidArgument("AssignElementProductOfWithShiftNeg: The input matrix must be a row vector."); + + auto& us = *this; + if (this != &a) + { + Resize(1, a.GetNumCols()); + // Resize(a.GetNumRows(), a.GetNumCols()); + } + + //long m = (long)GetNumRows(), n = (long)GetNumCols(); // a and b are of size (1,n) + long n = (long)GetNumCols(); // a and b are of size (1,n) +#pragma omp parallel for + for (long j = 0; j + ElemType CPUMatrix::LogAddSumOfElements() const + { + ElemType fAlpha = (ElemType)LZERO; + for (int k = 0; k < GetNumElements(); k++) + fAlpha = (ElemType) logadd(fAlpha, m_pArray[k]); + return fAlpha; + } + + template + void CPUMatrix::RCRFBackwardCompute(const CPUMatrix& alpha, CPUMatrix& beta, + const CPUMatrix& lbls, + const CPUMatrix& pair_scores) + { + int iNumPos = (int)lbls.GetNumCols(); + int iNumLab = (int)lbls.GetNumRows(); + + int lastLbl = -1; + for (int ik = 0; ik < lbls.GetNumRows(); ik++) + if (lbls(ik, iNumPos - 1) != 0){ + lastLbl = ik; break; + } + + beta.Resize(iNumLab, iNumPos); + + for (int t = iNumPos - 1; t >= 0; t--) + { +#pragma omp parallel for + for (int k = 0; k < iNumLab; k++) + { + _rcrfBackwardCompute(t, k, alpha, beta, pair_scores); + } + } + + }; + + /// the kernel function for RCRF backward computation + template + void CPUMatrix::_rcrfBackwardCompute(size_t t, size_t k, const CPUMatrix& alpha, + CPUMatrix& beta, + const CPUMatrix& pair_scores) + { + size_t iNumLab = alpha.GetNumRows(); + size_t iNumPos = alpha.GetNumCols(); + + ElemType fSum; + ElemType fTmp = (ElemType)LZERO; + if (t == iNumPos - 1) + { + fSum = (ElemType)LZERO; + for (int j = 0; j < iNumLab; j++) + { + fSum = (ElemType)logadd((double)fSum, alpha(j, t)); + } + + fTmp = alpha(k, t) - fSum; + beta(k, t) = fTmp; + } + else + { + for (int j = 0; j < iNumLab; j++) + { + fSum = (ElemType)LZERO; + for (int m = 0; m < iNumLab; m++) + { + fSum = (ElemType)logadd((double)fSum, alpha(m, t) + pair_scores(j, m)); + } + + fTmp = (ElemType)logadd(fTmp, beta(j, t + 1) + alpha(k, t) + pair_scores(j, k) - fSum); + } + beta(k, t) = fTmp; + } + } + + template + void CPUMatrix::RCRFTransGrdCompute(const CPUMatrix& lbls, + const CPUMatrix& alpha, + const CPUMatrix& beta, + const CPUMatrix& pair_scores, + CPUMatrix& grd) + { + int iNumPos = (int)alpha.GetNumCols(); + int iNumLab = (int)alpha.GetNumRows(); + + int firstLbl = -1; + for (int ik = 0; ik < lbls.GetNumRows(); ik++) + if (lbls(ik, 0) != 0){ + firstLbl = ik; break; + } + + for (size_t tPos = 0; tPos < iNumPos; tPos++) + { + CPUMatrix b = beta.ColumnSlice(tPos, 1); + CPUMatrix a; + if (tPos > 0) + a = alpha.ColumnSlice(tPos - 1, 1); + +#pragma omp parallel for + for (int i = 0; i < iNumLab; i++){ + _rcrfTransGrdCompute(i, lbls, alpha, beta, pair_scores, grd, tPos); + } + + /// transition score + int i = -1; + if (tPos == 0) i = firstLbl; + else { + for (int ik = 0; ik < lbls.GetNumRows(); ik++) + if (lbls(ik, tPos - 1) != 0){ + i = ik; break; + } + } + + int j = -1; + for (int 
+    template<class ElemType>
+    void CPUMatrix<ElemType>::RCRFTransGrdCompute(const CPUMatrix<ElemType>& lbls,
+        const CPUMatrix<ElemType>& alpha,
+        const CPUMatrix<ElemType>& beta,
+        const CPUMatrix<ElemType>& pair_scores,
+        CPUMatrix<ElemType>& grd)
+    {
+        int iNumPos = (int)alpha.GetNumCols();
+        int iNumLab = (int)alpha.GetNumRows();
+
+        int firstLbl = -1;
+        for (int ik = 0; ik < iNumLab; ik++)
+            if (lbls(ik, 0) != 0)
+            {
+                firstLbl = ik; break;
+            }
+
+        for (int tPos = 0; tPos < iNumPos; tPos++)
+        {
+            //accumulate the expected transition counts into grd
+#pragma omp parallel for
+            for (int i = 0; i < iNumLab; i++)
+            {
+                _rcrfTransGrdCompute(i, lbls, alpha, beta, pair_scores, grd, tPos);
+            }
+
+            /// transition score: subtract the observed transition count.
+            /// lbls is assumed one-hot per column; i is the label at tPos-1, j the label at tPos
+            int i = -1;
+            if (tPos == 0)
+                i = firstLbl;
+            else
+            {
+                for (int ik = 0; ik < iNumLab; ik++)
+                    if (lbls(ik, tPos - 1) != 0)
+                    {
+                        i = ik; break;
+                    }
+            }
+
+            int j = -1;
+            for (int ik = 0; ik < iNumLab; ik++)
+            {
+                if (lbls(ik, tPos) != 0)
+                {
+                    j = ik; break;
+                }
+            }
+
+            grd(j, i) -= 1.0;
+        }
+    }
+
+    /// the kernel function for RCRF transition gradient computation
+    template<class ElemType>
+    void CPUMatrix<ElemType>::_rcrfTransGrdCompute(size_t i,
+        const CPUMatrix<ElemType>& lbls,
+        const CPUMatrix<ElemType>& alpha,
+        const CPUMatrix<ElemType>& beta,
+        const CPUMatrix<ElemType>& pair_scores,
+        CPUMatrix<ElemType>& grd,
+        const size_t tPos /// position
+        )
+    {
+        int iNumLab = (int)alpha.GetNumRows();
+
+        int firstLbl = -1;
+        for (int ik = 0; ik < iNumLab; ik++)
+            if (lbls(ik, 0) != 0)
+            {
+                firstLbl = ik; break;
+            }
+
+        CPUMatrix<ElemType> b = beta.ColumnSlice(tPos, 1);
+        CPUMatrix<ElemType> a;
+        if (tPos > 0)
+            a = alpha.ColumnSlice(tPos - 1, 1);
+
+        for (int j = 0; j < iNumLab; j++)
+        {
+            //score of being in state i at tPos-1: 0 for the observed first label, log(0) otherwise
+            ElemType fTmp;
+            if (tPos == 0)
+                fTmp = (i == (size_t)firstLbl) ? (ElemType)0 : (ElemType)LZERO;
+            else
+                fTmp = a(i, 0);
+            fTmp += pair_scores(j, i);
+
+            //normalizer: logadd over all predecessor states k
+            ElemType fSum = (ElemType)LZERO;
+            for (int k = 0; k < iNumLab; k++)
+            {
+                ElemType fTmp2;
+                if (tPos == 0)
+                    fTmp2 = (k == firstLbl) ? (ElemType)0 : (ElemType)LZERO;
+                else
+                    fTmp2 = a(k, 0);
+                fSum = (ElemType)logadd(fSum, fTmp2 + pair_scores(j, k));
+            }
+
+            fTmp -= fSum;
+            fTmp += b(j, 0);
+
+            //expected count of transition i -> j
+            grd(j, i) += exp(fTmp);
+        }
+    }
+
+    template<class ElemType>
+    CPUMatrix<ElemType>& CPUMatrix<ElemType>::DropFrame(const CPUMatrix<ElemType>& label, const CPUMatrix<ElemType>& gamma, const ElemType& threshhold)
+    {
+        auto& us = *this;
+        if (us.GetNumCols() != gamma.GetNumCols() || us.GetNumRows() != gamma.GetNumRows())
+            LogicError("DropFrame: target matrix is not of the same size as the gamma matrix.");
+
+#pragma omp parallel for
+        foreach_column(j, label)
+        {
+            //a frame is dropped when the posterior (gamma) of its reference label falls below the threshold
+            bool dropframe = false;
+            foreach_row(i, label)
+            {
+                if (fabs(label(i, j) - 1.0f) < 0.1)
+                {
+                    if (gamma(i, j) < threshhold)
+                        dropframe = true;
+                    break;
+                }
+            }
+
+            if (dropframe)
+            {
+                foreach_row(i, label)
+                    us(i, j) = 0.0f;
+            }
+        }
+
+        return *this;
+    }
+
+    //accumulates alpha * (label - (1 - hsmoothingWeight) * dnnoutput - hsmoothingWeight * gamma) into *this
+    template<class ElemType>
+    CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignSequenceError(const ElemType hsmoothingWeight, const CPUMatrix<ElemType>& label,
+        const CPUMatrix<ElemType>& dnnoutput, const CPUMatrix<ElemType>& gamma, ElemType alpha)
+    {
+        auto& us = *this;
+        foreach_coord(i, j, us)
+            us(i, j) += alpha * (label(i, j) - (1 - hsmoothingWeight)*dnnoutput(i, j) - hsmoothingWeight*gamma(i, j));
+        return *this;
+    }
+
+    //set the number of worker threads for OpenMP and the BLAS library; returns the count actually in effect.
+    //numThreads == 0 keeps the defaults; a negative value means "all hardware threads minus |numThreads|".
+    template<class ElemType>
+    int CPUMatrix<ElemType>::SetNumThreads(int numThreads)
+    {
+        if (numThreads == 0) //use default
+            return numThreads;
+
+        int mthreads = (int)std::thread::hardware_concurrency();
+
+        if (numThreads <= 0)
+            numThreads = max(1, mthreads + numThreads);
+        if (numThreads > mthreads)
+            numThreads = mthreads;
+
+#ifdef _OPENMP
+        omp_set_num_threads(numThreads);
+        numThreads = omp_get_max_threads();
+
+#ifndef USE_MKL
+        acmlsetnumthreads(numThreads);
+#else
+        mkl_set_num_threads(numThreads);
+#endif
+#endif
+        return numThreads;
+    }
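+    // Usage sketch (hypothetical call sites, for illustration only):
+    //
+    //   //leave one hardware thread free for I/O
+    //   int nThreads = CPUMatrix<float>::SetNumThreads(-1);
+    //
+    //   //accumulate the smoothed sequence-training error signal into errSignal
+    //   //(errSignal, label, dnnoutput and gamma are placeholder matrices of equal size)
+    //   errSignal.AssignSequenceError(0.1f, label, dnnoutput, gamma, 1.0f);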
+    // The explicit instantiation part
+    template class MATH_API CPUMatrix<float>;
+    template class MATH_API CPUMatrix<double>;
+
+    // We use Matrix<char> as the backing store for QuantizedMatrix
+    // Let's explicitly instantiate the methods we need for that purpose
+    template CPUMatrix<char>::CPUMatrix(const size_t numRows, const size_t numCols);
+    template CPUMatrix<char>::CPUMatrix(const size_t numRows, const size_t numCols, char* pArray, const size_t matrixFlags);
+    template CPUMatrix<char>::CPUMatrix();
+    template CPUMatrix<char>::CPUMatrix(CPUMatrix<char> const &);
+    template CPUMatrix<char>::CPUMatrix(CPUMatrix<char>&&);
+    template size_t CPUMatrix<char>::LocateElement(size_t, size_t) const;
+    template CPUMatrix<char>::~CPUMatrix();
+    template CPUMatrix<char> CPUMatrix<char>::ColumnSlice(size_t startColumn, size_t numCols) const;
+    template CPUMatrix<char>& CPUMatrix<char>::operator=(CPUMatrix<char>&&);
+    template void CPUMatrix<char>::SetValue(const char);
+    template void CPUMatrix<char>::SetValue(const size_t numRows, const size_t numCols, char *pArray, size_t matrixFlags);
+
+}}}