Skip to content

Commit

Permalink
Syncing with internal version. Fixes for Mac/clang build. Other minor…
Browse files Browse the repository at this point in the history
… fixes
  • Loading branch information
dskhudia committed Nov 4, 2018
1 parent 505eb84 commit 690dbc2
Show file tree
Hide file tree
Showing 15 changed files with 1,590 additions and 679 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ if(NOT TARGET asmjit)
#build asmjit
set(ASMJIT_STATIC ON)
add_subdirectory("${ASMJIT_SRC_DIR}" "${FBGEMM_BINARY_DIR}/asmjit")
set_property(TARGET asmjit PROPERTY POSITION_INDEPENDENT_CODE ON)
endif()

if(NOT TARGET cpuinfo)
Expand Down
29 changes: 20 additions & 9 deletions include/fbgemm/Fbgemm.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ class PackMatrix {
return last_bcol_ != blockColSize();
}

~PackMatrix() {
virtual ~PackMatrix() {
if (bufAllocatedHere_) {
free(buf_);
}
Expand Down Expand Up @@ -286,7 +286,7 @@ class PackMatrix {
* accumulation type is int32.
*/
template <typename T, typename accT = std::int32_t>
class PackAMatrix : public PackMatrix<PackAMatrix<T, accT>, T, accT> {
class PackAMatrix final : public PackMatrix<PackAMatrix<T, accT>, T, accT> {
public:
using This = PackAMatrix<T, accT>;
using BaseType = PackMatrix<This, T, accT>;
Expand All @@ -306,7 +306,7 @@ class PackAMatrix : public PackMatrix<PackAMatrix<T, accT>, T, accT> {
std::int32_t ld,
inpType* pmat = nullptr,
std::int32_t groups = 1,
accT zero_pt = 0);
std::int32_t zero_pt = 0);

/**
* Activation matrices are not constant so cannot amortize the cost of
Expand Down Expand Up @@ -361,7 +361,7 @@ class PackAMatrix : public PackMatrix<PackAMatrix<T, accT>, T, accT> {
* type is int32.
*/
template <typename T, typename accT = std::int32_t>
class PackBMatrix : public PackMatrix<PackBMatrix<T, accT>, T, accT> {
class PackBMatrix final : public PackMatrix<PackBMatrix<T, accT>, T, accT> {
public:
using This = PackBMatrix<T, accT>;
using BaseType = PackMatrix<This, T, accT>;
Expand All @@ -381,7 +381,7 @@ class PackBMatrix : public PackMatrix<PackBMatrix<T, accT>, T, accT> {
std::int32_t ld,
inpType* pmat = nullptr,
std::int32_t groups = 1,
accT zero_pt = 0);
std::int32_t zero_pt = 0);

/**
* Weight matrices are usually constant so worth pre-packing.
Expand Down Expand Up @@ -439,7 +439,8 @@ class PackBMatrix : public PackMatrix<PackBMatrix<T, accT>, T, accT> {
* quantized.
*/
template <typename T, typename accT = std::int32_t>
class PackAWithIm2Col : public PackMatrix<PackAWithIm2Col<T, accT>, T, accT> {
class PackAWithIm2Col final
: public PackMatrix<PackAWithIm2Col<T, accT>, T, accT> {
public:
using This = PackAWithIm2Col<T, accT>;
using BaseType = PackMatrix<This, T, accT>;
Expand Down Expand Up @@ -499,7 +500,7 @@ class PackAWithIm2Col : public PackMatrix<PackAWithIm2Col<T, accT>, T, accT> {
* The source matrix is already quantized.
*/
template <typename T, typename accT = std::int32_t>
class PackAWithRowOffset
class PackAWithRowOffset final
: public PackMatrix<PackAWithRowOffset<T, accT>, T, accT> {
public:
using This = PackAWithRowOffset<T, accT>;
Expand Down Expand Up @@ -572,7 +573,7 @@ class PackAWithRowOffset
* The source matrix is in fp32 and quantized during packing.
*/
template <typename T, typename accT = std::int32_t>
class PackAWithQuantRowOffset
class PackAWithQuantRowOffset final
: public PackMatrix<PackAWithQuantRowOffset<T, accT>, T, accT> {
public:
using This = PackAWithQuantRowOffset<T, accT>;
Expand Down Expand Up @@ -935,7 +936,6 @@ void fbgemmPacked(
/**
* @brief Perform depthwise separable convolution
*/

template <
typename packingAMatrix,
typename packingBMatrix,
Expand All @@ -949,4 +949,15 @@ void convDepthwiseSeparable(
outT* out,
const processOutputType& output);

/**
 * @brief Allocate size bytes of uninitialized storage whose alignment is
 *        specified by align.
 *
 * @param align alignment in bytes; per posix_memalign it must be a power of
 *              two and a multiple of sizeof(void*)
 * @param size  number of bytes to allocate
 * @return pointer to the aligned storage, or nullptr on failure
 */
static void* fbgemmAlignedAlloc(size_t align, size_t size) {
  // Note: double-underscore names (__align/__size) are reserved to the
  // implementation in C++, so plain identifiers are used here.
  void* aligned_mem = nullptr;
  // posix_memalign returns nonzero on failure and leaves aligned_mem
  // unspecified, so report failure explicitly with nullptr.
  if (posix_memalign(&aligned_mem, align, size) != 0) {
    return nullptr;
  }
  return aligned_mem;
}

} // namespace fbgemm2
129 changes: 77 additions & 52 deletions include/fbgemm/FbgemmFP16.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
// upgraded to match with new fbgemm interface.

#include <cassert>
#include <cstdlib>
#include <memory>
#include <vector>

Expand All @@ -22,7 +23,7 @@ namespace fbgemm2 {
/// row-major format into
/// internal packed blocked-row major format
class PackedGemmMatrixFP16 {
public:
public:
// takes smat input matrix in row-major format;
// and packs it into gemm-friendly blocked format;
// allocate space and sets up all the internal variables;
Expand All @@ -32,30 +33,31 @@ class PackedGemmMatrixFP16 {
// before flushing into fp32
// the smaller the brow_, the higher overhead
// of flushing is
PackedGemmMatrixFP16(const matrix_op_t trans, const int nrow,
const int ncol, const float alpha,
const float *smat,
const int brow = 512)
PackedGemmMatrixFP16(
const matrix_op_t trans,
const int nrow,
const int ncol,
const float alpha,
const float* smat,
const int brow = 512)
: nrow_(nrow), ncol_(ncol), brow_(brow) {

bcol_ = 8 * 1; // hardwired

// set up internal packing parameters
nbrow_ = ((numRows() % blockRowSize()) == 0)
? (numRows() / blockRowSize())
: ((numRows() + blockRowSize()) / blockRowSize());
? (numRows() / blockRowSize())
: ((numRows() + blockRowSize()) / blockRowSize());
last_brow_ = ((nrow % blockRowSize()) == 0) ? blockRowSize()
: (nrow % blockRowSize());
: (nrow % blockRowSize());
nbcol_ = ((numCols() % blockColSize()) == 0)
? (numCols() / blockColSize())
: ((numCols() + blockColSize()) / blockColSize());
? (numCols() / blockColSize())
: ((numCols() + blockColSize()) / blockColSize());

if (numCols() != blockColSize() * nbcol_) {
#ifdef VLOG
VLOG(0)
<< "Packer warning: ncol(" << numCols()
<< ") is not a multiple of internal block size (" << blockColSize()
<< ")";
VLOG(0) << "Packer warning: ncol(" << numCols()
<< ") is not a multiple of internal block size ("
<< blockColSize() << ")";
VLOG(0)
<< "lefover is currently done via MKL: hence overhead will inccur";
#endif
Expand All @@ -64,7 +66,9 @@ class PackedGemmMatrixFP16 {
// allocate and initialize packed memory
const int padding = 1024; // required by sw pipelined kernels
size_ = (blockRowSize() * nbrow_) * (blockColSize() * nbcol_);
pmat_ = (float16 *)aligned_alloc(64, matSize() * sizeof(float16) + padding);
// pmat_ = (float16 *)aligned_alloc(64, matSize() * sizeof(float16) +
// padding);
posix_memalign((void**)&pmat_, 64, matSize() * sizeof(float16) + padding);
for (auto i = 0; i < matSize(); i++) {
pmat_[i] = tconv(0.f, pmat_[i]);
}
Expand All @@ -77,7 +81,7 @@ class PackedGemmMatrixFP16 {
free(pmat_);
}

// protected:
// protected:
// blocked row-major format address arithmetic
uint64_t addr(const int r_, const int c_) const {
uint64_t r = (uint64_t)r_;
Expand All @@ -87,10 +91,9 @@ class PackedGemmMatrixFP16 {
brow_offset =
(block_row_id * nbcol_) * (blockRowSize() * blockColSize());
uint64_t block_col_id = c / blockColSize(),
bcol_offset =
block_col_id * ((block_row_id != nbrow_ - 1)
? (blockRowSize() * blockColSize())
: (last_brow_ * blockColSize()));
bcol_offset = block_col_id *
((block_row_id != nbrow_ - 1) ? (blockRowSize() * blockColSize())
: (last_brow_ * blockColSize()));
uint64_t block_offset = brow_offset + bcol_offset;
uint64_t inblock_offset =
r % blockRowSize() * blockColSize() + c % blockColSize();
Expand All @@ -100,61 +103,83 @@ class PackedGemmMatrixFP16 {
return index;
}

void packFromSrc(const matrix_op_t trans, const float alpha,
const float *smat) {
void
packFromSrc(const matrix_op_t trans, const float alpha, const float* smat) {
bool tr = (trans == matrix_op_t::Transpose);
// pack
for (int i = 0; i < numRows(); i++) {
for (int j = 0; j < numCols(); j++) {
pmat_[addr(i, j)] = tconv(
alpha * (
(tr == false)
? smat[i * numCols() + j] : smat[i + numRows() * j]),
alpha *
((tr == false) ? smat[i * numCols() + j]
: smat[i + numRows() * j]),
pmat_[addr(i, j)]);
}
}
}

const float16 &operator()(const int r, const int c) const {
const float16& operator()(const int r, const int c) const {
uint64_t a = addr(r, c);
assert(r < numRows());
assert(c < numCols());
assert(a < this->matSize());
return pmat_[a];
}

int matSize() const { return size_; }
int numRows() const { return nrow_; }
int numCols() const { return ncol_; }
inline int blockRowSize() const { return brow_; }
inline int blockColSize() const { return bcol_; }
int matSize() const {
return size_;
}
int numRows() const {
return nrow_;
}
int numCols() const {
return ncol_;
}
inline int blockRowSize() const {
return brow_;
}
inline int blockColSize() const {
return bcol_;
}

int nrow_, ncol_;
int brow_, last_brow_, bcol_;
int nbrow_, nbcol_;
uint64_t size_;
float16 *pmat_;

friend void cblas_gemm_compute(const matrix_op_t transa, const int m,
const float *A,
const PackedGemmMatrixFP16 &Bp,
const float beta, float *C);
friend void cblas_gemm_compute(const matrix_op_t transa, const int m,
const float *A,
const PackedGemmMatrixFP16 &Bp,
const float beta, float *C);
float16* pmat_;

friend void cblas_gemm_compute(
const matrix_op_t transa,
const int m,
const float* A,
const PackedGemmMatrixFP16& Bp,
const float beta,
float* C);
friend void cblas_gemm_compute(
const matrix_op_t transa,
const int m,
const float* A,
const PackedGemmMatrixFP16& Bp,
const float beta,
float* C);
};

/**
* restrictions: transa == CblasNoTrans
*/
extern void cblas_gemm_compute(const matrix_op_t transa, const int m,
const float *A,
const PackedGemmMatrixFP16 &Bp,
const float beta, float *C);
extern void cblas_gemm_compute(const matrix_op_t transa, const int m,
const float *A,
const PackedGemmMatrixFP16 &Bp,
const float beta, float *C);

}; // namespace fbgemm
extern void cblas_gemm_compute(
const matrix_op_t transa,
const int m,
const float* A,
const PackedGemmMatrixFP16& Bp,
const float beta,
float* C);
extern void cblas_gemm_compute(
const matrix_op_t transa,
const int m,
const float* A,
const PackedGemmMatrixFP16& Bp,
const float beta,
float* C);

}; // namespace fbgemm2
10 changes: 6 additions & 4 deletions src/FbgemmFP16.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include "fbgemm/FbgemmFP16.h"

#include <cpuinfo.h>
#include <array>
#include <utility>

#include "FbgemmFP16UKernels.h"

Expand Down Expand Up @@ -44,7 +46,7 @@ struct KernelInfo {

// autotuned kernel splits for various cases m = 1:mb_max
// may need re-autotuning for new uarch
static constexpr array<array<pair<int, int>, 2>, 121 > partition = {
static constexpr array<array<array<int, 2>, 2>, 121 > partition = {
{
{{ { 0, 0 }, { 0, 0 } } },
{{ { 1, 1 }, { 0, 0 } } },
Expand Down Expand Up @@ -171,7 +173,7 @@ struct KernelInfo {
};
};
constexpr array<KernelInfo::knl_ptr, 15> KernelInfo::kernel;
constexpr array<array<pair<int, int>, 2>, 121 > KernelInfo::partition;
constexpr array<array<array<int, 2>, 2>, 121 > KernelInfo::partition;

// autotuned kernel splits for various cases m = 1:mb_max
void
Expand Down Expand Up @@ -220,8 +222,8 @@ cblas_gemm_compute(const matrix_op_t transa, const int m, const float *A,
auto m1 = 0;
for (auto c = 0; c < 2; c++) {

auto kernel_nrows = KernelInfo::partition[mb][c].first;
auto nkernel_nrows = KernelInfo::partition[mb][c].second;
auto kernel_nrows = KernelInfo::partition[mb][c][0];
auto nkernel_nrows = KernelInfo::partition[mb][c][1];

auto m_start = m1, m_end = m1 + kernel_nrows * nkernel_nrows;
for (auto m2 = m_start; m2 < m_end; m2 += kernel_nrows) {
Expand Down
Loading

0 comments on commit 690dbc2

Please sign in to comment.