Move PackedMatrixB into separate file (pytorch#436)
Summary:
Pull Request resolved: pytorch#436

Extracting the matrix definition into its own header and making it a template.
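
In rough outline (a hypothetical, heavily trimmed sketch; the real template lives in include/fbgemm/FbgemmPackMatrixB.h and carries the full packing logic shown in the diff below):

template <typename T> // T is the packed element type, e.g. float16
class PackedGemmMatrixB {
 public:
  using value_type = T;
  // packing, addressing, and accessor members as in the class below ...
};

// FbgemmFP16.h keeps the old name as a thin alias:
using PackedGemmMatrixFP16 = PackedGemmMatrixB<float16>;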

Reviewed By: dskhudia

Differential Revision: D22152350

fbshipit-source-id: afb1836cb142835f3cac11f29469e7ed5449808a
efiks authored and facebook-github-bot committed Oct 15, 2020
1 parent be57354 commit 48bc1b2
Showing 3 changed files with 255 additions and 248 deletions.
5 changes: 3 additions & 2 deletions defs.bzl
@@ -48,23 +48,24 @@ def get_fbgemm_generic_srcs(with_base = False):

 def get_fbgemm_public_headers():
     return [
+        "include/fbgemm/ConvUtils.h",
         "include/fbgemm/Fbgemm.h",
         "include/fbgemm/FbgemmBuild.h",
         "include/fbgemm/FbgemmFP16.h",
         "include/fbgemm/FbgemmEmbedding.h",
         "include/fbgemm/FbgemmConvert.h",
         "include/fbgemm/FbgemmI64.h",
         "include/fbgemm/FbgemmI8DepthwiseAvx2.h",
+        "include/fbgemm/FbgemmI8Spmdm.h",
+        "include/fbgemm/FbgemmPackMatrixB.h",
         "include/fbgemm/OutputProcessing-inl.h",
         "include/fbgemm/PackingTraits-inl.h",
         "include/fbgemm/QuantUtils.h",
         "include/fbgemm/QuantUtilsAvx2.h",
         "include/fbgemm/QuantUtilsAvx512.h",
         "include/fbgemm/Utils.h",
         "include/fbgemm/UtilsAvx2.h",
-        "include/fbgemm/ConvUtils.h",
         "include/fbgemm/Types.h",
-        "include/fbgemm/FbgemmI8Spmdm.h",
     ]

def get_fbgemm_avx2_srcs(msvc = False):
248 changes: 2 additions & 246 deletions include/fbgemm/FbgemmFP16.h
@@ -18,12 +18,11 @@

#include "./Types.h"
#include "./Utils.h"
#include "./FbgemmPackMatrixB.h"

namespace fbgemm {

// forward declaration
class PackedGemmMatrixFP16;

using PackedGemmMatrixFP16 = PackedGemmMatrixB<float16>;
/**
* restrictions: transa == CblasNoTrans
*/
@@ -37,247 +36,4 @@ FBGEMM_API void cblas_gemm_compute(
int thread_id = 0,
int num_threads = 1);

/// Class that performs packing of a matrix in
/// row-major format into an
/// internal packed blocked-row-major format
class PackedGemmMatrixFP16 {
public:
using value_type = float16;
using size_type = uint64_t;

// Takes the smat input matrix in row-major format;
// packs it into a gemm-friendly blocked format;
// allocates space and sets up all the internal variables;
// also premultiplies by alpha during packing.
// brow_ contains the tile size along the k dimension
// and is also the number of fma updates into the int16 container
// before flushing into fp32.
// The smaller brow_ is, the higher the flushing overhead.
// kernel_ncol_blocks is the number of column blocks (each the size of 8 fp16
// values, i.e. 128 bits, or one xmm register) in the kernel. Because the
// batch size can be dynamic and we need to prepack the weight matrix B, the
// internal packing layout of the weight matrix and kernel_ncol_blocks have to
// be fixed. We can choose kernel_ncol_blocks = 1 (with kernels of 1x1~14x1
// register layouts), 2 (with kernels of 1x2~6x2 register layouts), or 3 (with
// kernels of 1x3~4x3 register layouts).
PackedGemmMatrixFP16(
const matrix_op_t trans,
const int nrow,
const int ncol,
const float alpha,
const float* smat,
const int brow = 512)
: nrow_(nrow), ncol_(ncol), brow_(brow), kernel_ncol_blocks_(2) {
initializeParam();
initializeMemory();
// copy source matrix into packed matrix
this->packFromSrc(trans, alpha, smat);
}

PackedGemmMatrixFP16(
const int nrow,
const int ncol,
const int brow,
const int last_brow,
const int bcol,
const int nbrow,
const int nbcol,
const uint64_t size)
: nrow_(nrow),
ncol_(ncol),
brow_(brow),
last_brow_(last_brow),
bcol_(bcol),
nbrow_(nbrow),
nbcol_(nbcol),
size_(size),
kernel_ncol_blocks_(2) {
initializeMemory();
}

void initializeParam() {
if (!cpuinfo_initialize()) {
throw std::runtime_error("Failed to initialize cpuinfo!");
}
bcol_ = (isZmm(fbgemmInstructionSet())
? simd_info<inst_set_t::avx512>::WIDTH_32BIT_ELEMS
: simd_info<inst_set_t::avx2>::WIDTH_32BIT_ELEMS) *
kernelNumColBlocks();

// set up internal packing parameters
nbrow_ = (numRows() + blockRowSize() - 1) / blockRowSize();
last_brow_ = ((nrow_ % blockRowSize()) == 0) ? blockRowSize()
: (nrow_ % blockRowSize());
nbcol_ = (numCols() + blockColSize() - 1) / blockColSize();

if (numCols() != blockColSize() * nbcol_) {
#ifdef VLOG
VLOG(0) << "Packer warning: ncol(" << numCols()
<< ") is not a multiple of internal block size ("
<< blockColSize() << ")";
VLOG(0) << "lefover is not super optimized hence overhead will inccur";
#endif
}
}
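
// Worked example for initializeParam() (hypothetical shape, for
// illustration): with nrow_ = 1000 and brow_ = 512, nbrow_ = 2 and
// last_brow_ = 1000 % 512 = 488. On avx2,
// simd_info<inst_set_t::avx2>::WIDTH_32BIT_ELEMS = 8, so with
// kernelNumColBlocks() = 2 we get bcol_ = 16; ncol_ = 100 then gives
// nbcol_ = 7 and trips the warning above, since 7 * 16 = 112 != 100.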

void setPacked(bool p) {
packed_ = p;
}

bool packed() const {
return packed_;
}

void initializeMemory() {
// allocate and initialize packed memory
const int padding = 1024; // required by sw pipelined kernels
size_ = (blockRowSize() * nbrow_) * (blockColSize() * nbcol_);
pmat_ = static_cast<float16*>(
fbgemmAlignedAlloc(64, matSize() * sizeof(float16) + padding));
for (auto i = 0; i < matSize(); i++) {
pmat_[i] = cpu_float2half_rn(0.0f);
}
}
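
// Continuing the example (hypothetical numbers): nbrow_ = 2, nbcol_ = 7,
// so size_ = (512 * 2) * (16 * 7) = 114688 elements and the allocation is
// 114688 * sizeof(float16) + 1024 = 230400 bytes; the padding presumably
// lets the software-pipelined kernels read past the logical end safely.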

~PackedGemmMatrixFP16() {
fbgemmAlignedFree(pmat_);
}

void unpackFromSrc(const matrix_op_t trans, float16* src_mat) {
bool tr = (trans == matrix_op_t::Transpose);
for (int i = 0; i < numRows(); i++) {
for (int j = 0; j < numCols(); j++) {
pmat_[tr ? i + numRows() * j : i * numCols() + j] = src_mat[addr(i, j)];
}
}
packed_ = false;
}

void unpack(float16* origin_buf, const matrix_op_t trans) {
assert(packed_);
bool tr = (trans == matrix_op_t::Transpose);
for (int i = 0; i < numRows(); i++) {
for (int j = 0; j < numCols(); j++) {
origin_buf[tr ? i + numRows() * j : i * numCols() + j] =
pmat_[addr(i, j)];
}
}
}

// protected:
// blocked row-major format address arithmetic
uint64_t addr(const int r_, const int c_) const {
uint64_t r = (uint64_t)r_;
uint64_t c = (uint64_t)c_;

uint64_t block_row_id = r / blockRowSize(),
brow_offset =
(block_row_id * nbcol_) * (blockRowSize() * blockColSize());
uint64_t block_col_id = c / blockColSize(),
bcol_offset = block_col_id *
((block_row_id != nbrow_ - 1) ? (blockRowSize() * blockColSize())
: (last_brow_ * blockColSize()));
uint64_t block_offset = brow_offset + bcol_offset;
uint64_t inblock_offset =
r % blockRowSize() * blockColSize() + c % blockColSize();

uint64_t index = block_offset + inblock_offset;
assert(index < matSize());
return index;
}
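
// Worked example (same hypothetical shape as above): addr(600, 20) with
// brow_ = 512, bcol_ = 16, nbrow_ = 2, nbcol_ = 7, last_brow_ = 488:
// block_row_id = 600 / 512 = 1, so brow_offset = (1 * 7) * (512 * 16) = 57344;
// block_col_id = 20 / 16 = 1, and because block_row_id == nbrow_ - 1 the
// short last row block applies: bcol_offset = 1 * (488 * 16) = 7808;
// inblock_offset = (600 % 512) * 16 + (20 % 16) = 88 * 16 + 4 = 1412;
// index = 57344 + 7808 + 1412 = 66564, which is < matSize() = 114688.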

void
packFromSrc(const matrix_op_t trans, const float alpha, const float* smat) {
bool tr = (trans == matrix_op_t::Transpose);
// pack
for (int i = 0; i < numRows(); i++) {
for (int j = 0; j < numCols(); j++) {
constexpr float FP16_MAX = 65504.f;
float src = alpha *
((tr == false) ? smat[i * numCols() + j] : smat[i + numRows() * j]);
src = std::max(-FP16_MAX, std::min(src, FP16_MAX));
pmat_[addr(i, j)] = cpu_float2half_rn(src);
}
}
packed_ = true;
}
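
// The clamp keeps the float -> fp16 conversion from overflowing to infinity:
// e.g. (hypothetical values) alpha = 2.0f and a source entry of 40000.0f give
// src = 80000.0f, which is saturated to FP16_MAX = 65504.0f before
// cpu_float2half_rn converts it.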

// This function takes in an unpacked float16 matrix of the same size and
// packs it. There is no floating type conversion.
void packFromSrc(const matrix_op_t trans, const float16* smat) {
bool tr = (trans == matrix_op_t::Transpose);
for (int i = 0; i < numRows(); ++i) {
for (int j = 0; j < numCols(); ++j) {
pmat_[addr(i, j)] = smat[tr ? i + numRows() * j : i * numCols() + j];
}
}
packed_ = true;
}

const float16& operator()(const int r, const int c) const {
uint64_t a = addr(r, c);
assert(r < numRows());
assert(c < numCols());
assert(a < this->matSize());
return pmat_[a];
}

int matSize() const {
return size_;
}
int numRows() const {
return nrow_;
}
int numCols() const {
return ncol_;
}
int lastBrow() const {
return last_brow_;
}
int numBrow() const {
return nbrow_;
}
int numBcol() const {
return nbcol_;
}
float16* pmat() const {
return pmat_;
}
inline int blockRowSize() const {
return brow_;
}
inline int blockColSize() const {
return bcol_;
}
inline int kernelNumColBlocks() const {
return kernel_ncol_blocks_;
}

const value_type* data() const {
return pmat_;
}

uint64_t size() const {
return size_ / sizeof(value_type);
}

int nrow_, ncol_;
int brow_, last_brow_, bcol_;
int nbrow_, nbcol_;
uint64_t size_;
int kernel_ncol_blocks_;
float16* pmat_;
bool packed_{false};

friend void cblas_gemm_compute(
const matrix_op_t transa,
const int m,
const float* A,
const PackedGemmMatrixFP16& Bp,
const float beta,
float* C,
int thread_id,
int num_threads);
};
}; // namespace fbgemm
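
For context, a minimal usage sketch of the packed-B API (hypothetical shapes and data; it assumes only the signatures visible above, namely the PackedGemmMatrixFP16 constructor and cblas_gemm_compute, and is not part of this commit):

#include "fbgemm/FbgemmFP16.h"
#include <vector>

int main() {
  constexpr int m = 4, k = 64, n = 32; // hypothetical GEMM shape
  std::vector<float> A(m * k, 1.0f);   // activations, row-major m x k
  std::vector<float> B(k * n, 0.5f);   // weights, row-major k x n
  std::vector<float> C(m * n, 0.0f);   // output, row-major m x n

  // Pack B once up front (alpha = 1.0f, no transpose, default brow = 512).
  fbgemm::PackedGemmMatrixFP16 Bp(
      fbgemm::matrix_op_t::NoTranspose, k, n, 1.0f, B.data());

  // C = A * Bp (beta = 0.0f), single thread.
  fbgemm::cblas_gemm_compute(
      fbgemm::matrix_op_t::NoTranspose, m, A.data(), Bp, 0.0f, C.data());
  return 0;
}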
