Skip to content

Commit

Permalink
Syncing with internal version. Fixes for Mac/clang build. Other minor…
Browse files Browse the repository at this point in the history
… fixes
  • Loading branch information
dskhudia committed Nov 4, 2018
1 parent 505eb84 commit 690dbc2
Show file tree
Hide file tree
Showing 15 changed files with 1,590 additions and 679 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ if(NOT TARGET asmjit)
#build asmjit
set(ASMJIT_STATIC ON)
add_subdirectory("${ASMJIT_SRC_DIR}" "${FBGEMM_BINARY_DIR}/asmjit")
set_property(TARGET asmjit PROPERTY POSITION_INDEPENDENT_CODE ON)
endif()

if(NOT TARGET cpuinfo)
Expand Down
29 changes: 20 additions & 9 deletions include/fbgemm/Fbgemm.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ class PackMatrix {
return last_bcol_ != blockColSize();
}

~PackMatrix() {
virtual ~PackMatrix() {
if (bufAllocatedHere_) {
free(buf_);
}
Expand Down Expand Up @@ -286,7 +286,7 @@ class PackMatrix {
* accumulation type is int32.
*/
template <typename T, typename accT = std::int32_t>
class PackAMatrix : public PackMatrix<PackAMatrix<T, accT>, T, accT> {
class PackAMatrix final : public PackMatrix<PackAMatrix<T, accT>, T, accT> {
public:
using This = PackAMatrix<T, accT>;
using BaseType = PackMatrix<This, T, accT>;
Expand All @@ -306,7 +306,7 @@ class PackAMatrix : public PackMatrix<PackAMatrix<T, accT>, T, accT> {
std::int32_t ld,
inpType* pmat = nullptr,
std::int32_t groups = 1,
accT zero_pt = 0);
std::int32_t zero_pt = 0);

/**
* Activation matrices are not constant so cannot amortize the cost of
Expand Down Expand Up @@ -361,7 +361,7 @@ class PackAMatrix : public PackMatrix<PackAMatrix<T, accT>, T, accT> {
* type is int32.
*/
template <typename T, typename accT = std::int32_t>
class PackBMatrix : public PackMatrix<PackBMatrix<T, accT>, T, accT> {
class PackBMatrix final : public PackMatrix<PackBMatrix<T, accT>, T, accT> {
public:
using This = PackBMatrix<T, accT>;
using BaseType = PackMatrix<This, T, accT>;
Expand All @@ -381,7 +381,7 @@ class PackBMatrix : public PackMatrix<PackBMatrix<T, accT>, T, accT> {
std::int32_t ld,
inpType* pmat = nullptr,
std::int32_t groups = 1,
accT zero_pt = 0);
std::int32_t zero_pt = 0);

/**
* Weight matrices are usually constant so worth pre-packing.
Expand Down Expand Up @@ -439,7 +439,8 @@ class PackBMatrix : public PackMatrix<PackBMatrix<T, accT>, T, accT> {
* quantized.
*/
template <typename T, typename accT = std::int32_t>
class PackAWithIm2Col : public PackMatrix<PackAWithIm2Col<T, accT>, T, accT> {
class PackAWithIm2Col final
: public PackMatrix<PackAWithIm2Col<T, accT>, T, accT> {
public:
using This = PackAWithIm2Col<T, accT>;
using BaseType = PackMatrix<This, T, accT>;
Expand Down Expand Up @@ -499,7 +500,7 @@ class PackAWithIm2Col : public PackMatrix<PackAWithIm2Col<T, accT>, T, accT> {
* The source matrix is already quantized.
*/
template <typename T, typename accT = std::int32_t>
class PackAWithRowOffset
class PackAWithRowOffset final
: public PackMatrix<PackAWithRowOffset<T, accT>, T, accT> {
public:
using This = PackAWithRowOffset<T, accT>;
Expand Down Expand Up @@ -572,7 +573,7 @@ class PackAWithRowOffset
* The source matrix is in fp32 and quantized during packing.
*/
template <typename T, typename accT = std::int32_t>
class PackAWithQuantRowOffset
class PackAWithQuantRowOffset final
: public PackMatrix<PackAWithQuantRowOffset<T, accT>, T, accT> {
public:
using This = PackAWithQuantRowOffset<T, accT>;
Expand Down Expand Up @@ -935,7 +936,6 @@ void fbgemmPacked(
/**
* @brief Perform depthwise separable convolution
*/

template <
typename packingAMatrix,
typename packingBMatrix,
Expand All @@ -949,4 +949,15 @@ void convDepthwiseSeparable(
outT* out,
const processOutputType& output);

/**
 * @brief Allocate size bytes of uninitialized storage whose alignment is
 *        specified by align.
 *
 * @param align alignment in bytes; per posix_memalign it must be a power of
 *              two and a multiple of sizeof(void*)
 * @param size  number of bytes to allocate
 * @return pointer to the aligned storage, or nullptr on failure
 */
static void* fbgemmAlignedAlloc(size_t align, size_t size) {
  // Note: double-underscore names (__align/__size) are reserved to the
  // implementation in C++, so plain identifiers are used here.
  void* aligned_mem = nullptr;
  // posix_memalign returns nonzero on failure and leaves aligned_mem
  // unspecified, so report failure explicitly with nullptr.
  if (posix_memalign(&aligned_mem, align, size) != 0) {
    return nullptr;
  }
  return aligned_mem;
}

} // namespace fbgemm2
129 changes: 77 additions & 52 deletions include/fbgemm/FbgemmFP16.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
// upgraded to match with new fbgemm interface.

#include <cassert>
#include <cstdlib>
#include <memory>
#include <vector>

Expand All @@ -22,7 +23,7 @@ namespace fbgemm2 {
/// row-major format into
/// internal packed blocked-row major format
class PackedGemmMatrixFP16 {
public:
public:
// takes smat input matrix in row-major format;
// and packs it into gemm-friendly blocked format;
// allocate space and sets up all the internal variables;
Expand All @@ -32,30 +33,31 @@ class PackedGemmMatrixFP16 {
// before flushing into fp32
// the smaller the brow_, the higher overhead
// of flushing is
PackedGemmMatrixFP16(const matrix_op_t trans, const int nrow,
const int ncol, const float alpha,
const float *smat,
const int brow = 512)
PackedGemmMatrixFP16(
const matrix_op_t trans,
const int nrow,
const int ncol,
const float alpha,
const float* smat,
const int brow = 512)
: nrow_(nrow), ncol_(ncol), brow_(brow) {

bcol_ = 8 * 1; // hardwired

// set up internal packing parameters
nbrow_ = ((numRows() % blockRowSize()) == 0)
? (numRows() / blockRowSize())
: ((numRows() + blockRowSize()) / blockRowSize());
? (numRows() / blockRowSize())
: ((numRows() + blockRowSize()) / blockRowSize());
last_brow_ = ((nrow % blockRowSize()) == 0) ? blockRowSize()
: (nrow % blockRowSize());
: (nrow % blockRowSize());
nbcol_ = ((numCols() % blockColSize()) == 0)
? (numCols() / blockColSize())
: ((numCols() + blockColSize()) / blockColSize());
? (numCols() / blockColSize())
: ((numCols() + blockColSize()) / blockColSize());

if (numCols() != blockColSize() * nbcol_) {
#ifdef VLOG
VLOG(0)
<< "Packer warning: ncol(" << numCols()
<< ") is not a multiple of internal block size (" << blockColSize()
<< ")";
VLOG(0) << "Packer warning: ncol(" << numCols()
<< ") is not a multiple of internal block size ("
<< blockColSize() << ")";
VLOG(0)
<< "lefover is currently done via MKL: hence overhead will inccur";
#endif
Expand All @@ -64,7 +66,9 @@ class PackedGemmMatrixFP16 {
// allocate and initialize packed memory
const int padding = 1024; // required by sw pipelined kernels
size_ = (blockRowSize() * nbrow_) * (blockColSize() * nbcol_);
pmat_ = (float16 *)aligned_alloc(64, matSize() * sizeof(float16) + padding);
// pmat_ = (float16 *)aligned_alloc(64, matSize() * sizeof(float16) +
// padding);
posix_memalign((void**)&pmat_, 64, matSize() * sizeof(float16) + padding);
for (auto i = 0; i < matSize(); i++) {
pmat_[i] = tconv(0.f, pmat_[i]);
}
Expand All @@ -77,7 +81,7 @@ class PackedGemmMatrixFP16 {
free(pmat_);
}

// protected:
// protected:
// blocked row-major format address arithmetic
uint64_t addr(const int r_, const int c_) const {
uint64_t r = (uint64_t)r_;
Expand All @@ -87,10 +91,9 @@ class PackedGemmMatrixFP16 {
brow_offset =
(block_row_id * nbcol_) * (blockRowSize() * blockColSize());
uint64_t block_col_id = c / blockColSize(),
bcol_offset =
block_col_id * ((block_row_id != nbrow_ - 1)
? (blockRowSize() * blockColSize())
: (last_brow_ * blockColSize()));
bcol_offset = block_col_id *
((block_row_id != nbrow_ - 1) ? (blockRowSize() * blockColSize())
: (last_brow_ * blockColSize()));
uint64_t block_offset = brow_offset + bcol_offset;
uint64_t inblock_offset =
r % blockRowSize() * blockColSize() + c % blockColSize();
Expand All @@ -100,61 +103,83 @@ class PackedGemmMatrixFP16 {
return index;
}

void packFromSrc(const matrix_op_t trans, const float alpha,
const float *smat) {
void
packFromSrc(const matrix_op_t trans, const float alpha, const float* smat) {
bool tr = (trans == matrix_op_t::Transpose);
// pack
for (int i = 0; i < numRows(); i++) {
for (int j = 0; j < numCols(); j++) {
pmat_[addr(i, j)] = tconv(
alpha * (
(tr == false)
? smat[i * numCols() + j] : smat[i + numRows() * j]),
alpha *
((tr == false) ? smat[i * numCols() + j]
: smat[i + numRows() * j]),
pmat_[addr(i, j)]);
}
}
}

const float16 &operator()(const int r, const int c) const {
const float16& operator()(const int r, const int c) const {
uint64_t a = addr(r, c);
assert(r < numRows());
assert(c < numCols());
assert(a < this->matSize());
return pmat_[a];
}

int matSize() const { return size_; }
int numRows() const { return nrow_; }
int numCols() const { return ncol_; }
inline int blockRowSize() const { return brow_; }
inline int blockColSize() const { return bcol_; }
int matSize() const {
return size_;
}
int numRows() const {
return nrow_;
}
int numCols() const {
return ncol_;
}
inline int blockRowSize() const {
return brow_;
}
inline int blockColSize() const {
return bcol_;
}

int nrow_, ncol_;
int brow_, last_brow_, bcol_;
int nbrow_, nbcol_;
uint64_t size_;
float16 *pmat_;

friend void cblas_gemm_compute(const matrix_op_t transa, const int m,
const float *A,
const PackedGemmMatrixFP16 &Bp,
const float beta, float *C);
friend void cblas_gemm_compute(const matrix_op_t transa, const int m,
const float *A,
const PackedGemmMatrixFP16 &Bp,
const float beta, float *C);
float16* pmat_;

friend void cblas_gemm_compute(
const matrix_op_t transa,
const int m,
const float* A,
const PackedGemmMatrixFP16& Bp,
const float beta,
float* C);
friend void cblas_gemm_compute(
const matrix_op_t transa,
const int m,
const float* A,
const PackedGemmMatrixFP16& Bp,
const float beta,
float* C);
};

/**
* restrictions: transa == CblasNoTrans
*/
extern void cblas_gemm_compute(const matrix_op_t transa, const int m,
const float *A,
const PackedGemmMatrixFP16 &Bp,
const float beta, float *C);
extern void cblas_gemm_compute(const matrix_op_t transa, const int m,
const float *A,
const PackedGemmMatrixFP16 &Bp,
const float beta, float *C);

}; // namespace fbgemm
extern void cblas_gemm_compute(
const matrix_op_t transa,
const int m,
const float* A,
const PackedGemmMatrixFP16& Bp,
const float beta,
float* C);
extern void cblas_gemm_compute(
const matrix_op_t transa,
const int m,
const float* A,
const PackedGemmMatrixFP16& Bp,
const float beta,
float* C);

}; // namespace fbgemm2
10 changes: 6 additions & 4 deletions src/FbgemmFP16.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include "fbgemm/FbgemmFP16.h"

#include <cpuinfo.h>
#include <array>
#include <utility>

#include "FbgemmFP16UKernels.h"

Expand Down Expand Up @@ -44,7 +46,7 @@ struct KernelInfo {

// autotuned kernel splits for various cases m = 1:mb_max
// may need re-autotuning for new uarch
static constexpr array<array<pair<int, int>, 2>, 121 > partition = {
static constexpr array<array<array<int, 2>, 2>, 121 > partition = {
{
{{ { 0, 0 }, { 0, 0 } } },
{{ { 1, 1 }, { 0, 0 } } },
Expand Down Expand Up @@ -171,7 +173,7 @@ struct KernelInfo {
};
};
constexpr array<KernelInfo::knl_ptr, 15> KernelInfo::kernel;
constexpr array<array<pair<int, int>, 2>, 121 > KernelInfo::partition;
constexpr array<array<array<int, 2>, 2>, 121 > KernelInfo::partition;

// autotuned kernel splits for various cases m = 1:mb_max
void
Expand Down Expand Up @@ -220,8 +222,8 @@ cblas_gemm_compute(const matrix_op_t transa, const int m, const float *A,
auto m1 = 0;
for (auto c = 0; c < 2; c++) {

auto kernel_nrows = KernelInfo::partition[mb][c].first;
auto nkernel_nrows = KernelInfo::partition[mb][c].second;
auto kernel_nrows = KernelInfo::partition[mb][c][0];
auto nkernel_nrows = KernelInfo::partition[mb][c][1];

auto m_start = m1, m_end = m1 + kernel_nrows * nkernel_nrows;
for (auto m2 = m_start; m2 < m_end; m2 += kernel_nrows) {
Expand Down
Loading

0 comments on commit 690dbc2

Please sign in to comment.