per-group and per-channel quantization (#14340)
Summary:
Pull Request resolved: pytorch/pytorch#14340

Pull Request resolved: pytorch#25

Per-group and per-channel quantization in fbgemm.
This diff also cleans up explicit template instantiations using macro expansion.
It also changes the randFill interface, which previously made it easy to generate integer random numbers for floating-point vectors by mistake.

Using this in DNNLOWP operators will be done in a separate diff.
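For readers skimming the diff: per-channel quantization gives each output channel of the weight tensor its own scale and zero point instead of one pair for the whole tensor; per-group quantization is the same idea with channels partitioned into groups that share parameters. A minimal sketch of the concept, with hypothetical names (this is not fbgemm's actual API):

    // Illustrative per-channel quantization: each channel derives its own
    // scale/zero point from its own min/max. Not fbgemm's real interface.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    struct QuantParams {
      float scale;
      std::int32_t zero_point;
    };

    // W is row-major: num_channels rows of channel_len weights each.
    std::vector<QuantParams> quantizePerChannel(
        const std::vector<float>& W,
        int num_channels,
        int channel_len,
        std::vector<std::int8_t>& Wq) {
      std::vector<QuantParams> qparams(num_channels);
      Wq.resize(W.size());
      for (int c = 0; c < num_channels; ++c) {
        const float* src = W.data() + c * channel_len;
        auto mm = std::minmax_element(src, src + channel_len);
        float scale = std::max((*mm.second - *mm.first) / 255.0f, 1e-8f);
        // Map this channel's min to -128 so the full int8 range is used.
        auto zp = static_cast<std::int32_t>(
            std::round(-128.0f - *mm.first / scale));
        qparams[c] = {scale, zp};
        for (int i = 0; i < channel_len; ++i) {
          std::int32_t q =
              zp + static_cast<std::int32_t>(std::round(src[i] / scale));
          Wq[c * channel_len + i] =
              static_cast<std::int8_t>(std::min(127, std::max(-128, q)));
        }
      }
      return qparams;
    }

Finer-grained parameters cost a little bookkeeping but substantially reduce quantization error when per-channel ranges differ widely, which is common for convolution weights.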

Reviewed By: dskhudia

Differential Revision: D13176386

fbshipit-source-id: e46c53e31e21520bded71b8ed86e8b19e010e2dd
jspark1105 authored and facebook-github-bot committed Nov 27, 2018
1 parent db52c82 commit d4ee77f
Showing 24 changed files with 1,466 additions and 1,049 deletions.
51 changes: 39 additions & 12 deletions bench/BenchUtils.cc
@@ -5,30 +5,41 @@
* LICENSE file in the root directory of this source tree.
*/
#include "BenchUtils.h"

+#include <algorithm>
#include <random>
+#include <type_traits>

+#include <omp.h>

namespace fbgemm {

std::default_random_engine eng;

template <typename T>
-void randFill(aligned_vector<T>& vec, const int low, const int high) {
-  std::random_device r;
-  std::uniform_int_distribution<int> dis(low, high);
-  for (auto& v : vec) {
-    v = static_cast<T>(dis(eng));
-  }
+void randFill(aligned_vector<T>& vec, T low, T high, std::true_type) {
+  std::uniform_int_distribution<T> dis(low, high);
+  std::generate(vec.begin(), vec.end(), [&] { return dis(eng); });
}

+template <typename T>
+void randFill(aligned_vector<T>& vec, T low, T high, std::false_type) {
+  std::uniform_real_distribution<T> dis(low, high);
+  std::generate(vec.begin(), vec.end(), [&] { return dis(eng); });
+}
+
+template <typename T>
+void randFill(aligned_vector<T>& vec, T low, T high) {
+  randFill(vec, low, high, std::is_integral<T>());
+}

template void
-randFill<float>(aligned_vector<float>& vec, const int low, const int high);
+randFill<float>(aligned_vector<float>& vec, float low, float high);
template void
-randFill<uint8_t>(aligned_vector<uint8_t>& vec, const int low, const int high);
+randFill<uint8_t>(aligned_vector<uint8_t>& vec, uint8_t low, uint8_t high);
template void
-randFill<int8_t>(aligned_vector<int8_t>& vec, const int low, const int high);
+randFill<int8_t>(aligned_vector<int8_t>& vec, int8_t low, int8_t high);
-template void
-randFill<int>(aligned_vector<int>& vec, const int low, const int high);
+template void randFill<int>(aligned_vector<int>& vec, int low, int high);

void llc_flush(std::vector<char>& llc) {
volatile char* data = llc.data();
@@ -37,4 +48,20 @@ void llc_flush(std::vector<char>& llc) {
}
}

+int fbgemm_get_num_threads() {
+#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
+  return 1;
+#else
+  return omp_get_num_threads();
+#endif
+}
+
+int fbgemm_get_thread_num() {
+#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
+  return 0;
+#else
+  return omp_get_thread_num();
+#endif
+}

} // namespace fbgemm
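The tag dispatch above means the vector's element type now chooses the distribution, so floating-point vectors get real-valued fills instead of silently truncated integers. A usage sketch:

    aligned_vector<float> fvec(1024);
    randFill(fvec, 0.0f, 1.0f); // uses uniform_real_distribution<float>

    aligned_vector<int> ivec(1024);
    randFill(ivec, -10, 10); // uses uniform_int_distribution<int>

One caveat: the standard does not require std::uniform_int_distribution to accept 8-bit types such as uint8_t or int8_t, and some standard libraries reject those instantiations outright, so the uint8_t/int8_t instantiations here rely on implementation leniency.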
5 changes: 4 additions & 1 deletion bench/BenchUtils.h
@@ -11,8 +11,11 @@
namespace fbgemm {

template <typename T>
-void randFill(aligned_vector<T>& vec, const int low, const int high);
+void randFill(aligned_vector<T>& vec, T low, T high);

void llc_flush(std::vector<char>& llc);

+int fbgemm_get_num_threads();
+int fbgemm_get_thread_num();

} // namespace fbgemm
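These two helpers centralize the #ifdef _OPENMP / FBGEMM_MEASURE_TIME_BREAKDOWN boilerplate that each benchmark previously repeated. Inside a parallel region they are used the way the benchmarks below do:

    #pragma omp parallel
    {
      int num_threads = fbgemm_get_num_threads(); // 1 if OpenMP is disabled
      int tid = fbgemm_get_thread_num(); // 0 if OpenMP is disabled
      // ... each thread then derives its own slice of the work ...
    }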
22 changes: 6 additions & 16 deletions bench/Depthwise3DBenchmark.cc
@@ -62,10 +62,10 @@ int main() {
aligned_vector<int32_t> C_ref(N * T_OUT * H_OUT * W_OUT * K),
C(C_ref.size());

-randFill(A, 0, 86);
+randFill<uint8_t>(A, 0, 86);
int32_t A_zero_point = 43;

-randFill(B, -16, 16);
+randFill<int8_t>(B, -16, 16);
int32_t B_zero_point = 5;

depthwise_3x3x3_pad_1_ref(
@@ -129,13 +129,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#if _OPENMP
-int num_threads = omp_get_num_threads();
-int tid = omp_get_thread_num();
-#else
-int num_threads = 1;
-int tid = 0;
-#endif
+int num_threads = fbgemm_get_num_threads();
+int tid = fbgemm_get_thread_num();
depthwise_3x3x3_pad_1(
N,
T,
@@ -200,13 +195,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#if _OPENMP
-int num_threads = omp_get_num_threads();
-int tid = omp_get_thread_num();
-#else
-int num_threads = 1;
-int tid = 0;
-#endif
+int num_threads = fbgemm_get_num_threads();
+int tid = fbgemm_get_thread_num();
depthwise_3x3x3_pad_1(
N,
T,
22 changes: 6 additions & 16 deletions bench/DepthwiseBenchmark.cc
@@ -161,10 +161,10 @@ int main() {
aligned_vector<int8_t> B(G * R * S);
aligned_vector<int32_t> C_ref(N * H_OUT * W_OUT * G), C(C_ref.size());

-randFill(A, 0, 86);
+randFill<uint8_t>(A, 0, 86);
int32_t A_zero_point = 43;

-randFill(B, -16, 16);
+randFill<int8_t>(B, -16, 16);
int32_t B_zero_point = 5;

depthwise_3x3_pad_1_ref(
@@ -221,13 +221,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#ifdef _OPENMP
-int num_threads = omp_get_num_threads();
-int tid = omp_get_thread_num();
-#else
-int num_threads = 1;
-int tid = 0;
-#endif
+int num_threads = fbgemm_get_num_threads();
+int tid = fbgemm_get_thread_num();
depthwise_3x3_pad_1(
N,
H,
@@ -279,13 +274,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#ifdef _OPENMP
-int num_threads = omp_get_num_threads();
-int tid = omp_get_thread_num();
-#else
-int num_threads = 1;
-int tid = 0;
-#endif
+int num_threads = fbgemm_get_num_threads();
+int tid = fbgemm_get_thread_num();
depthwise_3x3_pad_1(
N,
H,
34 changes: 19 additions & 15 deletions bench/FP16Benchmark.cc
@@ -73,20 +73,24 @@ void performance_test() {
int n = s[1];
int k = s[2];

-aligned_vector<float> A(m * k, 0.f);
-aligned_vector<float> B(k * n, 0.f);
-aligned_vector<float> Cg(m * n, 1.f);
-aligned_vector<float> Cp(m * n, NAN);
+aligned_vector<float> C_ref(m * n, 1.f);
+aligned_vector<float> C_fb(m * n, NAN);

// initialize with small numbers
-randFill(A, 0, 4);
+aligned_vector<int> Aint(m * k);
+randFill(Aint, 0, 4);
+aligned_vector<float> A(Aint.begin(), Aint.end());

-randFill(B, 0, 4);
+aligned_vector<int> Bint(k * n);
+randFill(Bint, 0, 4);
+aligned_vector<float> B(Bint.begin(), Bint.end());
PackedGemmMatrixFP16 Bp(btran, k, n, alpha, B.data());

if (beta != 0.0f) {
-randFill(Cg, 0, 4);
-Cp = Cg;
+aligned_vector<int> Cint(C_ref.size());
+randFill(Cint, 0, 4);
+C_ref.assign(Cint.begin(), Cint.end());
+C_fb = C_ref;
}

double nflops = 2.0 * (double)m * (double)n * (double)k * (double)NITER;
@@ -111,17 +115,17 @@
B.data(),
(btran == matrix_op_t::NoTranspose) ? n : k,
beta,
-Cg.data(),
+C_ref.data(),
n);
#endif
cblas_gemm_compute(
-matrix_op_t::NoTranspose, m, A.data(), Bp, beta, Cp.data());
+matrix_op_t::NoTranspose, m, A.data(), Bp, beta, C_fb.data());

#ifdef USE_MKL
// Compare results
-for (auto i = 0; i < Cg.size(); i++) {
-// printf("%f %f\n", Cg[i], Cp[i]);
-assert(std::abs(Cg[i] - Cp[i]) < 1e-3);
+for (auto i = 0; i < C_ref.size(); i++) {
+// printf("%f %f\n", C_ref[i], C_fb[i]);
+assert(std::abs(C_ref[i] - C_fb[i]) < 1e-3);
}
#endif
}
@@ -151,7 +155,7 @@ void performance_test() {
B.data(),
(btran == matrix_op_t::NoTranspose) ? n : k,
beta,
-Cg.data(),
+C_ref.data(),
n);
t_end = chrono::system_clock::now();
if (it >= 0) {
Expand Down Expand Up @@ -184,7 +188,7 @@ void performance_test() {

t_begin = chrono::system_clock::now();
cblas_gemm_compute(
-matrix_op_t::NoTranspose, m, A.data(), Bp, beta, Cp.data());
+matrix_op_t::NoTranspose, m, A.data(), Bp, beta, C_fb.data());
t_end = chrono::system_clock::now();

if (it >= 0) {
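A note on the initialization change above: inputs are now drawn as small integers and then copied into float storage. Small integers are exactly representable in both fp32 and fp16, so the MKL reference result (C_ref) and the fbgemm result (C_fb) stay within the 1e-3 assert tolerance, and initializing C_fb to NAN makes the assert catch any output element the kernel fails to write. The pattern, restated:

    aligned_vector<int> Aint(m * k);
    randFill(Aint, 0, 4); // small integer values...
    aligned_vector<float> A(Aint.begin(), Aint.end()); // ...exact in fp16 too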
11 changes: 3 additions & 8 deletions bench/I8SpmdmBenchmark.cc
@@ -77,7 +77,7 @@ int main() {
cout << M << ", " << N << ", " << K << ", ";

aligned_vector<uint8_t> A(M * K);
-randFill(A, 0, 255);
+randFill<uint8_t>(A, 0, 255);

fbgemm::CompressedSparseColumn B_csc(K, N);
vector<int32_t> C(M * N);
@@ -156,13 +156,8 @@ int main() {
#pragma omp parallel
#endif
{
-#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
-int num_threads = 1;
-int tid = 0;
-#else
-int num_threads = omp_get_num_threads();
-int tid = omp_get_thread_num();
-#endif
+int num_threads = fbgemm_get_num_threads();
+int tid = fbgemm_get_thread_num();
int i_per_thread =
((M + 31) / 32 + num_threads - 1) / num_threads * 32;
int i_begin = std::min(tid * i_per_thread, M);
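The partitioning above hands each thread a contiguous chunk of rows rounded up to a multiple of 32, clamped to M; a matching i_end presumably follows in the elided lines. A worked example with M = 100 and num_threads = 3:

    // ceil(100 / 32) = 4 row blocks; ceil(4 / 3) = 2 blocks/thread; 2 * 32 = 64.
    int i_per_thread = ((M + 31) / 32 + num_threads - 1) / num_threads * 32;
    int i_begin = std::min(tid * i_per_thread, M); // 0, 64, 100
    int i_end = std::min(i_begin + i_per_thread, M); // 64, 100, 100
    // Thread 0 gets rows [0, 64), thread 1 gets [64, 100), thread 2 is idle.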
37 changes: 11 additions & 26 deletions bench/Im2ColFusedRequantizeAcc16Benchmark.cc
@@ -125,43 +125,29 @@ void performance_test() {

chrono::time_point<chrono::high_resolution_clock> begin, end;
for (auto conv_p : shapes) {
-aligned_vector<float> Afp32(
-    conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0.0f);
aligned_vector<uint8_t> Aint8(
-    conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0);
+    conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC);
aligned_vector<uint8_t> Aint8_out(
    conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.K[0] *
-        conv_p.K[1] * conv_p.IC,
-    0);
+        conv_p.K[1] * conv_p.IC);

-aligned_vector<float> Bfp32(
-    conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0.0f);
aligned_vector<int8_t> Bint8(
-    conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0);
+    conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC);

aligned_vector<int32_t> Cint32_ref(
-    conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-aligned_vector<int32_t> Cint32_fb(
-    conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-aligned_vector<int32_t> Cint32_fb2(
-    conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
+    conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC);
+aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+aligned_vector<int32_t> Cint32_fb2(Cint32_ref.size());

// A matrix (input activations)
-randFill(Afp32, 0, 5);
+randFill<uint8_t>(Aint8, 0, 5);
int32_t Aint8_zero_point = 4;
-for (auto i = 0; i < Afp32.size(); ++i) {
-  Aint8[i] = static_cast<uint8_t>(Afp32[i]);
-}
+aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());

// B matrix (weights)
-randFill(Bfp32, -4, 4);
+randFill<int8_t>(Bint8, -4, 4);
// int32_t Bint8_zero_point = -3;
-for (auto i = 0; i < Bfp32.size(); ++i) {
-  Bint8[i] = static_cast<int8_t>(Bfp32[i]);
-}
+aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());

// reference implementation
conv_ref(
@@ -184,8 +170,7 @@ double ttot = 0.0;
double ttot = 0.0;
string runType;

-vector<int32_t> row_offset_buf;
-row_offset_buf.resize(
+vector<int32_t> row_offset_buf(
    PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize());

PackAWithIm2Col<uint8_t, int16_t> packA(
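About row_offset_buf above: with a nonzero A zero point a0 and B zero point b0, the quantized GEMM expands as sum_k (A[i][k] - a0) * (B[k][j] - b0) = sum_k A[i][k] * B[k][j] - b0 * rowSumA[i] - a0 * colSumB[j] + K * a0 * b0, so the raw int32 accumulation must be corrected using per-row sums of A; PackAWithIm2Col precomputes those sums into this buffer. A sketch of the correction term (illustrative, not fbgemm's internal code):

    // acc is the raw int32 dot product of quantized row i of A and col j of B.
    std::int32_t correct(std::int32_t acc, std::int32_t row_sum_a,
                         std::int32_t col_sum_b, std::int32_t a_zp,
                         std::int32_t b_zp, int K) {
      return acc - b_zp * row_sum_a - a_zp * col_sum_b + K * a_zp * b_zp;
    }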
(Diffs for the remaining 17 changed files are not shown.)
