ABFT PR from latest version of FBGEMM

Summary: PR for Sihuan's record after his internship ends. Differential Revision: D23656360. fbshipit-source-id: 873dce53127475bdc002e66af7703071c8c05801
pytorch · sli049 · Sep 11, 2020 · Sep 11, 2020 · 539157a6876ff5da20df3a5e8a03799a80480a3f
commit 539157a6876ff5da20df3a5e8a03799a80480a3f
diff --git a/bench/BaselineAIm2ColFusedRequantizeBenchmark.cc b/bench/BaselineAIm2ColFusedRequantizeBenchmark.cc
diff --git a/bench/EncodedAPackedReq32Benchmark.cc b/bench/EncodedAPackedReq32Benchmark.cc
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * All rights reserved.
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef USE_MKL
+#include <mkl.h>
+#endif
+
+#include "./BenchUtils.h"
+#include "fbgemm/Fbgemm.h"
+#include "src/RefImplementations.h"
+#include "test/QuantizationHelpers.h"
+
+using namespace std;
+using namespace fbgemm;
+
+// Times fbgemmPacked with PackEncodedAWithRowOffset + ReQuantizeOutput for
+// each m, n, k shape; abft_needed only picks the label of the summary line.
+void performance_test(bool abft_needed) {
+  // clang-format off
+  const vector<vector<int>> shapes = {
+    // NOTE: clang-format wants to use a different formatting but the current
+    // formatting should be easier to read.
+    // m, n, k
+    #if 1
+    {156800, 4, 36},
+    {156800, 8, 36},
+    {156800, 16, 36},
+    {1, 128, 512},
+    {1, 1024, 256},
+    {1, 2048, 512},
+    {1, 4096, 1024},
+
+    {6, 256, 1024},
+    {6, 256, 2048},
+    {6, 512, 512},
+    {6, 1024, 256},
+    {6, 2048, 256},
+    {6, 2048, 512},
+    {6, 4096, 256},
+    {6, 4096, 1024},
+    {6, 4096, 2048},
+
+    {10, 2048, 256},
+    {10, 4096, 1024},
+
+    {20, 2048, 256},
+    {20, 4096, 1024},
+
+    {102, 1024, 512},
+    {102, 2323, 256},
+    {102, 512, 256},
+
+    {1, 800, 3200},
+    {1, 800, 8000},
+
+    {16, 256, 1500},
+    {16, 256, 1567},
+    {1, 128, 2876},
+    {16, 128, 1567},
+    {1, 128, 2722},
+    {16, 256, 512},
+    #endif
+  };
+  // clang-format on
+  const bool flush = true;
+  std::vector<char> llc;
+
+  if (flush) {
+    llc.resize(128 * 1024 * 1024, 1); // 128 MiB scratch buffer for llc_flush
+  }
+
+  constexpr int NWARMUP = 4; // untimed iterations per shape
+  constexpr int NITER = 10; // timed iterations per shape
+
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+  cout << "WARNING: the timer may be inaccurate when used by multiple threads."
+       << endl;
+  cout << setw(8) << "M, " << setw(8) << "N, " << setw(8) << "K, " << setw(22)
+       << "Packing (ms), " << setw(22) << "Kernel (ms), " << setw(22)
+       << "Postprocessing (ms), " << setw(22) << "Total (ms), " << setw(22)
+       << "Type, " << setw(5) << "GOPs" << endl;
+#else
+  cout << setw(8) << "M, " << setw(8) << "N, " << setw(8) << "K, " << setw(22)
+       << "Type, " << setw(5) << "GOPS" << endl;
+#endif
+
+  chrono::time_point<chrono::high_resolution_clock> start, end;
+  for (const auto& shape : shapes) {
+    const int m = shape[0];
+    const int n = shape[1];
+    const int k = shape[2];
+
+    float alpha = 1.f, beta = 0.f; // only read by the MKL sgemm call
+    aligned_vector<uint8_t> Aint8(m * k);
+
+    aligned_vector<int8_t> Bint8(k * n);
+
+    aligned_vector<float> Cfp32_mkl(m * n);
+    aligned_vector<uint8_t> Cint8_fb(Cfp32_mkl.size());
+    aligned_vector<int32_t> Cint32_local(Cfp32_mkl.size());
+    aligned_vector<int32_t> Cint32_buffer(Cfp32_mkl.size() + n);
+    aligned_vector<uint8_t> Cint8_local(Cfp32_mkl.size());
+
+    // A matrix: m x k, uint8
+    randFill<uint8_t>(Aint8, 0, 255);
+    // float Aint8_scale = 0.11;
+    int32_t Aint8_zero_point = 43;
+    aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
+
+    randFill<int8_t>(Bint8, -128, 127);
+    avoidOverflow(m, n, k, Aint8.data(), Bint8.data());
+
+    // float Bint8_scale = 0.49;
+    int32_t Bint8_zero_point = -30;
+    aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
+
+    // Column offsets of B, used by both requantization paths below.
+    vector<int32_t> col_offsets(n);
+    col_offsets_with_zero_pt_s8acc32_ref(
+        k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n);
+
+    double nops = 2.0 * m * n * k; // one multiply and one add per MAC
+    double ttot = 0.0;
+    string runType;
+#ifdef USE_MKL
+    runType = "MKL_fp32";
+    ttot = measureWithWarmup(
+        [&]() {
+          cblas_sgemm(
+              CblasRowMajor,
+              CblasNoTrans,
+              CblasNoTrans,
+              m,
+              n,
+              k,
+              alpha,
+              Afp32.data(),
+              k,
+              Bfp32.data(),
+              n,
+              beta,
+              Cfp32_mkl.data(),
+              n);
+        },
+        NWARMUP,
+        NITER,
+        [&]() {
+          if (flush) {
+            llc_flush(llc);
+          }
+        });
+    ttot *= 1e9; // convert to ns
+
+    ((volatile char*)(llc.data())); // NOTE(review): no effect as written
+
+    cout << setw(6) << m << ", " << setw(6) << n << ", " << setw(6) << k
+         << ", ";
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+    cout << setw(20) << fixed << setprecision(3) << 0.0f << ", " << setw(20)
+         << 0.0f << ", " << setw(20) << 0.0f << ", " << setw(20) << 0.0f
+         << ", ";
+#endif
+    cout << setw(20) << runType << ", " << setw(5) << fixed << setprecision(1)
+         << nops / ttot << endl;
+#endif
+    // Reference result: the *_ref routines below fill Cint8_local.
+    vector<int32_t> row_offsets(m);
+
+    float C_multiplier = 0.1234;
+    int32_t C_zero_pt = 5;
+
+    matmul_u8i8acc32_ref(
+        m, n, k, k, n, n, Aint8.data(), Bint8.data(), Cint32_local.data());
+
+    row_offsets_u8acc32_ref(m, k, k, Aint8.data(), row_offsets.data());
+
+    requantize_u8acc32_ref(
+        m,
+        n,
+        n,
+        Cint32_local.data(),
+        Cint8_local.data(),
+        &C_multiplier,
+        C_zero_pt,
+        Aint8_zero_point,
+        &Bint8_zero_point,
+        row_offsets.data(),
+        col_offsets.data(),
+        nullptr, // bias
+        n); // ncols per quant group
+    // printMatrix(matrix_op_t::NoTranspose, Bint8.data(), k, n, n, "B
+    // unpacked");
+    // printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k,
+    // "A unpacked");
+    // printMatrix(matrix_op_t::NoTranspose, Cint32_local.data(),
+    // m, n, n, "C int32");
+    // printMatrix(matrix_op_t::NoTranspose,
+    // Cint8_local.data(), m, n, n, "C requantized");
+    // printMatrix(matrix_op_t::NoTranspose, col_offsets.data(), 1, n, n, "col
+    // offsets before");
+
+    PackBMatrix<int8_t> packedBN(
+        matrix_op_t::NoTranspose, k, n, Bint8.data(), n, nullptr, 1);
+
+    ttot = 0.0; // restart the accumulator for the FBGEMM measurement
+    runType = "FBGEMM_i8_acc32";
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+    double total_packing_time = 0.0;
+    double total_computing_time = 0.0;
+    double total_kernel_time = 0.0;
+    double total_postprocessing_time = 0.0;
+    double total_run_time = 0.0;
+#endif
+    cout << setw(6) << m << ", " << setw(6) << n << ", " << setw(6) << k
+         << ", ";
+
+    // NWARMUP untimed runs, then NITER runs accumulated into ttot.
+    for (int i = 0; i < NWARMUP + NITER; ++i) {
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+      packing_time = 0.0;
+      computing_time = 0.0;
+      kernel_time = 0.0;
+      postprocessing_time = 0.0;
+      run_time = 0.0;
+#endif
+      llc_flush(llc);
+      start = chrono::high_resolution_clock::now();
+
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+      {
+        vector<int32_t> row_offset_buf(
+            PackEncodedAWithRowOffset<uint8_t>::rowOffsetBufferSize());
+
+        PackEncodedAWithRowOffset<uint8_t> packAN(
+            matrix_op_t::NoTranspose,
+            m,
+            k,
+            Aint8.data(),
+            k,
+            nullptr,
+            1,
+            row_offset_buf.data());
+
+        DoNothing<> doNothingObj{};
+        ReQuantizeOutput<false> outputProcObj(
+            doNothingObj,
+            &C_multiplier,
+            C_zero_pt,
+            Aint8_zero_point,
+            &Bint8_zero_point,
+            packAN.getRowOffsetBuffer(),
+            col_offsets.data(),
+            nullptr,
+            n);
+
+        int num_threads = fbgemm_get_num_threads();
+        int tid = fbgemm_get_thread_num();
+        // printf ( "tid: %d, num_threads: %d\n", tid, num_threads );
+        fbgemmPacked(
+            packAN,
+            packedBN,
+            Cint8_fb.data(),
+            Cint32_buffer.data(),
+            n,
+            outputProcObj,
+            tid,
+            num_threads);
+      }
+
+      end = chrono::high_resolution_clock::now();
+
+      if (i >= NWARMUP) {
+        auto dur = chrono::duration_cast<chrono::nanoseconds>(end - start);
+        ttot += dur.count(); // nanoseconds
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+        total_packing_time += packing_time;
+        total_computing_time += computing_time;
+        total_kernel_time += kernel_time;
+        total_postprocessing_time += postprocessing_time;
+        total_run_time += run_time;
+#endif
+      }
+    }
+    ((volatile char*)(llc.data())); // NOTE(review): no effect as written
+    // printMatrix(matrix_op_t::NoTranspose, Bint8.data(), k, n, n, "B
+    // unpacked");
+    // printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k,
+    // "A unpacked");
+    // printMatrix(matrix_op_t::NoTranspose, Cint8_local.data(),
+    // m, n, n, "C requantized after");
+    // printMatrix(matrix_op_t::NoTranspose,
+    // Cint8_fb.data(), m, n, n, "C fb");
+    // printMatrix(matrix_op_t::NoTranspose,
+    // col_offsets.data(), 1, n, n, "col offsets after");
+    // compare_buffers(row_offsets.data(), row_offset_buf.data(),
+    // row_offsets.size(), 5);
+
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+    cout << setprecision(3) << setw(20)
+         << total_packing_time / (double)NITER / 1e6 << ", " << setw(20)
+         << total_kernel_time / (double)NITER / 1e6 << ", " << setw(20)
+         << total_postprocessing_time / (double)NITER / 1e6 << ", " << setw(20)
+         << total_run_time / (double)NITER / 1e6 << ", ";
+#endif
+    cout << setw(20) << runType << ", " << setw(5) << fixed << setprecision(1)
+         << NITER * nops / ttot << endl; // ops per ns == GOPS
+    cout << endl;
+
+    // ttot sums the NITER timed runs in ns, so this is their total in ms.
+    cout << "m, n, k: " << m << " " << n << " " << k
+         << (abft_needed ? " time with abft: " : " time without abft: ")
+         << setprecision(2) << ttot / 1e6 << endl;
+    // The check against the reference result is only compiled with USE_MKL.
+#ifdef USE_MKL
+    compare_buffers(Cint8_local.data(), Cint8_fb.data(), m, n, n, 5);
+#endif
+  }
+}
+
+int main(int /* argc */, char** /* argv */) {
+#ifdef _OPENMP
+  // Force a single thread when OMP_NUM_THREADS is unset or empty.
+  const char* omp_env = getenv("OMP_NUM_THREADS");
+  const bool user_set_threads = omp_env != nullptr && omp_env[0] != '\0';
+  if (!user_set_threads) {
+    omp_set_num_threads(1);
+  }
+#endif
+  performance_test(/*abft_needed=*/false); // true => "with abft" label
+  return 0;
+}