per-group and per-channel quantization (#14340)
Summary:
Pull Request resolved: pytorch/pytorch#14340

Pull Request resolved: pytorch#25

Per-group and per-channel quantization in fbgemm.
This diff also cleans up explicit template instantiations using macro expansion.
It also changes the randFill interface, which previously made it easy to generate integer random numbers for floating-point vectors by mistake.

Using this in DNNLOWP operators will be done in a separate diff.
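For readers skimming the diff: per-channel quantization gives each output channel of the weight tensor its own scale and zero point instead of one pair for the whole tensor; per-group quantization is the same idea with channels partitioned into groups that share parameters. A minimal sketch of the concept, with hypothetical names (this is not fbgemm's actual API):

    // Illustrative per-channel quantization: each channel derives its own
    // scale/zero point from its own min/max. Not fbgemm's real interface.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    struct QuantParams {
      float scale;
      std::int32_t zero_point;
    };

    // W is row-major: num_channels rows of channel_len weights each.
    std::vector<QuantParams> quantizePerChannel(
        const std::vector<float>& W,
        int num_channels,
        int channel_len,
        std::vector<std::int8_t>& Wq) {
      std::vector<QuantParams> qparams(num_channels);
      Wq.resize(W.size());
      for (int c = 0; c < num_channels; ++c) {
        const float* src = W.data() + c * channel_len;
        auto mm = std::minmax_element(src, src + channel_len);
        float scale = std::max((*mm.second - *mm.first) / 255.0f, 1e-8f);
        // Map this channel's min to -128 so the full int8 range is used.
        auto zp = static_cast<std::int32_t>(
            std::round(-128.0f - *mm.first / scale));
        qparams[c] = {scale, zp};
        for (int i = 0; i < channel_len; ++i) {
          std::int32_t q =
              zp + static_cast<std::int32_t>(std::round(src[i] / scale));
          Wq[c * channel_len + i] =
              static_cast<std::int8_t>(std::min(127, std::max(-128, q)));
        }
      }
      return qparams;
    }

Finer-grained parameters cost a little bookkeeping but substantially reduce quantization error when per-channel ranges differ widely, which is common for convolution weights.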

Reviewed By: dskhudia

Differential Revision: D13176386

fbshipit-source-id: e46c53e31e21520bded71b8ed86e8b19e010e2dd
jspark1105 authored and facebook-github-bot committed Nov 27, 2018
1 parent db52c82 commit d4ee77f
Showing 24 changed files with 1,466 additions and 1,049 deletions.
51 changes: 39 additions & 12 deletions bench/BenchUtils.cc
@@ -5,30 +5,41 @@
* LICENSE file in the root directory of this source tree.
*/
#include "BenchUtils.h"

+#include <algorithm>
#include <random>
+#include <type_traits>

+#include <omp.h>

namespace fbgemm {

std::default_random_engine eng;

template <typename T>
-void randFill(aligned_vector<T>& vec, const int low, const int high) {
-  std::random_device r;
-  std::uniform_int_distribution<int> dis(low, high);
-  for (auto& v : vec) {
-    v = static_cast<T>(dis(eng));
-  }
+void randFill(aligned_vector<T>& vec, T low, T high, std::true_type) {
+  std::uniform_int_distribution<T> dis(low, high);
+  std::generate(vec.begin(), vec.end(), [&] { return dis(eng); });
}

+template <typename T>
+void randFill(aligned_vector<T>& vec, T low, T high, std::false_type) {
+  std::uniform_real_distribution<T> dis(low, high);
+  std::generate(vec.begin(), vec.end(), [&] { return dis(eng); });
+}
+
+template <typename T>
+void randFill(aligned_vector<T>& vec, T low, T high) {
+  randFill(vec, low, high, std::is_integral<T>());
+}

template void
-randFill<float>(aligned_vector<float>& vec, const int low, const int high);
+randFill<float>(aligned_vector<float>& vec, float low, float high);
template void
-randFill<uint8_t>(aligned_vector<uint8_t>& vec, const int low, const int high);
+randFill<uint8_t>(aligned_vector<uint8_t>& vec, uint8_t low, uint8_t high);
template void
-randFill<int8_t>(aligned_vector<int8_t>& vec, const int low, const int high);
+randFill<int8_t>(aligned_vector<int8_t>& vec, int8_t low, int8_t high);
-template void
-randFill<int>(aligned_vector<int>& vec, const int low, const int high);
+template void randFill<int>(aligned_vector<int>& vec, int low, int high);

void llc_flush(std::vector<char>& llc) {
volatile char* data = llc.data();
@@ -37,4 +48,20 @@ void llc_flush(std::vector<char>& llc) {
}
}

+int fbgemm_get_num_threads() {
+#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
+  return 1;
+#else
+  return omp_get_num_threads();
+#endif
+}
+
+int fbgemm_get_thread_num() {
+#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
+  return 0;
+#else
+  return omp_get_thread_num();
+#endif
+}

} // namespace fbgemm
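The tag dispatch above means the vector's element type now chooses the distribution, so floating-point vectors get real-valued fills instead of silently truncated integers. A usage sketch:

    aligned_vector<float> fvec(1024);
    randFill(fvec, 0.0f, 1.0f); // uses uniform_real_distribution<float>

    aligned_vector<int> ivec(1024);
    randFill(ivec, -10, 10); // uses uniform_int_distribution<int>

One caveat: the standard does not require std::uniform_int_distribution to accept 8-bit types such as uint8_t or int8_t, and some standard libraries reject those instantiations outright, so the uint8_t/int8_t instantiations here rely on implementation leniency.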
5 changes: 4 additions & 1 deletion bench/BenchUtils.h
@@ -11,8 +11,11 @@
namespace fbgemm {

template <typename T>
-void randFill(aligned_vector<T>& vec, const int low, const int high);
+void randFill(aligned_vector<T>& vec, T low, T high);

void llc_flush(std::vector<char>& llc);

+int fbgemm_get_num_threads();
+int fbgemm_get_thread_num();

} // namespace fbgemm
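These two helpers centralize the #ifdef _OPENMP / FBGEMM_MEASURE_TIME_BREAKDOWN boilerplate that each benchmark previously repeated. Inside a parallel region they are used the way the benchmarks below do:

    #pragma omp parallel
    {
      int num_threads = fbgemm_get_num_threads(); // 1 if OpenMP is disabled
      int tid = fbgemm_get_thread_num(); // 0 if OpenMP is disabled
      // ... each thread then derives its own slice of the work ...
    }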
22 changes: 6 additions & 16 deletions bench/Depthwise3DBenchmark.cc
@@ -62,10 +62,10 @@ int main() {
aligned_vector<int32_t> C_ref(N * T_OUT * H_OUT * W_OUT * K),
C(C_ref.size());

-randFill(A, 0, 86);
+randFill<uint8_t>(A, 0, 86);
int32_t A_zero_point = 43;

-randFill(B, -16, 16);
+randFill<int8_t>(B, -16, 16);
int32_t B_zero_point = 5;

depthwise_3x3x3_pad_1_ref(
@@ -129,13 +129,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#if _OPENMP
-int num_threads = omp_get_num_threads();
-int tid = omp_get_thread_num();
-#else
-int num_threads = 1;
-int tid = 0;
-#endif
+int num_threads = fbgemm_get_num_threads();
+int tid = fbgemm_get_thread_num();
depthwise_3x3x3_pad_1(
N,
T,
@@ -200,13 +195,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#if _OPENMP
-int num_threads = omp_get_num_threads();
-int tid = omp_get_thread_num();
-#else
-int num_threads = 1;
-int tid = 0;
-#endif
+int num_threads = fbgemm_get_num_threads();
+int tid = fbgemm_get_thread_num();
depthwise_3x3x3_pad_1(
N,
T,
22 changes: 6 additions & 16 deletions bench/DepthwiseBenchmark.cc
@@ -161,10 +161,10 @@ int main() {
aligned_vector<int8_t> B(G * R * S);
aligned_vector<int32_t> C_ref(N * H_OUT * W_OUT * G), C(C_ref.size());

-randFill(A, 0, 86);
+randFill<uint8_t>(A, 0, 86);
int32_t A_zero_point = 43;

-randFill(B, -16, 16);
+randFill<int8_t>(B, -16, 16);
int32_t B_zero_point = 5;

depthwise_3x3_pad_1_ref(
@@ -221,13 +221,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#ifdef _OPENMP
-int num_threads = omp_get_num_threads();
-int tid = omp_get_thread_num();
-#else
-int num_threads = 1;
-int tid = 0;
-#endif
+int num_threads = fbgemm_get_num_threads();
+int tid = fbgemm_get_thread_num();
depthwise_3x3_pad_1(
N,
H,
@@ -279,13 +274,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#ifdef _OPENMP
-int num_threads = omp_get_num_threads();
-int tid = omp_get_thread_num();
-#else
-int num_threads = 1;
-int tid = 0;
-#endif
+int num_threads = fbgemm_get_num_threads();
+int tid = fbgemm_get_thread_num();
depthwise_3x3_pad_1(
N,
H,
34 changes: 19 additions & 15 deletions bench/FP16Benchmark.cc
@@ -73,20 +73,24 @@ void performance_test() {
int n = s[1];
int k = s[2];

-aligned_vector<float> A(m * k, 0.f);
-aligned_vector<float> B(k * n, 0.f);
-aligned_vector<float> Cg(m * n, 1.f);
-aligned_vector<float> Cp(m * n, NAN);
+aligned_vector<float> C_ref(m * n, 1.f);
+aligned_vector<float> C_fb(m * n, NAN);

// initialize with small numbers
-randFill(A, 0, 4);
+aligned_vector<int> Aint(m * k);
+randFill(Aint, 0, 4);
+aligned_vector<float> A(Aint.begin(), Aint.end());

-randFill(B, 0, 4);
+aligned_vector<int> Bint(k * n);
+randFill(Bint, 0, 4);
+aligned_vector<float> B(Bint.begin(), Bint.end());
PackedGemmMatrixFP16 Bp(btran, k, n, alpha, B.data());

if (beta != 0.0f) {
-randFill(Cg, 0, 4);
-Cp = Cg;
+aligned_vector<int> Cint(C_ref.size());
+randFill(Cint, 0, 4);
+C_ref.assign(Cint.begin(), Cint.end());
+C_fb = C_ref;
}

double nflops = 2.0 * (double)m * (double)n * (double)k * (double)NITER;
@@ -111,17 +115,17 @@
B.data(),
(btran == matrix_op_t::NoTranspose) ? n : k,
beta,
-Cg.data(),
+C_ref.data(),
n);
#endif
cblas_gemm_compute(
-matrix_op_t::NoTranspose, m, A.data(), Bp, beta, Cp.data());
+matrix_op_t::NoTranspose, m, A.data(), Bp, beta, C_fb.data());

#ifdef USE_MKL
// Compare results
-for (auto i = 0; i < Cg.size(); i++) {
-// printf("%f %f\n", Cg[i], Cp[i]);
-assert(std::abs(Cg[i] - Cp[i]) < 1e-3);
+for (auto i = 0; i < C_ref.size(); i++) {
+// printf("%f %f\n", C_ref[i], C_fb[i]);
+assert(std::abs(C_ref[i] - C_fb[i]) < 1e-3);
}
#endif
}
@@ -151,7 +155,7 @@ void performance_test() {
B.data(),
(btran == matrix_op_t::NoTranspose) ? n : k,
beta,
-Cg.data(),
+C_ref.data(),
n);
t_end = chrono::system_clock::now();
if (it >= 0) {
Expand Down Expand Up @@ -184,7 +188,7 @@ void performance_test() {

t_begin = chrono::system_clock::now();
cblas_gemm_compute(
-matrix_op_t::NoTranspose, m, A.data(), Bp, beta, Cp.data());
+matrix_op_t::NoTranspose, m, A.data(), Bp, beta, C_fb.data());
t_end = chrono::system_clock::now();

if (it >= 0) {
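A note on the initialization change above: inputs are now drawn as small integers and then copied into float storage. Small integers are exactly representable in both fp32 and fp16, so the MKL reference result (C_ref) and the fbgemm result (C_fb) stay within the 1e-3 assert tolerance, and initializing C_fb to NAN makes the assert catch any output element the kernel fails to write. The pattern, restated:

    aligned_vector<int> Aint(m * k);
    randFill(Aint, 0, 4); // small integer values...
    aligned_vector<float> A(Aint.begin(), Aint.end()); // ...exact in fp16 too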
11 changes: 3 additions & 8 deletions bench/I8SpmdmBenchmark.cc
@@ -77,7 +77,7 @@ int main() {
cout << M << ", " << N << ", " << K << ", ";

aligned_vector<uint8_t> A(M * K);
-randFill(A, 0, 255);
+randFill<uint8_t>(A, 0, 255);

fbgemm::CompressedSparseColumn B_csc(K, N);
vector<int32_t> C(M * N);
@@ -156,13 +156,8 @@ int main() {
#pragma omp parallel
#endif
{
-#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
-int num_threads = 1;
-int tid = 0;
-#else
-int num_threads = omp_get_num_threads();
-int tid = omp_get_thread_num();
-#endif
+int num_threads = fbgemm_get_num_threads();
+int tid = fbgemm_get_thread_num();
int i_per_thread =
((M + 31) / 32 + num_threads - 1) / num_threads * 32;
int i_begin = std::min(tid * i_per_thread, M);
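The partitioning above hands each thread a contiguous chunk of rows rounded up to a multiple of 32, clamped to M; a matching i_end presumably follows in the elided lines. A worked example with M = 100 and num_threads = 3:

    // ceil(100 / 32) = 4 row blocks; ceil(4 / 3) = 2 blocks/thread; 2 * 32 = 64.
    int i_per_thread = ((M + 31) / 32 + num_threads - 1) / num_threads * 32;
    int i_begin = std::min(tid * i_per_thread, M); // 0, 64, 100
    int i_end = std::min(i_begin + i_per_thread, M); // 64, 100, 100
    // Thread 0 gets rows [0, 64), thread 1 gets [64, 100), thread 2 is idle.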
37 changes: 11 additions & 26 deletions bench/Im2ColFusedRequantizeAcc16Benchmark.cc
@@ -125,43 +125,29 @@ void performance_test() {

chrono::time_point<chrono::high_resolution_clock> begin, end;
for (auto conv_p : shapes) {
-aligned_vector<float> Afp32(
-    conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0.0f);
aligned_vector<uint8_t> Aint8(
-    conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0);
+    conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC);
aligned_vector<uint8_t> Aint8_out(
    conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.K[0] *
-        conv_p.K[1] * conv_p.IC,
-    0);
+        conv_p.K[1] * conv_p.IC);

-aligned_vector<float> Bfp32(
-    conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0.0f);
aligned_vector<int8_t> Bint8(
-    conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0);
+    conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC);

aligned_vector<int32_t> Cint32_ref(
-    conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-aligned_vector<int32_t> Cint32_fb(
-    conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-aligned_vector<int32_t> Cint32_fb2(
-    conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
+    conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC);
+aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+aligned_vector<int32_t> Cint32_fb2(Cint32_ref.size());

// A matrix (input activations)
-randFill(Afp32, 0, 5);
+randFill<uint8_t>(Aint8, 0, 5);
int32_t Aint8_zero_point = 4;
-for (auto i = 0; i < Afp32.size(); ++i) {
-  Aint8[i] = static_cast<uint8_t>(Afp32[i]);
-}
+aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());

// B matrix (weights)
-randFill(Bfp32, -4, 4);
+randFill<int8_t>(Bint8, -4, 4);
// int32_t Bint8_zero_point = -3;
-for (auto i = 0; i < Bfp32.size(); ++i) {
-  Bint8[i] = static_cast<int8_t>(Bfp32[i]);
-}
+aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());

// reference implementation
conv_ref(
@@ -184,8 +170,7 @@ double ttot = 0.0;
double ttot = 0.0;
string runType;

-vector<int32_t> row_offset_buf;
-row_offset_buf.resize(
+vector<int32_t> row_offset_buf(
    PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize());

PackAWithIm2Col<uint8_t, int16_t> packA(
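About row_offset_buf above: with a nonzero A zero point a0 and B zero point b0, the quantized GEMM expands as sum_k (A[i][k] - a0) * (B[k][j] - b0) = sum_k A[i][k] * B[k][j] - b0 * rowSumA[i] - a0 * colSumB[j] + K * a0 * b0, so the raw int32 accumulation must be corrected using per-row sums of A; PackAWithIm2Col precomputes those sums into this buffer. A sketch of the correction term (illustrative, not fbgemm's internal code):

    // acc is the raw int32 dot product of quantized row i of A and col j of B.
    std::int32_t correct(std::int32_t acc, std::int32_t row_sum_a,
                         std::int32_t col_sum_b, std::int32_t a_zp,
                         std::int32_t b_zp, int K) {
      return acc - b_zp * row_sum_a - a_zp * col_sum_b + K * a_zp * b_zp;
    }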
(Diffs for the remaining 17 changed files are not shown.)
