forked from pytorch/FBGEMM
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Exposing tuning parameters in FBGEMM (MCB, NCB, KCB, MR, NR, Row Inte…
…rleave) (pytorch#90) Summary: Pull Request resolved: pytorch#90 Exposing tuning parameters in FBGEMM (MCB, NCB, KCB, MR, NR, Row Interleave) Reviewed By: dskhudia Differential Revision: D14358148 fbshipit-source-id: 783fb4653fd696dbbd4075ad56cb8682db3011a5
- Loading branch information
1 parent
d8e0d44
commit f12ec12
Showing
19 changed files
with
949 additions
and
248 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,339 @@ | ||
/* | ||
* Copyright (c) Facebook, Inc. and its affiliates. | ||
* All rights reserved. | ||
* This source code is licensed under the BSD-style license found in the | ||
* LICENSE file in the root directory of this source tree. | ||
*/ | ||
#include <cpuinfo.h> | ||
#include <algorithm> | ||
#include <chrono> | ||
#include <cmath> | ||
#include <iomanip> | ||
#include <iostream> | ||
#include <vector> | ||
#include<set> | ||
|
||
#ifdef _OPENMP | ||
#include <omp.h> | ||
#endif | ||
|
||
#ifdef USE_MKL | ||
#include <mkl.h> | ||
#endif | ||
|
||
#include "bench/BenchUtils.h" | ||
#include "fbgemm/Fbgemm.h" | ||
#include "src/RefImplementations.h" | ||
#include "test/QuantizationHelpers.h" | ||
|
||
using namespace std; | ||
using namespace fbgemm; | ||
|
||
void performance_test( | ||
const BlockingFactors* tuning_params, | ||
set<vector<int>>& incorrect_configs, | ||
const vector<int>& shape, | ||
array<int, 6>& best_config, | ||
float& giga_ops) { | ||
|
||
bool flush = true; | ||
std::vector<char> llc; | ||
|
||
if (flush) { | ||
llc.resize(128 * 1024 * 1024, 1.0); | ||
} | ||
|
||
constexpr int NWARMUP = 4; | ||
constexpr int NITER = 10; | ||
|
||
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN | ||
cout << "WARNING: the timer may be inaccurate when used by multiple threads." | ||
<< endl; | ||
cout << setw(8) << "M, " << setw(8) << "N, " << setw(8) << "K, " << setw(18) | ||
<< "Type, " << setw(18) << "Packing (us), " << setw(18) | ||
<< "Kernel (us), " << setw(18) << "Postproc (us), " << setw(18) | ||
<< "Total (us), " << setw(5) << "GOPs" << endl; | ||
#else | ||
#endif | ||
|
||
chrono::time_point<chrono::high_resolution_clock> start, end; | ||
|
||
int m = shape[0]; | ||
int n = shape[1]; | ||
int k = shape[2]; | ||
|
||
aligned_vector<uint8_t> Aint8(m * k); | ||
aligned_vector<int8_t> Bint8(k * n); | ||
aligned_vector<float> Cfp32_mkl(m * n); | ||
aligned_vector<int32_t> Cint32_mkl(Cfp32_mkl.size()); | ||
aligned_vector<int32_t> Cint32_ref(Cfp32_mkl.size()); | ||
aligned_vector<int32_t> Cint32_fb_acc32(Cfp32_mkl.size()); | ||
aligned_vector<int32_t> Cint32_fb_acc16(Cfp32_mkl.size()); | ||
|
||
// A matrix | ||
randFill<uint8_t>(Aint8, 0, 5); | ||
aligned_vector<float> Afp32(Aint8.begin(), Aint8.end()); | ||
|
||
randFill<int8_t>(Bint8, -4, 4); | ||
avoidOverflow(m, n, k, Aint8.data(), Bint8.data()); | ||
|
||
aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end()); | ||
|
||
double nops = 2.0 * static_cast<double>(NITER) * m * n * k; | ||
double ttot = 0.0; | ||
string runType; | ||
|
||
vector<int32_t> row_offsets(m); | ||
|
||
matmul_u8i8acc32_ref( | ||
m, n, k, k, n, n, Aint8.data(), Bint8.data(), Cint32_ref.data()); | ||
|
||
PackBMatrix<int8_t> packedB_int32( | ||
matrix_op_t::NoTranspose, | ||
k, | ||
n, | ||
Bint8.data(), | ||
n, | ||
nullptr, | ||
1, | ||
tuning_params); | ||
|
||
ttot = 0.0; | ||
runType = "FBGEMM_i8_acc32"; | ||
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN | ||
double total_packing_time = 0.0; | ||
double total_computing_time = 0.0; | ||
double total_kernel_time = 0.0; | ||
double total_postprocessing_time = 0.0; | ||
double total_run_time = 0.0; | ||
#endif | ||
|
||
for (auto i = 0; i < NWARMUP + NITER; ++i) { | ||
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN | ||
packing_time = 0.0; | ||
computing_time = 0.0; | ||
kernel_time = 0.0; | ||
postprocessing_time = 0.0; | ||
run_time = 0.0; | ||
#endif | ||
llc_flush(llc); | ||
start = chrono::high_resolution_clock::now(); | ||
|
||
#ifdef _OPENMP | ||
#pragma omp parallel | ||
#endif | ||
{ | ||
PackAMatrix<uint8_t> packA_int32( | ||
matrix_op_t::NoTranspose, | ||
m, | ||
k, | ||
Aint8.data(), | ||
k, | ||
nullptr, | ||
1, | ||
tuning_params); | ||
|
||
DoNothing<int32_t, int32_t> doNothing32BitObj; | ||
memCopy<> memcopyObj(doNothing32BitObj); | ||
int num_threads = fbgemm_get_num_threads(); | ||
int tid = fbgemm_get_thread_num(); | ||
// printf ( "tid: %d, num_threads: %d\n", tid, num_threads ); | ||
fbgemmPacked( | ||
packA_int32, | ||
packedB_int32, | ||
Cint32_fb_acc32.data(), | ||
Cint32_fb_acc32.data(), | ||
n, | ||
memcopyObj, | ||
tid, | ||
num_threads, | ||
tuning_params); | ||
} | ||
|
||
end = chrono::high_resolution_clock::now(); | ||
|
||
if (i >= NWARMUP) { | ||
auto dur = chrono::duration_cast<chrono::nanoseconds>(end - start); | ||
ttot += dur.count(); | ||
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN | ||
total_packing_time += packing_time; | ||
total_computing_time += computing_time; | ||
total_kernel_time += kernel_time; | ||
total_postprocessing_time += postprocessing_time; | ||
total_run_time += run_time; | ||
#endif | ||
} | ||
} | ||
((volatile char*)(llc.data())); | ||
|
||
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN | ||
cout << ", " << setw(16) << total_packing_time / (double)NITER / 1e3 << ", " | ||
<< setw(16) << total_kernel_time / (double)NITER / 1e3 << ", " | ||
<< setw(16) << total_postprocessing_time / (double)NITER / 1e3 << ", " | ||
<< setw(16) << total_run_time / (double)NITER / 1e3; | ||
#endif | ||
|
||
if (compare_buffers( | ||
Cint32_ref.data(), Cint32_fb_acc32.data(), m, n, n, 5)) { | ||
vector<int> config = {tuning_params->MCB, | ||
tuning_params->NCB, | ||
tuning_params->KCB, | ||
tuning_params->MR, | ||
tuning_params->NR, | ||
tuning_params->ROW_INTERLEAVE}; | ||
incorrect_configs.insert(config); | ||
} else { | ||
cout << setw(5) << "MCB, " << setw(5) << "NCB, " << setw(5) << "KCB, " | ||
<< setw(5) << "MR, " << setw(5) << "NR, " << setw(5) << "ROW INT." | ||
<< endl; | ||
cout << setw(5) << tuning_params->MCB << setw(5) << tuning_params->NCB | ||
<< setw(5) << tuning_params->KCB << setw(5) << tuning_params->MR | ||
<< setw(5) << tuning_params->NR << setw(5) | ||
<< tuning_params->ROW_INTERLEAVE << endl; | ||
|
||
cout << setw(8) << "M, " << setw(8) << "N, " << setw(8) << "K, " | ||
<< setw(18) << "Type, " << setw(5) << "GOPS" << endl; | ||
cout << setw(6) << m << ", " << setw(6) << n << ", " << setw(6) << k | ||
<< ", " << setw(16) << runType; | ||
cout << ", " << setw(5) << fixed << setw(5) << setprecision(1) | ||
<< nops / ttot << endl; | ||
if ((nops/ttot) > giga_ops){ | ||
giga_ops = nops/ttot; | ||
best_config = {tuning_params->MCB, | ||
tuning_params->NCB, | ||
tuning_params->KCB, | ||
tuning_params->MR, | ||
tuning_params->NR, | ||
tuning_params->ROW_INTERLEAVE}; | ||
} | ||
} | ||
} | ||
|
||
int main(int /* unused */, char** /* unused */) { | ||
#ifdef _OPENMP | ||
// Use 1 thread unless OMP_NUM_THREADS is explicit set. | ||
const char* val = getenv("OMP_NUM_THREADS"); | ||
if (val == nullptr || !*val) { | ||
omp_set_num_threads(1); | ||
} | ||
#endif | ||
|
||
vector<vector<int>> shapes = { | ||
// NOTE: clang-format wants to use a different formatting but the current | ||
// formatting should be easier to read. | ||
// m, n, k | ||
//warning these take time to run! | ||
{156800, 4, 36}, | ||
{156800, 8, 36}, | ||
{156800, 16, 36}, | ||
{1, 128, 512}, | ||
{1, 1024, 256}, | ||
{1, 2048, 512}, | ||
{1, 4096, 1024}, | ||
{6, 256, 1024}, | ||
{6, 256, 2048}, | ||
{6, 512, 512}, | ||
{6, 1024, 256}, | ||
{6, 2048, 256}, | ||
{6, 2048, 512}, | ||
{6, 4096, 256}, | ||
{6, 4096, 1024}, | ||
{6, 4096, 2048}, | ||
|
||
{10, 2048, 256}, | ||
{10, 4096, 1024}, | ||
|
||
{20, 2048, 256}, | ||
{20, 4096, 1024}, | ||
|
||
{102, 1024, 512}, | ||
{102, 2323, 256}, | ||
{102, 512, 256}, | ||
|
||
{1, 800, 3200}, | ||
{1, 800, 8000}, | ||
|
||
{16, 256, 1500}, | ||
{16, 256, 1567}, | ||
{1, 128, 2876}, | ||
{16, 128, 1567}, | ||
{1, 128, 2722}, | ||
|
||
{16, 256, 512}, | ||
{64, 800, 320}, | ||
{64, 768, 512}, | ||
{16, 256, 512}, | ||
{128, 128, 128}, | ||
{256, 512, 256}, | ||
{1024, 1024, 1024}, | ||
}; | ||
|
||
vector<int> MCBs; | ||
vector<int> NCBs; | ||
vector<int> KCBs; | ||
vector<int> MRs; | ||
int NR = 16; | ||
int NR_MIN = 16; | ||
int ROW_INTERLEAVE = 4; // do 32-bit accumulation for now | ||
|
||
if (cpuinfo_initialize()) { | ||
if (fbgemmHasAvx512Support()) { | ||
NR = 16; | ||
MCBs.insert(MCBs.end(), {48, 96, 144, 192, 240}); | ||
NCBs.insert(NCBs.end(), {16, 32, 64, 128, 48, 98, 192, 384}); | ||
KCBs.insert( | ||
KCBs.end(), | ||
{256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 960, 1024}); | ||
MRs.insert(MRs.end(), {24, 12, 6, 3, 8, 4, 2, 1}); | ||
} else if (fbgemmHasAvx2Support()) { | ||
assert(0 && "Benchmark will be extended for this architecture"); | ||
} else { | ||
assert(0 && "architecture not supported"); | ||
return 0; | ||
} | ||
} | ||
|
||
set<vector<int>> incorrect_configs; | ||
float giga_ops = 0.0; | ||
array<int, 6> best_config = {0, 0, 0, 0, 0, 0}; | ||
BlockingFactors params; | ||
for (auto const& shape : shapes) { | ||
for (auto const& mcb : MCBs) { | ||
for (auto const& ncb : NCBs) { | ||
for (auto const& kcb : KCBs) { | ||
for (auto const& mr : MRs) { | ||
params.MCB = mcb; | ||
params.NCB = ncb; | ||
params.KCB = kcb; | ||
params.MR = mr; | ||
params.NR = NR; | ||
params.ROW_INTERLEAVE = ROW_INTERLEAVE; | ||
params.NR_MIN = NR_MIN; | ||
if (isValidBlockingFactor<int32_t>(¶ms)) { | ||
performance_test( | ||
¶ms, incorrect_configs, shape, best_config, giga_ops); | ||
} | ||
} | ||
} | ||
} | ||
} | ||
cout << endl << "This is the Best Config!" << endl; | ||
cout << setw(5) << "MCB, " << setw(5) << "NCB, " << setw(5) << "KCB, " | ||
<< setw(5) << "MR, " << setw(5) << "NR, " << setw(5) << "ROW INT." | ||
<< setw(5) << "GOPS" << endl; | ||
cout << setw(5) << best_config[0] << setw(5) << best_config[1] << setw(5) | ||
<< best_config[2] << setw(5) << best_config[3] << setw(5) | ||
<< best_config[4] << setw(5) << best_config[5] << giga_ops << endl; | ||
} // end shapes | ||
|
||
cout << endl << "Warning there are configs that didn't work!" << endl; | ||
for (auto const& entry : incorrect_configs) { | ||
cout << setw(5) << "MCB, " << setw(5) << "NCB, " << setw(5) << "KCB, " | ||
<< setw(5) << "MR, " << setw(5) << "NR, " << setw(5) << "ROW INT." | ||
<< endl; | ||
cout << setw(5) << entry[0] << setw(5) << entry[1] << setw(5) << entry[2] | ||
<< setw(5) << entry[3] << setw(5) << entry[4] << setw(5) << entry[5] | ||
<< endl; | ||
} | ||
return 0; | ||
} |
Oops, something went wrong.