Skip to content

Commit

Permalink
Make spmmfp32 and spmmi8 benchmark consistent with tests (pytorch#315)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: pytorch#315

Making SPMM fp32 and int8 benchmarks consistent with tests to minimize confusion between M, N and K. Also this way the benchmark is consistent with FC op implementation in Caffe2.

Reviewed By: jiecaoyu

Differential Revision: D20400801

fbshipit-source-id: 1408985cb1ba24af1bfee9dbc0b6418758bb06f6
  • Loading branch information
dskhudia authored and facebook-github-bot committed Mar 13, 2020
1 parent cd6c889 commit a5dffd2
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 122 deletions.
32 changes: 32 additions & 0 deletions bench/BenchUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,4 +126,36 @@ double measureWithWarmup(
return ttot / 1e9 / measuredIterations;
}

/*
 * @brief Out-of-place transposition of an M x N source matrix into dst.
 * @param M number of rows in the input matrix
 * @param N number of columns in the input matrix
 * @param src input matrix (M x N), read with leading dimension ld_src
 * @param ld_src stride (in elements) between consecutive rows of src
 * @param dst output matrix (N x M), written with leading dimension ld_dst
 * @param ld_dst stride (in elements) between consecutive rows of dst
 */
template <typename T>
void transpose_matrix(
    int M,
    int N,
    const T* src,
    int ld_src,
    T* dst,
    int ld_dst) {
  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < M; ++j) {
      // dst row i, column j receives src row j, column i.
      dst[i * ld_dst + j] = src[i + j * ld_src];
    }
  } // for each output row
}

/*
 * @brief In-place transposition of an n x k matrix stored at ref.
 * @param n number of rows in input (number of columns in output)
 * @param k number of columns in input (number of rows in output)
 */
template <typename T>
void transpose_matrix(T* ref, int n, int k) {
  // Build the transposed image in a scratch buffer, then copy it back
  // over the original storage.
  std::vector<T> scratch(n * k);
  for (int row = 0; row < n; ++row) {
    for (int col = 0; col < k; ++col) {
      scratch[col * n + row] = ref[row * k + col];
    }
  }
  memcpy(ref, scratch.data(), n * k * sizeof(T));
}

} // namespace fbgemm
55 changes: 35 additions & 20 deletions bench/SpMMFP32Benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,55 +13,70 @@ using namespace std;
using namespace fbgemm;

int main(int, char**) {
vector<char> llc(128 * 1024 * 1024);

vector<vector<int>> shapes = {{1024, 128, 1024}};
vector<vector<int>> shapes = {{128, 1024, 1024}};

// C is MxN -> CT is NxM
// A is MxK -> BT is KxM
// B is KxN -> AT is NxK
// A is MxK -> AT is KxM
// B is KxN -> BT is NxK

// for (int s = 64; s <= 128; s *= 2)
for (auto const& s : shapes) {
int m = s[0];
int n = s[1];
int k = s[2];

int lda = k;
int ldb = n;
int ldc = n;

for (float fnz = 0.99; fnz >= 0.009999; fnz -= 0.01) {
auto aData = getRandomSparseVector(m * k, fnz);
auto bData = getRandomSparseVector(k * n);
auto aData = getRandomSparseVector(m * k);
auto bData = getRandomSparseVector(k * n, fnz);
auto cData = getRandomSparseVector(m * n);

auto fn = generateSpMM<float>(m, n, k, aData.data(), lda, ldb, ldc);
auto fn_varying_n = generateSpMM<float>(m, k, aData.data(), lda);
aligned_vector<float> atData(k * m);
aligned_vector<float> btData(n * k);
aligned_vector<float> ctData(n * m);

transpose_matrix(m, k, aData.data(), k, atData.data(), m);
transpose_matrix(k, n, bData.data(), n, btData.data(), k);

// We calculate C^T = B^T x A^T
// B matrix is sparse and passed in as first matrix to generateSpMM
int ldat = m;
int ldbt = k;
int ldct = m;
auto fn = generateSpMM<float>(n, m, k, btData.data(), ldbt, ldat, ldct);
auto fn_varying_n = generateSpMM<float>(n, k, btData.data(), ldbt);

double effective_flop = m * n * k * 2;

constexpr int NWARMUP = 5;
constexpr int NITER = 32;
auto secs = measureWithWarmup(
[&]() { fn(bData.data(), cData.data(), 0); },
[&]() { fn(atData.data(), ctData.data(), 0); },
NWARMUP,
NITER,
[&]() { llc_flush(llc); });
[&]() {
cache_evict(atData);
cache_evict(btData);
cache_evict(ctData);
});

auto secs_varying_n = measureWithWarmup(
[&]() {
fn_varying_n(
bData.data(),
cData.data(),
n,
n, /* ldb */
n, /* ldc */
atData.data(),
ctData.data(),
m,
ldat, /* ldat */
ldct, /* ldct */
0 /* accum_flag */);
},
NWARMUP,
NITER,
[&]() { llc_flush(llc); });
[&]() {
cache_evict(atData);
cache_evict(btData);
cache_evict(ctData);
});

double effective_gflops = effective_flop / secs / 1e9;
double effective_gflops_varying_n = effective_flop / secs_varying_n / 1e9;
Expand Down
75 changes: 51 additions & 24 deletions bench/SpMMI8Benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,62 +14,89 @@ using namespace std;
using namespace fbgemm;

int main(int, char**) {
vector<char> llc(128 * 1024 * 1024);

// vector<vector<unsigned>> shapes = {{64, 64, 64}};
// vector<vector<unsigned>> shapes = {{1, 16, 4}};

vector<vector<unsigned>> shapes = {{1024, 128, 1024}};
vector<vector<unsigned>> shapes = {{128, 1024, 1024}};

// C is MxN -> CT is NxM
// A is MxK -> BT is KxM
// B is KxN -> AT is NxK
// A is MxK -> AT is KxM
// B is KxN -> BT is NxK

// for (unsigned s = 64; s <= 128; s *= 2)
for (auto const& s : shapes) {
int m = s[0];
int n = s[1];
int k = s[2];

int lda = k;
int ldb = n;
int ldc = n;
if ((k % 4) != 0) {
cout << "Skipping shape " << m << ", " << n << ", " << k;
cout << " as K is not a multiple of 4" << endl;
continue;
}

for (float fnz = 0.99; fnz >= 0.009999; fnz -= 0.01) {
auto aData = getRandomSparseVector(m * k / 4, fnz);
auto bData = getRandomSparseVector(k * n / 4);
auto aData = getRandomSparseVector(m * k / 4);
auto bData = getRandomSparseVector(k * n / 4, fnz);
auto cData = getRandomSparseVector(m * n);

auto aptr = reinterpret_cast<const int8_t*>(aData.data());
auto bptr = reinterpret_cast<uint8_t*>(bData.data());
auto aptr = reinterpret_cast<uint8_t*>(aData.data());
auto bptr = reinterpret_cast<const int8_t*>(bData.data());
auto cptr = reinterpret_cast<int32_t*>(cData.data());

for (int i = 0; i < k * n; ++i) {
bptr[i] &= 0x7F;
for (int i = 0; i < k * m; ++i) {
aptr[i] &= 0x7F;
}

auto cptr = reinterpret_cast<int32_t*>(cData.data());
// We calculate C^T = B^T x A^T
// B matrix is sparse and passed in as first matrix to generateSpMM
int ldat = m;
int ldbt = k;
int ldct = m;

aligned_vector<float> atData(k / 4 * m);
aligned_vector<float> btData(n * k);
aligned_vector<float> ctData(n * m);

auto fn = generateSpMM<int32_t>(m, n, k, aptr, lda, ldb, ldc);
auto fn_varying_n = generateSpMM<int32_t>(m, k, aptr, lda);
auto atptr = reinterpret_cast<const uint8_t*>(atData.data());
auto btptr = reinterpret_cast<int8_t*>(btData.data());
auto ctptr = reinterpret_cast<int32_t*>(ctData.data());

// Transpose as if A is float so 4 columns are interleaved
transpose_matrix(m, k / 4, aData.data(), k / 4, atData.data(), ldat);
transpose_matrix(k, n, bptr, n, btptr, ldbt);

auto fn = generateSpMM<int32_t>(n, m, k, btptr, ldbt, ldat, ldct);
auto fn_varying_n = generateSpMM<int32_t>(n, k, btptr, ldbt);

double FLOPs = m * n * k * 2;

constexpr int NWARMUP = 5;
constexpr int NITER = 32;
auto secs = measureWithWarmup(
[&]() { fn(bptr, cptr, 0); },
[&]() { fn(atptr, ctptr, 0); },
NWARMUP,
NITER,
[&]() { llc_flush(llc); });
[&]() {
cache_evict(atData);
cache_evict(btData);
cache_evict(ctData);
});

auto secs_varying_n = measureWithWarmup(
[&]() {
fn_varying_n(
bptr, cptr, n, n /* ldb */, n /* ldc */, 0 /* accum_flag */);
atptr,
ctptr,
m,
ldat /* ldb */,
ldct /* ldc */,
0 /* accum_flag */);
},
NWARMUP,
NITER,
[&]() { llc_flush(llc); });
[&]() {
cache_evict(atData);
cache_evict(btData);
cache_evict(ctData);
});

cout << fnz << "," << (FLOPs / secs / 1e9) << ","
<< (fnz * FLOPs / secs / 1e9) << ","
Expand Down
56 changes: 0 additions & 56 deletions test/TestUtils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -85,60 +85,4 @@ check_all_zero_entries<int32_t>(const int32_t* test, int m, int n);
template bool
check_all_zero_entries<uint8_t>(const uint8_t* test, int m, int n);

// Out-of-place transpose: copies the M x N matrix src (leading dimension
// ld_src) into dst as its N x M transpose (leading dimension ld_dst).
template <typename T>
void transpose_matrix(
    int M,
    int N,
    const T* src,
    int ld_src,
    T* dst,
    int ld_dst) {
  // Walk the source row by row and scatter each element into its
  // transposed position in the destination.
  for (int row = 0; row < M; ++row) {
    for (int col = 0; col < N; ++col) {
      dst[col * ld_dst + row] = src[row * ld_src + col];
    }
  }
}

// In-place transpose of the n x k matrix at ref (result is k x n).
template <typename T>
void transpose_matrix(T* ref, int n, int k) {
  // Transpose into a temporary, then write it back so the result ends
  // up in the original buffer.
  std::vector<T> tmp(n * k);
  for (int j = 0; j < k; ++j) {
    for (int i = 0; i < n; ++i) {
      tmp[j * n + i] = ref[i * k + j];
    }
  }
  memcpy(ref, tmp.data(), n * k * sizeof(T));
}

// Explicit instantiations of the out-of-place transpose for the element
// types exercised by the tests (float, int32, and both 8-bit integer types).
template void transpose_matrix<float>(
    int M,
    int N,
    const float* src,
    int ld_src,
    float* dst,
    int ld_dst);
template void transpose_matrix<int32_t>(
    int M,
    int N,
    const int32_t* src,
    int ld_src,
    int32_t* dst,
    int ld_dst);
template void transpose_matrix<uint8_t>(
    int M,
    int N,
    const uint8_t* src,
    int ld_src,
    uint8_t* dst,
    int ld_dst);
template void transpose_matrix<int8_t>(
    int M,
    int N,
    const int8_t* src,
    int ld_src,
    int8_t* dst,
    int ld_dst);

// Explicit instantiations of the in-place transpose overload for the same types.
template void transpose_matrix<float>(float* ref, int n, int k);
template void transpose_matrix<int32_t>(int32_t* ref, int n, int k);
template void transpose_matrix<uint8_t>(uint8_t* ref, int n, int k);
template void transpose_matrix<int8_t>(int8_t* ref, int n, int k);

} // namespace fbgemm
22 changes: 0 additions & 22 deletions test/TestUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,26 +30,4 @@ int compare_validate_buffers(
template <typename T>
bool check_all_zero_entries(const T* test, int m, int n);

/*
* @brief In-place transposition for nxk matrix ref.
* @param n number of rows in input (number of columns in output)
* @param k number of columns in input (number of rows in output)
*/
template <typename T>
void transpose_matrix(T* ref, int n, int k);

/*
 * @brief Out-of-place transposition of an M x N matrix src into dst.
 * @param M number of rows in input
 * @param N number of columns in input
 */
template <typename T>
void transpose_matrix(
int M,
int N,
const T* src,
int ld_src,
T* dst,
int ld_dst);

} // namespace fbgemm

0 comments on commit a5dffd2

Please sign in to comment.