diff --git a/bench/BenchUtils.h b/bench/BenchUtils.h index ca9dbd0ca9..64519c5506 100644 --- a/bench/BenchUtils.h +++ b/bench/BenchUtils.h @@ -126,4 +126,36 @@ double measureWithWarmup( return ttot / 1e9 / measuredIterations; } +/* + * @brief Out-of-place transposition for M*N matrix ref. + * @param M number of rows in input + * @param N number of columns in input + */ +template +void transpose_matrix( + int M, + int N, + const T* src, + int ld_src, + T* dst, + int ld_dst) { + for (int i = 0; i < N; ++i) { + for (int j = 0; j < M; ++j) { + dst[i * ld_dst + j] = src[i + j * ld_src]; + } + } // for each output row +} + +/* + * @brief In-place transposition for nxk matrix ref. + * @param n number of rows in input (number of columns in output) + * @param k number of columns in input (number of rows in output) + */ +template +void transpose_matrix(T* ref, int n, int k) { + std::vector local(n * k); + transpose_matrix(n, k, ref, k, local.data(), n); + memcpy(ref, local.data(), n * k * sizeof(T)); +} + } // namespace fbgemm diff --git a/bench/SpMMFP32Benchmark.cc b/bench/SpMMFP32Benchmark.cc index 5d7d3a4e3b..badc323034 100644 --- a/bench/SpMMFP32Benchmark.cc +++ b/bench/SpMMFP32Benchmark.cc @@ -13,13 +13,12 @@ using namespace std; using namespace fbgemm; int main(int, char**) { - vector llc(128 * 1024 * 1024); - vector> shapes = {{1024, 128, 1024}}; + vector> shapes = {{128, 1024, 1024}}; // C is MxN -> CT is NxM - // A is MxK -> BT is KxM - // B is KxN -> AT is NxK + // A is MxK -> AT is KxM + // B is KxN -> BT is NxK // for (int s = 64; s <= 128; s *= 2) for (auto const& s : shapes) { @@ -27,41 +26,57 @@ int main(int, char**) { int n = s[1]; int k = s[2]; - int lda = k; - int ldb = n; - int ldc = n; - for (float fnz = 0.99; fnz >= 0.009999; fnz -= 0.01) { - auto aData = getRandomSparseVector(m * k, fnz); - auto bData = getRandomSparseVector(k * n); + auto aData = getRandomSparseVector(m * k); + auto bData = getRandomSparseVector(k * n, fnz); auto cData = 
getRandomSparseVector(m * n); - auto fn = generateSpMM(m, n, k, aData.data(), lda, ldb, ldc); - auto fn_varying_n = generateSpMM(m, k, aData.data(), lda); + aligned_vector atData(k * m); + aligned_vector btData(n * k); + aligned_vector ctData(n * m); + + transpose_matrix(m, k, aData.data(), k, atData.data(), m); + transpose_matrix(k, n, bData.data(), n, btData.data(), k); + + // We calculate C^T = B^T x A^T + // B matrix is sparse and passed in as first matrix to generateSpMM + int ldat = m; + int ldbt = k; + int ldct = m; + auto fn = generateSpMM(n, m, k, btData.data(), ldbt, ldat, ldct); + auto fn_varying_n = generateSpMM(n, k, btData.data(), ldbt); double effective_flop = m * n * k * 2; constexpr int NWARMUP = 5; constexpr int NITER = 32; auto secs = measureWithWarmup( - [&]() { fn(bData.data(), cData.data(), 0); }, + [&]() { fn(atData.data(), ctData.data(), 0); }, NWARMUP, NITER, - [&]() { llc_flush(llc); }); + [&]() { + cache_evict(atData); + cache_evict(btData); + cache_evict(ctData); + }); auto secs_varying_n = measureWithWarmup( [&]() { fn_varying_n( - bData.data(), - cData.data(), - n, - n, /* ldb */ - n, /* ldc */ + atData.data(), + ctData.data(), + m, + ldat, /* ldat */ + ldct, /* ldct */ 0 /* accum_flag */); }, NWARMUP, NITER, - [&]() { llc_flush(llc); }); + [&]() { + cache_evict(atData); + cache_evict(btData); + cache_evict(ctData); + }); double effective_gflops = effective_flop / secs / 1e9; double effective_gflops_varying_n = effective_flop / secs_varying_n / 1e9; diff --git a/bench/SpMMI8Benchmark.cc b/bench/SpMMI8Benchmark.cc index 19daf2de6b..754f2f1be1 100644 --- a/bench/SpMMI8Benchmark.cc +++ b/bench/SpMMI8Benchmark.cc @@ -14,62 +14,89 @@ using namespace std; using namespace fbgemm; int main(int, char**) { - vector llc(128 * 1024 * 1024); - // vector> shapes = {{64, 64, 64}}; - // vector> shapes = {{1, 16, 4}}; - - vector> shapes = {{1024, 128, 1024}}; + vector> shapes = {{128, 1024, 1024}}; // C is MxN -> CT is NxM - // A is MxK -> BT is KxM - 
// B is KxN -> AT is NxK + // A is MxK -> AT is KxM + // B is KxN -> BT is NxK - // for (unsigned s = 64; s <= 128; s *= 2) for (auto const& s : shapes) { int m = s[0]; int n = s[1]; int k = s[2]; - int lda = k; - int ldb = n; - int ldc = n; + if ((k % 4) != 0) { + cout << "Skipping shape " << m << ", " << n << ", " << k; + cout << " as K is not a multiple of 4" << endl; + continue; + } for (float fnz = 0.99; fnz >= 0.009999; fnz -= 0.01) { - auto aData = getRandomSparseVector(m * k / 4, fnz); - auto bData = getRandomSparseVector(k * n / 4); + auto aData = getRandomSparseVector(m * k / 4); + auto bData = getRandomSparseVector(k * n / 4, fnz); auto cData = getRandomSparseVector(m * n); - auto aptr = reinterpret_cast(aData.data()); - auto bptr = reinterpret_cast(bData.data()); + auto aptr = reinterpret_cast(aData.data()); + auto bptr = reinterpret_cast(bData.data()); + auto cptr = reinterpret_cast(cData.data()); - for (int i = 0; i < k * n; ++i) { - bptr[i] &= 0x7F; + for (int i = 0; i < k * m; ++i) { + aptr[i] &= 0x7F; } - auto cptr = reinterpret_cast(cData.data()); + // We calculate C^T = B^T x A^T + // B matrix is sparse and passed in as first matrix to generateSpMM + int ldat = m; + int ldbt = k; + int ldct = m; + + aligned_vector atData(k / 4 * m); + aligned_vector btData(n * k); + aligned_vector ctData(n * m); - auto fn = generateSpMM(m, n, k, aptr, lda, ldb, ldc); - auto fn_varying_n = generateSpMM(m, k, aptr, lda); + auto atptr = reinterpret_cast(atData.data()); + auto btptr = reinterpret_cast(btData.data()); + auto ctptr = reinterpret_cast(ctData.data()); + + // Transpose as if A is float so 4 columns are interleaved + transpose_matrix(m, k / 4, aData.data(), k / 4, atData.data(), ldat); + transpose_matrix(k, n, bptr, n, btptr, ldbt); + + auto fn = generateSpMM(n, m, k, btptr, ldbt, ldat, ldct); + auto fn_varying_n = generateSpMM(n, k, btptr, ldbt); double FLOPs = m * n * k * 2; constexpr int NWARMUP = 5; constexpr int NITER = 32; auto secs = 
measureWithWarmup( - [&]() { fn(bptr, cptr, 0); }, + [&]() { fn(atptr, ctptr, 0); }, NWARMUP, NITER, - [&]() { llc_flush(llc); }); + [&]() { + cache_evict(atData); + cache_evict(btData); + cache_evict(ctData); + }); auto secs_varying_n = measureWithWarmup( [&]() { fn_varying_n( - bptr, cptr, n, n /* ldb */, n /* ldc */, 0 /* accum_flag */); + atptr, + ctptr, + m, + ldat /* ldb (of A^T) */, + ldct /* ldc (of C^T) */, + 0 /* accum_flag */); }, NWARMUP, NITER, - [&]() { llc_flush(llc); }); + [&]() { + cache_evict(atData); + cache_evict(btData); + cache_evict(ctData); + }); cout << fnz << "," << (FLOPs / secs / 1e9) << "," << (fnz * FLOPs / secs / 1e9) << "," diff --git a/test/TestUtils.cc b/test/TestUtils.cc index d584433810..f516bd776d 100644 --- a/test/TestUtils.cc +++ b/test/TestUtils.cc @@ -85,60 +85,4 @@ check_all_zero_entries(const int32_t* test, int m, int n); template bool check_all_zero_entries(const uint8_t* test, int m, int n); -template -void transpose_matrix( - int M, - int N, - const T* src, - int ld_src, - T* dst, - int ld_dst) { - for (int i = 0; i < N; ++i) { - for (int j = 0; j < M; ++j) { - dst[i * ld_dst + j] = src[i + j * ld_src]; - } - } // for each output row -} - -template -void transpose_matrix(T* ref, int n, int k) { - std::vector local(n * k); - transpose_matrix(n, k, ref, k, local.data(), n); - memcpy(ref, local.data(), n * k * sizeof(T)); -} - -template void transpose_matrix( - int M, - int N, - const float* src, - int ld_src, - float* dst, - int ld_dst); -template void transpose_matrix( - int M, - int N, - const int32_t* src, - int ld_src, - int32_t* dst, - int ld_dst); -template void transpose_matrix( - int M, - int N, - const uint8_t* src, - int ld_src, - uint8_t* dst, - int ld_dst); -template void transpose_matrix( - int M, - int N, - const int8_t* src, - int ld_src, - int8_t* dst, - int ld_dst); - -template void transpose_matrix(float* ref, int n, int k); -template void transpose_matrix(int32_t* ref, int n, int k); -template void 
transpose_matrix(uint8_t* ref, int n, int k); -template void transpose_matrix(int8_t* ref, int n, int k); - } // namespace fbgemm diff --git a/test/TestUtils.h b/test/TestUtils.h index 9f4a504c15..a049149a93 100644 --- a/test/TestUtils.h +++ b/test/TestUtils.h @@ -30,26 +30,4 @@ int compare_validate_buffers( template bool check_all_zero_entries(const T* test, int m, int n); -/* - * @brief In-place transposition for nxk matrix ref. - * @param n number of rows in input (number of columns in output) - * @param k number of columns in input (number of rows in output) - */ -template -void transpose_matrix(T* ref, int n, int k); - -/* - * @brief Out-of-place transposition for M*N matrix ref. - * @param M number of rows in input - * @param K number of columns in input - */ -template -void transpose_matrix( - int M, - int N, - const T* src, - int ld_src, - T* dst, - int ld_dst); - } // namespace fbgemm