From 43570c86ac152572bd87efbb3b7bbce16355af53 Mon Sep 17 00:00:00 2001 From: Jiecao Yu Date: Fri, 5 Feb 2021 21:35:08 -0800 Subject: [PATCH] Back out "fix thread block size issue for cooperlake" Summary: Original commit changeset: 13d4f41de6a9 Reviewed By: jianyuh Differential Revision: D26289112 fbshipit-source-id: 88a4fae5aa74ee9fec2c8b67ddf441e472981c38 --- src/ExecuteKernelU8S8.cc | 18 +++++++----------- src/Utils.cc | 10 +--------- 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index 56ff6cef13..7f943930a5 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -127,17 +127,6 @@ void ExecuteKernel< bool lastKBlock = packedB_.isThisLastKBlock(kBlock % packedB_.blockRows()); bool accum = (kBlock % packedB_.blockRows()) > 0; - int jb_begin, jb_end; - fbgemmPartition1D( - th_info_.n_thread_id, - th_info_.n_num_threads, - bColBlocks, - jb_begin, - jb_end); - if (jb_end == jb_begin) { - return; - } - typename BaseType::jit_micro_kernel_fp fn; const inst_set_t isa = fbgemmInstructionSet(); @@ -214,6 +203,13 @@ void ExecuteKernel< t_start = std::chrono::high_resolution_clock::now(); #endif + int jb_begin, jb_end; + fbgemmPartition1D( + th_info_.n_thread_id, + th_info_.n_num_threads, + bColBlocks, + jb_begin, + jb_end); for (int jb = jb_begin; jb < jb_end; ++jb) { if (jb == bColBlocks - 1) { int nc = ((packedB_.lastBcol() - 1) / nrMinSize_ + 1) * nrMinSize_; diff --git a/src/Utils.cc b/src/Utils.cc index 1074c2070d..ce64cc6603 100644 --- a/src/Utils.cc +++ b/src/Utils.cc @@ -419,21 +419,13 @@ int fbgemmGet2DPartition( // bm: number of rows assigned per thread block (bm = ceil(m/mb)). // bn: number of cols assigned per thread block (bn = ceil(n/nb)). // find mb and nb such that bm / bn is as close as possible to aspect_ratio. 
- - // for large thread numbers, we would like to reduce the aspect_ratio --- - // if the matrix is short-and-fat - // this allows us to assign more parallelism to i-dimension - if (nthreads > 16 && m/n < 0.2) { - aspect_ratio = 0.2; - } int mb = 1; int nb = nthreads / mb; int bm = (m + mb - 1) / mb; int bn = ((n + n_align - 1) / n_align + nb - 1) / nb * n_align; double best_delta = std::abs(static_cast<double>(bm) / bn - aspect_ratio); for (int mb_candidate = 2; mb_candidate <= nthreads; mb_candidate++) { - // so mb does not need to divide nthreads - if (nthreads % mb_candidate != 0 && nthreads <= 16) { + if (nthreads % mb_candidate != 0) { continue; } int nb_candidate = nthreads / mb_candidate;