Back out "fix thread block size issue for cooperlake"

Summary: Original commit changeset: 13d4f41de6a9

Reviewed By: jianyuh

Differential Revision: D26289112

fbshipit-source-id: 88a4fae5aa74ee9fec2c8b67ddf441e472981c38
venkatacrc · Feb 6, 2021 · 43570c8 · 43570c8
1 parent fc29382
commit 43570c8
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 20 deletions.
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
@@ -127,17 +127,6 @@ void ExecuteKernel<
   bool lastKBlock = packedB_.isThisLastKBlock(kBlock % packedB_.blockRows());
   bool accum = (kBlock % packedB_.blockRows()) > 0;
 
-  int jb_begin, jb_end;
-  fbgemmPartition1D(
-      th_info_.n_thread_id,
-      th_info_.n_num_threads,
-      bColBlocks,
-      jb_begin,
-      jb_end);
-  if (jb_end == jb_begin) {
-    return;
-  }
-
   typename BaseType::jit_micro_kernel_fp fn;
 
   const inst_set_t isa = fbgemmInstructionSet();
@@ -214,6 +203,13 @@ void ExecuteKernel<
   t_start = std::chrono::high_resolution_clock::now();
 #endif
 
+  int jb_begin, jb_end;
+  fbgemmPartition1D(
+      th_info_.n_thread_id,
+      th_info_.n_num_threads,
+      bColBlocks,
+      jb_begin,
+      jb_end);
   for (int jb = jb_begin; jb < jb_end; ++jb) {
     if (jb == bColBlocks - 1) {
       int nc = ((packedB_.lastBcol() - 1) / nrMinSize_ + 1) * nrMinSize_;

diff --git a/src/Utils.cc b/src/Utils.cc
@@ -419,21 +419,13 @@ int fbgemmGet2DPartition(
   // bm: number of rows assigned per thread block (bm = ceil(m/mb)).
   // bn: number of cols assigned per thread block (bn = ceil(n/nb)).
   // find mb and nb such that bm / bn is as close as possible to aspect_ratio.
-
-  // for large thread numbers, we would like to reduce the aspect_ratio ---
-  // if the matrix is short-and-fat
-  // this allows us to assign more parallelism to i-dimension
-  if (nthreads > 16 && m/n < 0.2) {
-    aspect_ratio = 0.2;
-  }
   int mb = 1;
   int nb = nthreads / mb;
   int bm = (m + mb - 1) / mb;
   int bn = ((n + n_align - 1) / n_align + nb - 1) / nb * n_align;
   double best_delta = std::abs(static_cast<double>(bm) / bn - aspect_ratio);
   for (int mb_candidate = 2; mb_candidate <= nthreads; mb_candidate++) {
-    // so mb does not need to divide nthreads
-    if (nthreads % mb_candidate != 0 && nthreads <= 16) {
+    if (nthreads % mb_candidate != 0) {
       continue;
     }
     int nb_candidate = nthreads / mb_candidate;