From 43570c86ac152572bd87efbb3b7bbce16355af53 Mon Sep 17 00:00:00 2001
From: Jiecao Yu <jiecaoyu@fb.com>
Date: Fri, 5 Feb 2021 21:35:08 -0800
Subject: [PATCH] Back out "fix thread block size issue for cooperlake"

Summary: Original commit changeset: 13d4f41de6a9

Reviewed By: jianyuh

Differential Revision: D26289112

fbshipit-source-id: 88a4fae5aa74ee9fec2c8b67ddf441e472981c38
---
 src/ExecuteKernelU8S8.cc | 18 +++++++-----------
 src/Utils.cc             | 10 +---------
 2 files changed, 8 insertions(+), 20 deletions(-)
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
index 56ff6cef13..7f943930a5 100644
--- a/src/ExecuteKernelU8S8.cc
+++ b/src/ExecuteKernelU8S8.cc
@@ -127,17 +127,6 @@ void ExecuteKernel<
   bool lastKBlock = packedB_.isThisLastKBlock(kBlock % packedB_.blockRows());
   bool accum = (kBlock % packedB_.blockRows()) > 0;
 
-  int jb_begin, jb_end;
-  fbgemmPartition1D(
-      th_info_.n_thread_id,
-      th_info_.n_num_threads,
-      bColBlocks,
-      jb_begin,
-      jb_end);
-  if (jb_end == jb_begin) {
-    return;
-  }
-
   typename BaseType::jit_micro_kernel_fp fn;
 
   const inst_set_t isa = fbgemmInstructionSet();
@@ -214,6 +203,13 @@ void ExecuteKernel<
   t_start = std::chrono::high_resolution_clock::now();
 #endif
 
+  int jb_begin, jb_end;
+  fbgemmPartition1D(
+      th_info_.n_thread_id,
+      th_info_.n_num_threads,
+      bColBlocks,
+      jb_begin,
+      jb_end);
   for (int jb = jb_begin; jb < jb_end; ++jb) {
     if (jb == bColBlocks - 1) {
       int nc = ((packedB_.lastBcol() - 1) / nrMinSize_ + 1) * nrMinSize_;
diff --git a/src/Utils.cc b/src/Utils.cc
index 1074c2070d..ce64cc6603 100644
--- a/src/Utils.cc
+++ b/src/Utils.cc
@@ -419,21 +419,13 @@ int fbgemmGet2DPartition(
   // bm: number of rows assigned per thread block (bm = ceil(m/mb)).
   // bn: number of cols assigned per thread block (bn = ceil(n/nb)).
   // find mb and nb such that bm / bn is as close as possible to aspect_ratio.
-
-  // for large thread numbers, we would like to reduce the aspect_ratio ---
-  // if the matrix is short-and-fat
-  // this allows us to assign more parallelism to i-dimension
-  if (nthreads > 16 && m/n < 0.2) {
-    aspect_ratio = 0.2;
-  }
   int mb = 1;
   int nb = nthreads / mb;
   int bm = (m + mb - 1) / mb;
   int bn = ((n + n_align - 1) / n_align + nb - 1) / nb * n_align;
   double best_delta = std::abs(static_cast<double>(bm) / bn - aspect_ratio);
   for (int mb_candidate = 2; mb_candidate <= nthreads; mb_candidate++) {
-    // so mb does not need to divide nthreads
-    if (nthreads % mb_candidate != 0 && nthreads <= 16) {
+    if (nthreads % mb_candidate != 0) {
       continue;
     }
     int nb_candidate = nthreads / mb_candidate;