Skip to content

Commit

Permalink
Back out "fix thread block size issue for cooperlake"
Browse files: browse the repository at this point in the history
Summary: Original commit changeset: 13d4f41de6a9

Reviewed By: jianyuh

Differential Revision: D26289112

fbshipit-source-id: 88a4fae5aa74ee9fec2c8b67ddf441e472981c38
  • Loading branch information
jiecaoyu authored and facebook-github-bot committed Feb 6, 2021
1 parent fc29382 commit 43570c8
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 20 deletions.
18 changes: 7 additions & 11 deletions src/ExecuteKernelU8S8.cc
Original file line number Diff line number Diff line change
Expand Up @@ -127,17 +127,6 @@ void ExecuteKernel<
bool lastKBlock = packedB_.isThisLastKBlock(kBlock % packedB_.blockRows());
bool accum = (kBlock % packedB_.blockRows()) > 0;

int jb_begin, jb_end;
fbgemmPartition1D(
th_info_.n_thread_id,
th_info_.n_num_threads,
bColBlocks,
jb_begin,
jb_end);
if (jb_end == jb_begin) {
return;
}

typename BaseType::jit_micro_kernel_fp fn;

const inst_set_t isa = fbgemmInstructionSet();
Expand Down Expand Up @@ -214,6 +203,13 @@ void ExecuteKernel<
t_start = std::chrono::high_resolution_clock::now();
#endif

int jb_begin, jb_end;
fbgemmPartition1D(
th_info_.n_thread_id,
th_info_.n_num_threads,
bColBlocks,
jb_begin,
jb_end);
for (int jb = jb_begin; jb < jb_end; ++jb) {
if (jb == bColBlocks - 1) {
int nc = ((packedB_.lastBcol() - 1) / nrMinSize_ + 1) * nrMinSize_;
Expand Down
10 changes: 1 addition & 9 deletions src/Utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -419,21 +419,13 @@ int fbgemmGet2DPartition(
// bm: number of rows assigned per thread block (bm = ceil(m/mb)).
// bn: number of cols assigned per thread block (bn = ceil(n/nb)).
// find mb and nb such that bm / bn is as close as possible to aspect_ratio.

// For large thread counts, reduce the aspect_ratio when the matrix is
// short and fat; this lets us assign more parallelism to the
// i-dimension.
if (nthreads > 16 && m/n < 0.2) {
aspect_ratio = 0.2;
}
int mb = 1;
int nb = nthreads / mb;
int bm = (m + mb - 1) / mb;
int bn = ((n + n_align - 1) / n_align + nb - 1) / nb * n_align;
double best_delta = std::abs(static_cast<double>(bm) / bn - aspect_ratio);
for (int mb_candidate = 2; mb_candidate <= nthreads; mb_candidate++) {
// so mb does not need to divide nthreads
if (nthreads % mb_candidate != 0 && nthreads <= 16) {
if (nthreads % mb_candidate != 0) {
continue;
}
int nb_candidate = nthreads / mb_candidate;
Expand Down

0 comments on commit 43570c8

Please sign in to comment.