
Enable n-dimensional parallelization (pytorch#170)
Summary:
Pull Request resolved: pytorch#170

Enable parallelization along the n dimension in FBGEMM. This is useful when the multi-threaded Predictor targets matrix shapes with a large n dimension. With this diff, we can also parallelize the m and n dimensions simultaneously (2D parallelization).

Note that parallelizing along the n dimension adds the overhead of allocating a duplicated packA buffer for each thread.

Reviewed By: ilia-cher

Differential Revision: D17696021

fbshipit-source-id: ec57980e033aa774935e842cf53cf1326db715c2
jianyuh authored and facebook-github-bot committed Nov 14, 2019
1 parent b1b2f18 commit 4147a9c
Showing 4 changed files with 174 additions and 27 deletions.
52 changes: 52 additions & 0 deletions include/fbgemm/Utils.h
@@ -162,6 +162,58 @@ struct FBGEMM_API BlockingFactors {
int NCB;
};

/**
 * @brief A struct to represent the thread partition information along the
 * g, m, and n dimensions.
*/
struct FBGEMM_API thread_type_t {
int g_num_threads;
int m_num_threads;
int n_num_threads;
int g_thread_id;
int m_thread_id;
int n_thread_id;

std::string toString() const {
std::string out = "";
out += "g num threads: " + std::to_string(g_num_threads) + ", ";
out += "m num threads: " + std::to_string(m_num_threads) + ", ";
out += "n num threads: " + std::to_string(n_num_threads) + ", ";
out += "g thread id: " + std::to_string(g_thread_id) + ", ";
out += "m thread id: " + std::to_string(m_thread_id) + ", ";
out += "n thread id: " + std::to_string(n_thread_id);
return out;
}
};

/**
* @brief A heuristic algorithm to partition the threads across m and n
* dimensions for parallelization, ensuring the ratio between the number of rows
* allocated to each thread in the m dimension and the number of columns
 * allocated to each thread in the n dimension is approximately aspect_ratio.
 * Returns the number of thread blocks along the m dimension, which always
 * evenly divides nthreads.
*
 * The smaller aspect_ratio is, the more favorable it is to parallelize the
 * m dimension over the n dimension.
*/
FBGEMM_API int fbgemmGet2DPartition(
int m,
int n,
int nthreads,
int n_align,
double aspect_ratio);

/**
* @brief A heuristic way to partition the threads across g, m and n dimensions
* for parallelization.
*/
FBGEMM_API thread_type_t fbgemmGetThreadPartition(
int g,
int m,
int n,
    int thread_id,
    int num_threads,
int n_align = 64);

template <int SIZE, typename T = std::int32_t>
FBGEMM_API std::string arrayToString(const std::array<T, SIZE>& inp) {
std::string out = "[";
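For illustration, a minimal sketch of how a caller might combine these two new APIs, mirroring what fbgemmPacked() does in src/Fbgemm.cc below. The helper name printWorkShare and the MR value are assumptions for the example, not part of this diff; the sketch also assumes the existing 1D helpers fbgemmPartition1D and fbgemmPartition1DBlocked are visible through fbgemm/Utils.h.

#include <iostream>
#include "fbgemm/Utils.h"

// Illustrative helper (not in FBGEMM): print the slice of work one thread owns.
void printWorkShare(int G, int M, int N, int thread_id, int num_threads) {
  fbgemm::thread_type_t th_info =
      fbgemm::fbgemmGetThreadPartition(G, M, N, thread_id, num_threads);
  int g_begin, g_end, i_begin, i_end;
  // Contiguous slice of the groups for this thread's g coordinate.
  fbgemm::fbgemmPartition1D(
      th_info.g_thread_id, th_info.g_num_threads, G, g_begin, g_end);
  // MR-aligned slice of the rows for this thread's m coordinate; MR = 12 is
  // an assumed register-blocking value used only for this example.
  constexpr int MR = 12;
  fbgemm::fbgemmPartition1DBlocked(
      th_info.m_thread_id, th_info.m_num_threads, M, MR, i_begin, i_end);
  std::cout << th_info.toString() << " -> groups [" << g_begin << ", "
            << g_end << "), rows [" << i_begin << ", " << i_end << ")\n";
}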
4 changes: 3 additions & 1 deletion src/ExecuteKernelU8S8.cc
@@ -172,7 +172,9 @@ void ExecuteKernel<
t_start = std::chrono::high_resolution_clock::now();
#endif

-  for (int jb = 0; jb < bColBlocks; ++jb) {
+  int jb_begin, jb_end;
+  fbgemmPartition1D(thread_id_, num_threads_, bColBlocks, jb_begin, jb_end);
+  for (int jb = jb_begin; jb < jb_end; ++jb) {
if (jb == bColBlocks - 1) {
int nc = ((packedB_.lastBcol() - 1) / nrMinSize_ + 1) * nrMinSize_;
if (nc != nbSize_) {
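The change above relies on fbgemmPartition1D giving each thread a contiguous slice of the bColBlocks column blocks. A minimal sketch of the assumed semantics, balanced to within one item; this is a reconstruction for illustration, not FBGEMM's actual implementation:

#include <algorithm>

// Assumed semantics: thread `tid` of `nthreads` receives the half-open range
// [begin, end) out of `work` items; the first (work % nthreads) threads each
// take one extra item.
void partition1D(int tid, int nthreads, int work, int& begin, int& end) {
  int base = work / nthreads; // minimum items per thread
  int rem = work % nthreads;  // leftover items spread over the first threads
  begin = tid * base + std::min(tid, rem);
  end = begin + base + (tid < rem ? 1 : 0);
}

For example, splitting 10 column blocks over 4 threads yields the ranges [0,3), [3,6), [6,8), and [8,10).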
41 changes: 15 additions & 26 deletions src/Fbgemm.cc
@@ -118,6 +118,7 @@ void fbgemmPacked(

int MDim = packA.numRows();
int KDimPerGroup = packB.numRows() / G;
+  int NDim = packB.numCols();

int kBlocks = (KDimPerGroup + KCB - 1) / KCB;

@@ -136,31 +136,19 @@
t_very_start = std::chrono::high_resolution_clock::now();
#endif

+  thread_type_t th_info =
+      fbgemmGetThreadPartition(G, MDim, NDim, thread_id, num_threads);
+  // if (thread_id == 0)
+  //   std::cout << ", " << th_info.toString();

int g_begin, g_end, i_begin, i_end;
-  if (G >= num_threads) {
-    // When G >= nthreads, just parallelize over G
-    // TODO: when G == nthreads + 1, we'll have a big load imbalance because
-    // only one thread will get 2 groups.
-    fbgemmPartition1D(thread_id, num_threads, G, g_begin, g_end);
-    i_begin = 0;
-    i_end = MDim;
-  } else {
-    // Otherwise, each group is parallelized by multiple threads.
-    // nthreads_per_group is floor(nthreads / G).
-    // If we use ceil, some groups won't be handled by any thread.
-    int nthreads_per_group = num_threads / G;
-    g_begin = std::max(std::min(thread_id / nthreads_per_group, G - 1), 0);
-    g_end = std::min(g_begin + 1, G);
-
-    int tid_of_g_begin = std::min(g_begin * nthreads_per_group, num_threads);
-    int tid_of_g_end = std::min(
-        (g_end == G) ? num_threads : (tid_of_g_begin + nthreads_per_group),
-        num_threads);
-    int nthreads_within_group = tid_of_g_end - tid_of_g_begin;
-    int tid_within_group = thread_id - tid_of_g_begin;
-    fbgemmPartition1DBlocked(
-        tid_within_group, nthreads_within_group, MDim, MR, i_begin, i_end);
-  }

+  // Calculate the begin and end index along the group dimension
+  fbgemmPartition1D(
+      th_info.g_thread_id, th_info.g_num_threads, G, g_begin, g_end);
+  // Calculate the begin and end index along the m dimension
+  fbgemmPartition1DBlocked(
+      th_info.m_thread_id, th_info.m_num_threads, MDim, MR, i_begin, i_end);

for (int g = g_begin; g < g_end; ++g) {
ExecuteKernel<packingAMatrix, packingBMatrix, cT, processOutputType>
@@ -171,8 +160,8 @@
C_buffer,
ldc,
outProcess,
-            thread_id,
-            num_threads,
+            th_info.n_thread_id,
+            th_info.n_num_threads,
blocking_params);
for (int i = i_begin; i < i_end; i += MCB) { // i is the element index
mc = std::min(i_end - i, MCB);
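The m dimension above is split with fbgemmPartition1DBlocked so that each thread's range starts on an MR (register-block) boundary. A sketch of the assumed behavior, again a reconstruction for illustration rather than FBGEMM's actual code:

#include <algorithm>

// Assumed semantics: each thread receives a whole number of `block_size`
// blocks, clamped to `work`; trailing threads may end up with empty ranges.
void partition1DBlocked(
    int tid, int nthreads, int work, int block_size, int& begin, int& end) {
  int nblocks = (work + block_size - 1) / block_size;   // round up
  int per_thread = (nblocks + nthreads - 1) / nthreads; // blocks per thread
  begin = std::min(tid * per_thread * block_size, work);
  end = std::min(begin + per_thread * block_size, work);
}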
104 changes: 104 additions & 0 deletions src/Utils.cc
@@ -259,4 +259,108 @@ void* fbgemmAlignedAlloc(
return aligned_mem;
}

int fbgemmGet2DPartition(
int m,
int n,
int nthreads,
int n_align,
double aspect_ratio) {
// mb: number of thread blocks within a socket along m.
// nb: number of thread blocks along n.
// mb * nb = nthreads.
// bm: number of rows assigned per thread block (bm = ceil(m/mb)).
// bn: number of cols assigned per thread block (bn = ceil(n/nb)).
// find mb and nb such that bm / bn is as close as possible to aspect_ratio.
int mb = 1;
int nb = nthreads / mb;
int bm = (m + mb - 1) / mb;
int bn = ((n + n_align - 1) / n_align + nb - 1) / nb * n_align;
double best_delta = std::abs(static_cast<double>(bm) / bn - aspect_ratio);
for (int mb_candidate = 2; mb_candidate <= nthreads; mb_candidate++) {
if (nthreads % mb_candidate != 0) {
continue;
}
int nb_candidate = nthreads / mb_candidate;
if ((n + nb_candidate - 1) / nb_candidate <= n_align / 2) {
continue;
}
int bm_candidate = (m + mb_candidate - 1) / mb_candidate;
int bn_candidate = ((n + n_align - 1) / n_align + nb_candidate - 1) /
nb_candidate * n_align;
double delta = std::abs(
static_cast<double>(bm_candidate) / bn_candidate - aspect_ratio);
if (delta < best_delta) {
best_delta = delta;
mb = mb_candidate;
} else {
break;
}
}
return mb;
}
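To make the search concrete, here is a worked trace for one illustrative input (the numbers are chosen for the example, not taken from the diff): fbgemmGet2DPartition(m = 512, n = 2048, nthreads = 4, n_align = 64, aspect_ratio = 0.5).

// mb = 1: nb = 4, bm = 512, bn = ((2048+63)/64 + 3)/4 * 64 =  8*64 =  512
//         bm/bn = 1.0    -> delta = 0.5    (initial best)
// mb = 2: nb = 2, bm = 256, bn = (32+1)/2 * 64             = 16*64 = 1024
//         bm/bn = 0.25   -> delta = 0.25   (better; keep mb = 2)
// mb = 3: skipped because nthreads % 3 != 0
// mb = 4: nb = 1, bm = 128, bn = 32 * 64                   =        2048
//         bm/bn = 0.0625 -> delta = 0.4375 (worse; break)
// Result: mb = 2, i.e. a 2x2 grid with 2 thread blocks along m and 2 along n.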

thread_type_t fbgemmGetThreadPartition(
int g,
int m,
int n,
int thread_id,
int num_threads,
int n_align) {
assert(num_threads >= 1);

// Fast path for the single thread case.
if (num_threads == 1) {
return thread_type_t{1, 1, 1, 0, 0, 0};
}

thread_type_t th_info;

  // Heuristic for determining the thread partitions for parallelizing across
  // the g, m, and n dimensions.
  // TODO: smarter thread partitioning that also considers the grain-size
  // (MR, NR) parameters.
if (g > num_threads) {
// TODO: when G == nthreads + 1, we'll have a big load imbalance because
// only one thread will get 2 groups.
th_info.g_num_threads = num_threads;
} else {
if (num_threads % g == 0) {
th_info.g_num_threads = g;
} else {
th_info.g_num_threads = 1;
}
}
num_threads /= th_info.g_num_threads;

  // We favor parallelizing the m dimension over the n dimension, so we set
  // aspect_ratio to 0.5 here.
th_info.m_num_threads = fbgemmGet2DPartition(m, n, num_threads, n_align, 0.5);

assert(num_threads % (th_info.m_num_threads) == 0);
th_info.n_num_threads = num_threads / th_info.m_num_threads;

  // For example, with 12 threads (num_threads = 12), g_num_threads = 2, and
  // m_num_threads = 2, the threads are organized in the following 2x2x3
  // layout, where thread ids are assigned with n varying fastest, then m,
  // then g (row-major order in the 2D case):
//
// thread 0, thread 1, thread 2 thread 6, thread 7, thread 8
// thread 3, thread 4, thread 5 thread 9, thread 10, thread 11
//
// And the corresponding (g_thread_id, m_thread_id, n_thread_id) for
// each thread is listed as the following:
//
// (0, 0, 0), (0, 0, 1), (0, 0, 2) (1, 0, 0), (1, 0, 1), (1, 0, 2)
// (0, 1, 0), (0, 1, 1), (0, 1, 2) (1, 1, 0), (1, 1, 1), (1, 1, 2)

  // We can view the thread id as a 3-digit mixed-radix number with bases
  // {g, m, n}_num_threads.
th_info.n_thread_id = thread_id % th_info.n_num_threads;
thread_id /= th_info.n_num_threads;
th_info.m_thread_id = thread_id % th_info.m_num_threads;
thread_id /= th_info.m_num_threads;
th_info.g_thread_id = thread_id % th_info.g_num_threads;

return th_info;
}
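As a quick check of the mixed-radix decode, take the 2x2x3 example from the comment above: thread 7 should map to (g_thread_id, m_thread_id, n_thread_id) = (1, 0, 1). A self-contained sketch:

#include <cassert>

int main() {
  // Bases from the example: n_num_threads = 3, m_num_threads = 2,
  // g_num_threads = 2; n varies fastest.
  int thread_id = 7;
  int n_thread_id = thread_id % 3; // 7 % 3 = 1
  thread_id /= 3;                  // 2
  int m_thread_id = thread_id % 2; // 2 % 2 = 0
  thread_id /= 2;                  // 1
  int g_thread_id = thread_id % 2; // 1 % 2 = 1
  // Matches the (1, 0, 1) entry in the layout table above.
  assert(g_thread_id == 1 && m_thread_id == 0 && n_thread_id == 1);
  return 0;
}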

} // namespace fbgemm
