
Enable n-dimensional parallelization (pytorch#170)
Summary:
Pull Request resolved: pytorch#170

Enable parallelization along the n dimension in FBGEMM. This is useful when the multi-threaded Predictor targets matrix shapes with a large n dimension. With this diff, we can also parallelize the m and n dimensions simultaneously (2D parallelization).

Note that parallelizing along the n dimension adds the overhead of allocating a duplicated packA buffer for each thread.

Reviewed By: ilia-cher

Differential Revision: D17696021

fbshipit-source-id: ec57980e033aa774935e842cf53cf1326db715c2
jianyuh authored and facebook-github-bot committed Nov 14, 2019
1 parent b1b2f18 commit 4147a9c
Showing 4 changed files with 174 additions and 27 deletions.
52 changes: 52 additions & 0 deletions include/fbgemm/Utils.h
@@ -162,6 +162,58 @@ struct FBGEMM_API BlockingFactors {
int NCB;
};

/**
 * @brief A struct to represent the thread partition information along the
 * g, m, and n dimensions.
*/
struct FBGEMM_API thread_type_t {
int g_num_threads;
int m_num_threads;
int n_num_threads;
int g_thread_id;
int m_thread_id;
int n_thread_id;

std::string toString() const {
std::string out = "";
out += "g num threads: " + std::to_string(g_num_threads) + ", ";
out += "m num threads: " + std::to_string(m_num_threads) + ", ";
out += "n num threads: " + std::to_string(n_num_threads) + ", ";
out += "g thread id: " + std::to_string(g_thread_id) + ", ";
out += "m thread id: " + std::to_string(m_thread_id) + ", ";
out += "n thread id: " + std::to_string(n_thread_id);
return out;
}
};

/**
* @brief A heuristic algorithm to partition the threads across m and n
* dimensions for parallelization, ensuring the ratio between the number of rows
* allocated to each thread in the m dimension and the number of columns
 * allocated to each thread in the n dimension is approximately aspect_ratio.
 * Returns the number of thread blocks along the m dimension, which always
 * evenly divides nthreads.
*
 * The smaller aspect_ratio is, the more favorable it is to parallelize the
 * m dimension over the n dimension.
*/
FBGEMM_API int fbgemmGet2DPartition(
int m,
int n,
int nthreads,
int n_align,
double aspect_ratio);

/**
* @brief A heuristic way to partition the threads across g, m and n dimensions
* for parallelization.
*/
FBGEMM_API thread_type_t fbgemmGetThreadPartition(
int g,
int m,
int n,
    int thread_id,
    int num_threads,
int n_align = 64);

template <int SIZE, typename T = std::int32_t>
FBGEMM_API std::string arrayToString(const std::array<T, SIZE>& inp) {
std::string out = "[";
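For illustration, a minimal sketch of how a caller might combine these two new APIs, mirroring what fbgemmPacked() does in src/Fbgemm.cc below. The helper name printWorkShare and the MR value are assumptions for the example, not part of this diff; the sketch also assumes the existing 1D helpers fbgemmPartition1D and fbgemmPartition1DBlocked are visible through fbgemm/Utils.h.

#include <iostream>
#include "fbgemm/Utils.h"

// Illustrative helper (not in FBGEMM): print the slice of work one thread owns.
void printWorkShare(int G, int M, int N, int thread_id, int num_threads) {
  fbgemm::thread_type_t th_info =
      fbgemm::fbgemmGetThreadPartition(G, M, N, thread_id, num_threads);
  int g_begin, g_end, i_begin, i_end;
  // Contiguous slice of the groups for this thread's g coordinate.
  fbgemm::fbgemmPartition1D(
      th_info.g_thread_id, th_info.g_num_threads, G, g_begin, g_end);
  // MR-aligned slice of the rows for this thread's m coordinate; MR = 12 is
  // an assumed register-blocking value used only for this example.
  constexpr int MR = 12;
  fbgemm::fbgemmPartition1DBlocked(
      th_info.m_thread_id, th_info.m_num_threads, M, MR, i_begin, i_end);
  std::cout << th_info.toString() << " -> groups [" << g_begin << ", "
            << g_end << "), rows [" << i_begin << ", " << i_end << ")\n";
}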
4 changes: 3 additions & 1 deletion src/ExecuteKernelU8S8.cc
@@ -172,7 +172,9 @@ void ExecuteKernel<
t_start = std::chrono::high_resolution_clock::now();
#endif

-  for (int jb = 0; jb < bColBlocks; ++jb) {
+  int jb_begin, jb_end;
+  fbgemmPartition1D(thread_id_, num_threads_, bColBlocks, jb_begin, jb_end);
+  for (int jb = jb_begin; jb < jb_end; ++jb) {
if (jb == bColBlocks - 1) {
int nc = ((packedB_.lastBcol() - 1) / nrMinSize_ + 1) * nrMinSize_;
if (nc != nbSize_) {
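The change above relies on fbgemmPartition1D giving each thread a contiguous slice of the bColBlocks column blocks. A minimal sketch of the assumed semantics, balanced to within one item; this is a reconstruction for illustration, not FBGEMM's actual implementation:

#include <algorithm>

// Assumed semantics: thread `tid` of `nthreads` receives the half-open range
// [begin, end) out of `work` items; the first (work % nthreads) threads each
// take one extra item.
void partition1D(int tid, int nthreads, int work, int& begin, int& end) {
  int base = work / nthreads; // minimum items per thread
  int rem = work % nthreads;  // leftover items spread over the first threads
  begin = tid * base + std::min(tid, rem);
  end = begin + base + (tid < rem ? 1 : 0);
}

For example, splitting 10 column blocks over 4 threads yields the ranges [0,3), [3,6), [6,8), and [8,10).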
41 changes: 15 additions & 26 deletions src/Fbgemm.cc
@@ -118,6 +118,7 @@ void fbgemmPacked(

int MDim = packA.numRows();
int KDimPerGroup = packB.numRows() / G;
+  int NDim = packB.numCols();

int kBlocks = (KDimPerGroup + KCB - 1) / KCB;

@@ -136,31 +136,19 @@
t_very_start = std::chrono::high_resolution_clock::now();
#endif

+  thread_type_t th_info =
+      fbgemmGetThreadPartition(G, MDim, NDim, thread_id, num_threads);
+  // if (thread_id == 0)
+  //   std::cout << ", " << th_info.toString();

int g_begin, g_end, i_begin, i_end;
-  if (G >= num_threads) {
-    // When G >= nthreads, just parallelize over G
-    // TODO: when G == nthreads + 1, we'll have a big load imbalance because
-    // only one thread will get 2 groups.
-    fbgemmPartition1D(thread_id, num_threads, G, g_begin, g_end);
-    i_begin = 0;
-    i_end = MDim;
-  } else {
-    // Otherwise, each group is parallelized by multiple threads.
-    // nthreads_per_group is floor(nthreads / G).
-    // If we use ceil, some groups won't be handled by any thread.
-    int nthreads_per_group = num_threads / G;
-    g_begin = std::max(std::min(thread_id / nthreads_per_group, G - 1), 0);
-    g_end = std::min(g_begin + 1, G);
-
-    int tid_of_g_begin = std::min(g_begin * nthreads_per_group, num_threads);
-    int tid_of_g_end = std::min(
-        (g_end == G) ? num_threads : (tid_of_g_begin + nthreads_per_group),
-        num_threads);
-    int nthreads_within_group = tid_of_g_end - tid_of_g_begin;
-    int tid_within_group = thread_id - tid_of_g_begin;
-    fbgemmPartition1DBlocked(
-        tid_within_group, nthreads_within_group, MDim, MR, i_begin, i_end);
-  }

+  // Calculate the begin and end index along the group dimension
+  fbgemmPartition1D(
+      th_info.g_thread_id, th_info.g_num_threads, G, g_begin, g_end);
+  // Calculate the begin and end index along the m dimension
+  fbgemmPartition1DBlocked(
+      th_info.m_thread_id, th_info.m_num_threads, MDim, MR, i_begin, i_end);

for (int g = g_begin; g < g_end; ++g) {
ExecuteKernel<packingAMatrix, packingBMatrix, cT, processOutputType>
@@ -171,8 +160,8 @@
C_buffer,
ldc,
outProcess,
-            thread_id,
-            num_threads,
+            th_info.n_thread_id,
+            th_info.n_num_threads,
blocking_params);
for (int i = i_begin; i < i_end; i += MCB) { // i is the element index
mc = std::min(i_end - i, MCB);
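The m dimension above is split with fbgemmPartition1DBlocked so that each thread's range starts on an MR (register-block) boundary. A sketch of the assumed behavior, again a reconstruction for illustration rather than FBGEMM's actual code:

#include <algorithm>

// Assumed semantics: each thread receives a whole number of `block_size`
// blocks, clamped to `work`; trailing threads may end up with empty ranges.
void partition1DBlocked(
    int tid, int nthreads, int work, int block_size, int& begin, int& end) {
  int nblocks = (work + block_size - 1) / block_size;   // round up
  int per_thread = (nblocks + nthreads - 1) / nthreads; // blocks per thread
  begin = std::min(tid * per_thread * block_size, work);
  end = std::min(begin + per_thread * block_size, work);
}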
104 changes: 104 additions & 0 deletions src/Utils.cc
@@ -259,4 +259,108 @@ void* fbgemmAlignedAlloc(
return aligned_mem;
}

int fbgemmGet2DPartition(
int m,
int n,
int nthreads,
int n_align,
double aspect_ratio) {
// mb: number of thread blocks within a socket along m.
// nb: number of thread blocks along n.
// mb * nb = nthreads.
// bm: number of rows assigned per thread block (bm = ceil(m/mb)).
// bn: number of cols assigned per thread block (bn = ceil(n/nb)).
// find mb and nb such that bm / bn is as close as possible to aspect_ratio.
int mb = 1;
int nb = nthreads / mb;
int bm = (m + mb - 1) / mb;
int bn = ((n + n_align - 1) / n_align + nb - 1) / nb * n_align;
double best_delta = std::abs(static_cast<double>(bm) / bn - aspect_ratio);
for (int mb_candidate = 2; mb_candidate <= nthreads; mb_candidate++) {
if (nthreads % mb_candidate != 0) {
continue;
}
int nb_candidate = nthreads / mb_candidate;
if ((n + nb_candidate - 1) / nb_candidate <= n_align / 2) {
continue;
}
int bm_candidate = (m + mb_candidate - 1) / mb_candidate;
int bn_candidate = ((n + n_align - 1) / n_align + nb_candidate - 1) /
nb_candidate * n_align;
double delta = std::abs(
static_cast<double>(bm_candidate) / bn_candidate - aspect_ratio);
if (delta < best_delta) {
best_delta = delta;
mb = mb_candidate;
} else {
break;
}
}
return mb;
}
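To make the search concrete, here is a worked trace for one illustrative input (the numbers are chosen for the example, not taken from the diff): fbgemmGet2DPartition(m = 512, n = 2048, nthreads = 4, n_align = 64, aspect_ratio = 0.5).

// mb = 1: nb = 4, bm = 512, bn = ((2048+63)/64 + 3)/4 * 64 =  8*64 =  512
//         bm/bn = 1.0    -> delta = 0.5    (initial best)
// mb = 2: nb = 2, bm = 256, bn = (32+1)/2 * 64             = 16*64 = 1024
//         bm/bn = 0.25   -> delta = 0.25   (better; keep mb = 2)
// mb = 3: skipped because nthreads % 3 != 0
// mb = 4: nb = 1, bm = 128, bn = 32 * 64                   =        2048
//         bm/bn = 0.0625 -> delta = 0.4375 (worse; break)
// Result: mb = 2, i.e. a 2x2 grid with 2 thread blocks along m and 2 along n.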

thread_type_t fbgemmGetThreadPartition(
int g,
int m,
int n,
int thread_id,
int num_threads,
int n_align) {
assert(num_threads >= 1);

// Fast path for the single thread case.
if (num_threads == 1) {
return thread_type_t{1, 1, 1, 0, 0, 0};
}

thread_type_t th_info;

  // Heuristic for determining the thread partitions for parallelizing across
  // the g, m, and n dimensions.
  // TODO: smarter thread partitioning that also considers the grain-size
  // (MR, NR) parameters.
if (g > num_threads) {
// TODO: when G == nthreads + 1, we'll have a big load imbalance because
// only one thread will get 2 groups.
th_info.g_num_threads = num_threads;
} else {
if (num_threads % g == 0) {
th_info.g_num_threads = g;
} else {
th_info.g_num_threads = 1;
}
}
num_threads /= th_info.g_num_threads;

  // We favor parallelizing the m dimension over the n dimension, so we set
  // aspect_ratio to 0.5 here.
th_info.m_num_threads = fbgemmGet2DPartition(m, n, num_threads, n_align, 0.5);

assert(num_threads % (th_info.m_num_threads) == 0);
th_info.n_num_threads = num_threads / th_info.m_num_threads;

  // For example, with 12 threads (num_threads = 12), g_num_threads = 2, and
  // m_num_threads = 2, the threads are organized in the following 2x2x3
  // layout, where thread ids are assigned with n varying fastest, then m,
  // then g (row-major order in the 2D case):
//
// thread 0, thread 1, thread 2 thread 6, thread 7, thread 8
// thread 3, thread 4, thread 5 thread 9, thread 10, thread 11
//
// And the corresponding (g_thread_id, m_thread_id, n_thread_id) for
// each thread is listed as the following:
//
// (0, 0, 0), (0, 0, 1), (0, 0, 2) (1, 0, 0), (1, 0, 1), (1, 0, 2)
// (0, 1, 0), (0, 1, 1), (0, 1, 2) (1, 1, 0), (1, 1, 1), (1, 1, 2)

  // We can view the thread id as a 3-digit mixed-radix number with bases
  // {g, m, n}_num_threads.
th_info.n_thread_id = thread_id % th_info.n_num_threads;
thread_id /= th_info.n_num_threads;
th_info.m_thread_id = thread_id % th_info.m_num_threads;
thread_id /= th_info.m_num_threads;
th_info.g_thread_id = thread_id % th_info.g_num_threads;

return th_info;
}
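As a quick check of the mixed-radix decode, take the 2x2x3 example from the comment above: thread 7 should map to (g_thread_id, m_thread_id, n_thread_id) = (1, 0, 1). A self-contained sketch:

#include <cassert>

int main() {
  // Bases from the example: n_num_threads = 3, m_num_threads = 2,
  // g_num_threads = 2; n varies fastest.
  int thread_id = 7;
  int n_thread_id = thread_id % 3; // 7 % 3 = 1
  thread_id /= 3;                  // 2
  int m_thread_id = thread_id % 2; // 2 % 2 = 0
  thread_id /= 2;                  // 1
  int g_thread_id = thread_id % 2; // 1 % 2 = 1
  // Matches the (1, 0, 1) entry in the layout table above.
  assert(g_thread_id == 1 && m_thread_id == 0 && n_thread_id == 1);
  return 0;
}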

} // namespace fbgemm
