Add conv_1d (pytorch#369)
Summary:
Pull Request resolved: pytorch#369

1. Add Conv 1D implementation. Currently, 1D still follows the 2D approach, using packing and GEMM ops; only the im2col path is supported for now.
2. Add test case in test/UniConvTest.cc
3. Add test case in bench/ConvUnifiedBenchmark.cc
4. TODO: implement take1DFastPath (a dedicated 1D fast path; it currently always returns false).

Reviewed By: dskhudia

Differential Revision: D21460180

fbshipit-source-id: 992f7b4dc40e9878c8951b4dfd636fe7585c0a8f
Hongzhang Shan authored and facebook-github-bot committed May 15, 2020
1 parent 46981b8 commit 17b31be
Showing 14 changed files with 405 additions and 85 deletions.
37 changes: 32 additions & 5 deletions bench/ConvUnifiedBenchmark.cc
@@ -25,6 +25,17 @@ using namespace std;
using namespace fbgemm;

// clang-format off
// 1D conv shapes
vector<conv_param_t<1>> shapes_1d = {
// MB, IC, OC, IW, G, KW, stride_w, pad_w_left, pad_w_right
// regular
conv_param_t<1>(1, 600, 100, {1}, 1, {3}, {1}, {2, 2}),
conv_param_t<1>(1, 600, 100, {2}, 1, {3}, {1}, {2, 2}),
conv_param_t<1>(1, 600, 100, {3}, 1, {3}, {1}, {2, 2}),
conv_param_t<1>(1, 200, 162, {1}, 1, {3}, {1}, {2, 2}),
conv_param_t<1>(1, 600, 100, {4}, 1, {3}, {1}, {2, 2})
};
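
As a reader-side sanity check (not part of the diff): with G = 1, the im2col lowering the summary describes turns each of these convolutions into a single GEMM. For the last shape above:

// conv_param_t<1>(MB=1, IC=600, OC=100, {IW=4}, G=1, {KW=3}, {stride_w=1}, {pad=2,2})
int OW = (4 + 2 + 2 - 3) / 1 + 1; // output width = 6
int M = 1 * OW;                   // GEMM rows:  MB * OW = 6
int K = 3 * 600;                  // GEMM depth: KW * IC = 1800
int N = 100;                      // GEMM cols:  OC      = 100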

// 2D conv shapes
vector<conv_param_t<2>> shapes_2d = {
// MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w,
@@ -119,23 +130,38 @@ void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) {
if (SPATIAL_DIM == 3) {
header += "IT, ";
}
header += "IH, IW, G, ";
if (SPATIAL_DIM > 1) {
header += "IH, ";
}
header += "IW, G, ";
if (SPATIAL_DIM == 3) {
header += "KT, ";
}
header += "KH, KW, ";
if (SPATIAL_DIM > 1) {
header += "KH, ";
}
header += "KW, ";
if (SPATIAL_DIM == 3) {
header += "stride_t, ";
}
header += "stride_h, stride_w, ";
if (SPATIAL_DIM > 1) {
header += "stride_h, ";
}
header += "stride_w, ";
if (SPATIAL_DIM == 3) {
header += "pad_t, ";
}
header += "pad_h, pad_w, ";
if (SPATIAL_DIM > 1) {
header += "pad_h, ";
}
header += "pad_w, ";
if (SPATIAL_DIM == 3) {
header += "dilation_t, ";
}
header += "dilation_h, dilation_w, ";
if (SPATIAL_DIM > 1) {
header += "dilation_h, ";
}
header += "dilation_w, ";

header += "Type, M, N, K, ";

@@ -375,6 +401,7 @@ int main() {
}
#endif
// performance_test<int16_t>();
performance_test<1, int32_t>(shapes_1d);
performance_test<2, int32_t>(shapes_2d);
performance_test<3, int32_t>(shapes_3d);
return 0;
8 changes: 7 additions & 1 deletion include/fbgemm/Utils.h
@@ -46,7 +46,13 @@ enum class inst_set_t { anyarch, avx2, avx512, avx512_ymm, avx512_vnni };
/**
* @brief Typed enum for optimized paths for convolutions
*/
enum class optimized_conv_t { depthwise, groupwise, pointwise, im2col };
enum class optimized_conv_t {
depthwise,
groupwise,
pointwise,
fastpath1d,
im2col
};

/**
* @brief Typed enum for implementation type.
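
A new enumerator means every switch over optimized_conv_t needs a matching case. A minimal sketch of the dispatch shape (the handler comments are placeholders; the real switch is in src/FbgemmConv.cc below):

switch (ConvFastPath<SPATIAL_DIM, ACC_T>(conv_p)) {
  case optimized_conv_t::depthwise:  /* depthwise kernels */ break;
  case optimized_conv_t::groupwise:  /* groupwise kernels */ break;
  case optimized_conv_t::pointwise:  /* 1x1 conv as plain GEMM */ break;
  case optimized_conv_t::fastpath1d: /* reserved; no 1D kernel yet */ break;
  case optimized_conv_t::im2col:     /* generic fallback */ break;
}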
3 changes: 3 additions & 0 deletions src/ExecuteKernelU8S8.cc
@@ -388,6 +388,7 @@ INSTANTIATE_REQUANT_ACC_T(PackAWithRowOffset);
ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);

#define INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM(ACC_T, RELU) \
INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, 1); \
INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, 2); \
INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, 3);

@@ -449,6 +450,7 @@ INSTANTIATE_REQUANT_FLOAT_RELU(PackAWithQuantRowOffset);
ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);

#define INSTANTIATE_REQUANT_FLOAT_IM2COL_SPATIAL_DIM(ACC_T, RELU) \
INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, 1); \
INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, 2); \
INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, 3);

@@ -546,6 +548,7 @@ INSTANTIATE_MEMCPY_ACC_T(PackAWithRowOffset);
memCopy<>>;

#define INSTANTIATE_MEMCPY_IM2COL_SPATIAL_DIM(ACC_T) \
INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, 1); \
INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, 2); \
INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, 3);

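
These one-line additions are explicit template instantiations: the kernel templates are defined in .cc files, so each SPATIAL_DIM used by callers must be instantiated there, or 1D callers fail at link time. A hypothetical minimal analogue of the pattern (runConv is an illustrative name, not FBGEMM API):

// header: declaration only
template <int SPATIAL_DIM>
void runConv(const conv_param_t<SPATIAL_DIM>& conv_p);

// .cc: definition plus explicit instantiations; before this commit only
// <2> and <3> existed, so conv_param_t<1> callers would not link.
template void runConv<1>(const conv_param_t<1>&);
template void runConv<2>(const conv_param_t<2>&);
template void runConv<3>(const conv_param_t<3>&);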
8 changes: 7 additions & 1 deletion src/Fbgemm.cc
@@ -206,7 +206,9 @@ void fbgemmPacked(

template <int SPATIAL_DIM>
bool fbgemmOptimizedGConv(const conv_param_t<SPATIAL_DIM>& conv_p) {
static_assert(SPATIAL_DIM >= 2, "Unsupported spatial dims");

if (SPATIAL_DIM == 1) return false;

int C_per_G = conv_p.IC / conv_p.G;
int K_per_G = conv_p.OC / conv_p.G;

@@ -247,6 +249,7 @@ bool fbgemmOptimizedGConv(const conv_param_t<SPATIAL_DIM>& conv_p) {
std::bind(areEqual, std::placeholders::_1, 2)));
}

template FBGEMM_API bool fbgemmOptimizedGConv(const conv_param_t<1>& conv_p);
template FBGEMM_API bool fbgemmOptimizedGConv(const conv_param_t<2>& conv_p);
template FBGEMM_API bool fbgemmOptimizedGConv(const conv_param_t<3>& conv_p);

@@ -383,6 +386,7 @@ INSTANTIATE_ACC_T(PackAWithRowOffset);
ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);

#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
INSTANTIATE_Q_GRANS(ACC_T, RELU, 1); \
INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \
INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);

@@ -451,6 +455,7 @@ INSTANTIATE_RELU(PackAWithQuantRowOffset);
ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);

#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
INSTANTIATE_Q_GRANS(ACC_T, RELU, 1); \
INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \
INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);

@@ -588,6 +593,7 @@ INSTANTIATE_ACC_T(PackAWithRowOffset);
const BlockingFactors* blocking_params);

#define INSTANTIATE_SPATIAL_DIM(ACC_T) \
INSTANTIATE_BASE(ACC_T, 1); \
INSTANTIATE_BASE(ACC_T, 2); \
INSTANTIATE_BASE(ACC_T, 3);

20 changes: 16 additions & 4 deletions src/FbgemmConv.cc
@@ -49,6 +49,11 @@ bool takePointWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) {
std::accumulate(conv_p.pad.begin(), conv_p.pad.end(), 0) == 0;
}

template <int SPATIAL_DIM>
bool take1DFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) {
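// Stub for the planned 1D fast path (summary item 4): always false for
// now, so 1D convolutions fall through to the im2col path.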
return false;
}

template <int SPATIAL_DIM, typename ACC_T>
optimized_conv_t ConvFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) {
if (takeDepthWiseFastPath<SPATIAL_DIM, ACC_T>(conv_p)) {
@@ -57,6 +62,8 @@ optimized_conv_t ConvFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) {
return optimized_conv_t::groupwise;
} else if (takePointWiseFastPath<SPATIAL_DIM>(conv_p)) {
return optimized_conv_t::pointwise;
} else if (take1DFastPath<SPATIAL_DIM>(conv_p)) {
return optimized_conv_t::fastpath1d;
} else {
return optimized_conv_t::im2col;
}
@@ -73,10 +80,6 @@ int fbgemmConv(
int thread_id,
int num_threads,
const BlockingFactors* blocking_params) {
static_assert(
SPATIAL_DIM == 2 || SPATIAL_DIM == 3,
"Only 2D and 3D convolutions are supported");

if (!packed_weights.isPackingCompliant(conv_p)) {
std::string msg =
"[FBGEMM_CONV_ERROR] Convolution parameters "
@@ -317,6 +320,9 @@ int fbgemmConv(
blocking_params);
break;
}
case optimized_conv_t::fastpath1d: {
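// No dedicated 1D kernel yet; take1DFastPath() always returns false,
// so this branch is currently unreachable.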
break;
}
case optimized_conv_t::im2col: {
// All other convolutions go through im2col-based implementation
// std::cout << "Im2col path" << std::endl;
@@ -391,6 +397,7 @@ int fbgemmConv(
INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, int32_t);

#define INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, RELU) \
INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 1); \
INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 2); \
INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 3);

@@ -420,10 +427,15 @@ template bool takeDepthWiseFastPath<2, std::int16_t>(
template bool takeDepthWiseFastPath<3, std::int16_t>(
const conv_param_t<3>& conv_p);

template FBGEMM_API optimized_conv_t
ConvFastPath<1, std::int32_t>(const conv_param_t<1>& conv_p);
template FBGEMM_API optimized_conv_t
ConvFastPath<2, std::int32_t>(const conv_param_t<2>& conv_p);
template FBGEMM_API optimized_conv_t
ConvFastPath<3, std::int32_t>(const conv_param_t<3>& conv_p);

template FBGEMM_API optimized_conv_t
ConvFastPath<1, std::int16_t>(const conv_param_t<1>& conv_p);
template FBGEMM_API optimized_conv_t
ConvFastPath<2, std::int16_t>(const conv_param_t<2>& conv_p);
template FBGEMM_API optimized_conv_t
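
With the static_assert gone, a 1D shape now routes through the normal dispatch. A minimal reader-side sketch of what a caller can rely on (assumes ConvFastPath is reachable via the public Fbgemm.h header, and reuses one of the benchmark shapes):

#include "fbgemm/Fbgemm.h"
#include <cassert>
using namespace fbgemm;

conv_param_t<1> conv_p(1, 600, 100, {4}, 1, {3}, {1}, {2, 2});
// G != IC (not depthwise), KW != 1 (not pointwise), and both the groupwise
// check and the 1D fast path return false, so dispatch lands on im2col:
assert(ConvFastPath<1, std::int32_t>(conv_p) == optimized_conv_t::im2col);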
42 changes: 25 additions & 17 deletions src/GroupwiseConvAcc32Avx2.cc
@@ -1453,11 +1453,11 @@ void fbgemmGroupwiseConv(
}

int MB = conv_param.MB;
int OT = SPATIAL_DIM == 2 ? 1 : conv_param.OUT_DIM[SPATIAL_DIM - 3];
int OH = conv_param.OUT_DIM[SPATIAL_DIM - 2];
int OT = SPATIAL_DIM <= 2 ? 1 : conv_param.OUT_DIM[SPATIAL_DIM - 3];
int OH = SPATIAL_DIM == 1 ? 1 : conv_param.OUT_DIM[SPATIAL_DIM - 2];
int OW = conv_param.OUT_DIM[SPATIAL_DIM - 1];
int T = SPATIAL_DIM == 2 ? 1 : conv_param.K[SPATIAL_DIM - 3];
int R = conv_param.K[SPATIAL_DIM - 2];
int T = SPATIAL_DIM <= 2 ? 1 : conv_param.K[SPATIAL_DIM - 3];
int R = SPATIAL_DIM == 1 ? 1 : conv_param.K[SPATIAL_DIM - 2];
int S = conv_param.K[SPATIAL_DIM - 1];
int G = conv_param.G;
int OC = conv_param.OC;
Expand All @@ -1466,8 +1466,8 @@ void fbgemmGroupwiseConv(
int C_per_G = conv_param.IC / G;
int OH_OW = OH * OW;
int OT_OH_OW = OT * OH * OW;
int IT = SPATIAL_DIM == 2 ? 1 : conv_param.IN_DIM[SPATIAL_DIM - 3];
int IH = conv_param.IN_DIM[SPATIAL_DIM - 2];
int IT = SPATIAL_DIM <= 2 ? 1 : conv_param.IN_DIM[SPATIAL_DIM - 3];
int IH = SPATIAL_DIM == 1 ? 1 : conv_param.IN_DIM[SPATIAL_DIM - 2];
int IW = conv_param.IN_DIM[SPATIAL_DIM - 1];
int IH_IW = IH * IW;
int IT_IH_IW = IT * IH * IW;
Expand All @@ -1479,6 +1479,9 @@ void fbgemmGroupwiseConv(
int G_together = PackWeightMatrixForGConv<int8_t, int32_t, SPATIAL_DIM>::
numOfGroupsTogether(conv_param);

if (SPATIAL_DIM == 1) {
throw std::runtime_error("Groupwise 1D not implemented!");
}
if (SPATIAL_DIM == 2) {
// Parallelization:
int batch_start = 0;
@@ -1558,10 +1561,11 @@ void fbgemmGroupwiseConv(
rowOffsetBuf_start_group);

const int32_t* inp = out_start_group;
block_type_t block{i * OT_OH_OW + oh_start * OW,
(oh_end - oh_start) * OW,
g * K_per_G,
G_together * K_per_G};
block_type_t block{
i * OT_OH_OW + oh_start * OW,
(oh_end - oh_start) * OW,
g * K_per_G,
G_together * K_per_G};
int ld_out = G * K_per_G;
int ld_in = G * K_per_G;

@@ -1700,10 +1704,11 @@ void fbgemmGroupwiseConv(
}

const int32_t* inp = out_start_t;
block_type_t block{i * OT_OH_OW + oh_start * OW,
(oh_end - oh_start) * OW,
g * K_per_G,
G_together * K_per_G};
block_type_t block{
i * OT_OH_OW + oh_start * OW,
(oh_end - oh_start) * OW,
g * K_per_G,
G_together * K_per_G};
int ld_out = G * K_per_G;
int ld_in = G * K_per_G;

@@ -1729,9 +1734,9 @@ int rowOffsetBufferSizeGConv(const conv_param_t<SPATIAL_DIM>& conv_param) {
// row offset buffer should be able to hold row offsets for however many
// groups we process at a time.
if (cpuinfo_initialize()) {
int OT = SPATIAL_DIM == 2 ? 1 : conv_param.OUT_DIM[SPATIAL_DIM - 3];
int bufferSize = OT * conv_param.OUT_DIM[SPATIAL_DIM - 2] *
conv_param.OUT_DIM[SPATIAL_DIM - 1];
int OT = SPATIAL_DIM <= 2 ? 1 : conv_param.OUT_DIM[SPATIAL_DIM - 3];
int OH = SPATIAL_DIM == 1 ? 1 : conv_param.OUT_DIM[SPATIAL_DIM - 2];
int bufferSize = OT * OH * conv_param.OUT_DIM[SPATIAL_DIM - 1];
if (fbgemmHasAvx512Support()) {
return conv_param.MB * bufferSize * conv_param.G;
} else if (fbgemmHasAvx2Support()) {
@@ -1746,6 +1751,8 @@
}
}

template FBGEMM_API int rowOffsetBufferSizeGConv<1>(
const conv_param_t<1>& conv_param);
template FBGEMM_API int rowOffsetBufferSizeGConv<2>(
const conv_param_t<2>& conv_param);
template FBGEMM_API int rowOffsetBufferSizeGConv<3>(
@@ -1769,6 +1776,7 @@ template FBGEMM_API int rowOffsetBufferSizeGConv<3>(
INSTANTIATE_BASE(RELU, Q_GRAN, SPATIAL_DIM, int32_t);

#define INSTANTIATE_SPATIAL_DIM(RELU, Q_GRAN) \
INSTANTIATE_BIAS_T(RELU, Q_GRAN, 1); \
INSTANTIATE_BIAS_T(RELU, Q_GRAN, 2); \
INSTANTIATE_BIAS_T(RELU, Q_GRAN, 3);

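
Note that fbgemmGroupwiseConv itself still throws for SPATIAL_DIM == 1; the new rowOffsetBufferSizeGConv<1> instantiation mainly keeps generic callers linking. Its arithmetic degenerates as expected; a worked check on a hypothetical 1D shape (illustrative numbers, not from the diff):

// conv_param_t<1>(MB=1, IC=64, OC=64, {IW=10}, G=8, {KW=3}, {1}, {1, 1})
// OT = 1, OH = 1, OW = (10 + 1 + 1 - 3) / 1 + 1 = 10
// AVX-512 case: MB * (OT * OH * OW) * G = 1 * 10 * 8 = 80 row offsets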
