diff --git a/bench/ConvUnifiedBenchmark.cc b/bench/ConvUnifiedBenchmark.cc
index 21d6191946..338eee8e92 100644
--- a/bench/ConvUnifiedBenchmark.cc
+++ b/bench/ConvUnifiedBenchmark.cc
@@ -25,6 +25,17 @@ using namespace std;
 using namespace fbgemm;
 
 // clang-format off
+// 1D conv shapes
+vector<conv_param_t<1>> shapes_1d = {
+    // MB, IC, OC, IW, G, KW, stride_w, pad_w_left, pad_w_right
+    // regular
+    conv_param_t<1>(1, 600, 100, {1}, 1, {3}, {1}, {2, 2}),
+    conv_param_t<1>(1, 600, 100, {2}, 1, {3}, {1}, {2, 2}),
+    conv_param_t<1>(1, 600, 100, {3}, 1, {3}, {1}, {2, 2}),
+    conv_param_t<1>(1, 200, 162, {1}, 1, {3}, {1}, {2, 2}),
+    conv_param_t<1>(1, 600, 100, {4}, 1, {3}, {1}, {2, 2})
+};
+
 // 2D conv shapes
 vector<conv_param_t<2>> shapes_2d = {
     // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w,
@@ -119,23 +130,38 @@ void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) {
   if (SPATIAL_DIM == 3) {
     header += "IT, ";
   }
-  header += "IH, IW, G, ";
+  if (SPATIAL_DIM > 1) {
+    header += "IH, ";
+  }
+  header += "IW, G, ";
   if (SPATIAL_DIM == 3) {
     header += "KT, ";
   }
-  header += "KH, KW, ";
+  if (SPATIAL_DIM > 1) {
+    header += "KH, ";
+  }
+  header += "KW, ";
   if (SPATIAL_DIM == 3) {
     header += "stride_t, ";
   }
-  header += "stride_h, stride_w, ";
+  if (SPATIAL_DIM > 1) {
+    header += "stride_h, ";
+  }
+  header += "stride_w, ";
   if (SPATIAL_DIM == 3) {
     header += "pad_t, ";
   }
-  header += "pad_h, pad_w, ";
+  if (SPATIAL_DIM > 1) {
+    header += "pad_h, ";
+  }
+  header += "pad_w, ";
   if (SPATIAL_DIM == 3) {
     header += "dilation_t, ";
   }
-  header += "dilation_h, dilation_w, ";
+  if (SPATIAL_DIM > 1) {
+    header += "dilation_h, ";
+  }
+  header += "dilation_w, ";
 
   header += "Type, M, N, K, ";
 
@@ -375,6 +401,7 @@ int main() {
   }
 #endif
   // performance_test();
+  performance_test<1, int32_t>(shapes_1d);
   performance_test<2, int32_t>(shapes_2d);
   performance_test<3, int32_t>(shapes_3d);
   return 0;
diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h
index 8e8020745c..56a0e6b822 100644
--- a/include/fbgemm/Utils.h
+++ b/include/fbgemm/Utils.h
@@ -46,7 +46,13 @@ enum class inst_set_t { anyarch, avx2, avx512, avx512_ymm, avx512_vnni };
 /**
  * @brief Typed enum for optimized paths for convolutions
  */
-enum class optimized_conv_t { depthwise, groupwise, pointwise, im2col };
+enum class optimized_conv_t {
+  depthwise,
+  groupwise,
+  pointwise,
+  fastpath1d,
+  im2col
+};
 
 /**
  * @brief Typed enum for implementation type.
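The new shapes_1d entries exercise very short sequences (IW = 1 to 4) with a 3-wide kernel, stride 1, and symmetric padding of 2, while the fastpath1d enumerator reserves a dispatch slot for a dedicated 1D kernel. A minimal sketch of building one of these shapes and checking the output width that conv_param_t<1> derives (assumes the public fbgemm/Fbgemm.h header; dilation is passed explicitly here):

#include <cassert>
#include <iostream>
#include "fbgemm/Fbgemm.h"

int main() {
  // MB=1, IC=600, OC=100, IW=1, G=1, KW=3, stride_w=1, pad=(2, 2), dilation=1
  fbgemm::conv_param_t<1> p(1, 600, 100, {1}, 1, {3}, {1}, {2, 2}, {1});
  // OW = (IW + pad_l + pad_r - dilation_w * (KW - 1) - 1) / stride_w + 1
  //    = (1 + 2 + 2 - 1 * 2 - 1) / 1 + 1 = 3
  assert(p.OUT_DIM[0] == 3);
  std::cout << "OW = " << p.OUT_DIM[0] << std::endl;
  return 0;
}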
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index 924fa33f02..dd35027af0 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -388,6 +388,7 @@ INSTANTIATE_REQUANT_ACC_T(PackAWithRowOffset); ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); #define INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM(ACC_T, RELU) \ + INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, 1); \ INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, 2); \ INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, 3); @@ -449,6 +450,7 @@ INSTANTIATE_REQUANT_FLOAT_RELU(PackAWithQuantRowOffset); ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); #define INSTANTIATE_REQUANT_FLOAT_IM2COL_SPATIAL_DIM(ACC_T, RELU) \ + INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, 1); \ INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, 2); \ INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, 3); @@ -546,6 +548,7 @@ INSTANTIATE_MEMCPY_ACC_T(PackAWithRowOffset); memCopy<>>; #define INSTANTIATE_MEMCPY_IM2COL_SPATIAL_DIM(ACC_T) \ + INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, 1); \ INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, 2); \ INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, 3); diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc index 763eacc4cf..7c10157a5e 100644 --- a/src/Fbgemm.cc +++ b/src/Fbgemm.cc @@ -206,7 +206,9 @@ void fbgemmPacked( template bool fbgemmOptimizedGConv(const conv_param_t& conv_p) { - static_assert(SPATIAL_DIM >= 2, "Unsupported spatial dims"); + + if (SPATIAL_DIM == 1) return false; + int C_per_G = conv_p.IC / conv_p.G; int K_per_G = conv_p.OC / conv_p.G; @@ -247,6 +249,7 @@ bool fbgemmOptimizedGConv(const conv_param_t& conv_p) { std::bind(areEqual, std::placeholders::_1, 2))); } +template FBGEMM_API bool fbgemmOptimizedGConv(const conv_param_t<1>& conv_p); template FBGEMM_API bool fbgemmOptimizedGConv(const conv_param_t<2>& conv_p); template FBGEMM_API bool fbgemmOptimizedGConv(const conv_param_t<3>& conv_p); @@ -383,6 +386,7 @@ INSTANTIATE_ACC_T(PackAWithRowOffset); ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); #define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 1); \ INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \ INSTANTIATE_Q_GRANS(ACC_T, RELU, 3); @@ -451,6 +455,7 @@ INSTANTIATE_RELU(PackAWithQuantRowOffset); ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); #define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 1); \ INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \ INSTANTIATE_Q_GRANS(ACC_T, RELU, 3); @@ -588,6 +593,7 @@ INSTANTIATE_ACC_T(PackAWithRowOffset); const BlockingFactors* blocking_params); #define INSTANTIATE_SPATIAL_DIM(ACC_T) \ + INSTANTIATE_BASE(ACC_T, 1); \ INSTANTIATE_BASE(ACC_T, 2); \ INSTANTIATE_BASE(ACC_T, 3); diff --git a/src/FbgemmConv.cc b/src/FbgemmConv.cc index f78c326929..935b38d35e 100644 --- a/src/FbgemmConv.cc +++ b/src/FbgemmConv.cc @@ -49,6 +49,11 @@ bool takePointWiseFastPath(const conv_param_t& conv_p) { std::accumulate(conv_p.pad.begin(), conv_p.pad.end(), 0) == 0; } +template +bool take1DFastPath(const conv_param_t& conv_p) { + return false; +} + template optimized_conv_t ConvFastPath(const conv_param_t& conv_p) { if (takeDepthWiseFastPath(conv_p)) { @@ -57,6 +62,8 @@ optimized_conv_t ConvFastPath(const conv_param_t& conv_p) { return optimized_conv_t::groupwise; } else if (takePointWiseFastPath(conv_p)) { return optimized_conv_t::pointwise; + } else if (take1DFastPath(conv_p)) { + return optimized_conv_t::fastpath1d; } else { return optimized_conv_t::im2col; } @@ -73,10 +80,6 @@ int 
fbgemmConv( int thread_id, int num_threads, const BlockingFactors* blocking_params) { - static_assert( - SPATIAL_DIM == 2 || SPATIAL_DIM == 3, - "Only 2D and 3D convolutions are supported"); - if (!packed_weights.isPackingCompliant(conv_p)) { std::string msg = "[FBGEMM_CONV_ERROR] Convolution parameters " @@ -317,6 +320,9 @@ int fbgemmConv( blocking_params); break; } + case optimized_conv_t::fastpath1d: { + break; + } case optimized_conv_t::im2col: { // All other convolutions go through im2col-based implementation // std::cout << "Im2col path" << std::endl; @@ -391,6 +397,7 @@ int fbgemmConv( INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, int32_t); #define INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, RELU) \ + INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 1); \ INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 2); \ INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 3); @@ -420,10 +427,15 @@ template bool takeDepthWiseFastPath<2, std::int16_t>( template bool takeDepthWiseFastPath<3, std::int16_t>( const conv_param_t<3>& conv_p); +template FBGEMM_API optimized_conv_t +ConvFastPath<1, std::int32_t>(const conv_param_t<1>& conv_p); template FBGEMM_API optimized_conv_t ConvFastPath<2, std::int32_t>(const conv_param_t<2>& conv_p); template FBGEMM_API optimized_conv_t ConvFastPath<3, std::int32_t>(const conv_param_t<3>& conv_p); + +template FBGEMM_API optimized_conv_t +ConvFastPath<1, std::int16_t>(const conv_param_t<1>& conv_p); template FBGEMM_API optimized_conv_t ConvFastPath<2, std::int16_t>(const conv_param_t<2>& conv_p); template FBGEMM_API optimized_conv_t diff --git a/src/GroupwiseConvAcc32Avx2.cc b/src/GroupwiseConvAcc32Avx2.cc index 16ee1c6123..274f769f40 100644 --- a/src/GroupwiseConvAcc32Avx2.cc +++ b/src/GroupwiseConvAcc32Avx2.cc @@ -1453,11 +1453,11 @@ void fbgemmGroupwiseConv( } int MB = conv_param.MB; - int OT = SPATIAL_DIM == 2 ? 1 : conv_param.OUT_DIM[SPATIAL_DIM - 3]; - int OH = conv_param.OUT_DIM[SPATIAL_DIM - 2]; + int OT = SPATIAL_DIM <= 2 ? 1 : conv_param.OUT_DIM[SPATIAL_DIM - 3]; + int OH = SPATIAL_DIM == 1 ? 1 : conv_param.OUT_DIM[SPATIAL_DIM - 2]; int OW = conv_param.OUT_DIM[SPATIAL_DIM - 1]; - int T = SPATIAL_DIM == 2 ? 1 : conv_param.K[SPATIAL_DIM - 3]; - int R = conv_param.K[SPATIAL_DIM - 2]; + int T = SPATIAL_DIM <= 2 ? 1 : conv_param.K[SPATIAL_DIM - 3]; + int R = SPATIAL_DIM == 1 ? 1 : conv_param.K[SPATIAL_DIM - 2]; int S = conv_param.K[SPATIAL_DIM - 1]; int G = conv_param.G; int OC = conv_param.OC; @@ -1466,8 +1466,8 @@ void fbgemmGroupwiseConv( int C_per_G = conv_param.IC / G; int OH_OW = OH * OW; int OT_OH_OW = OT * OH * OW; - int IT = SPATIAL_DIM == 2 ? 1 : conv_param.IN_DIM[SPATIAL_DIM - 3]; - int IH = conv_param.IN_DIM[SPATIAL_DIM - 2]; + int IT = SPATIAL_DIM <= 2 ? 1 : conv_param.IN_DIM[SPATIAL_DIM - 3]; + int IH = SPATIAL_DIM == 1 ? 
1 : conv_param.IN_DIM[SPATIAL_DIM - 2]; int IW = conv_param.IN_DIM[SPATIAL_DIM - 1]; int IH_IW = IH * IW; int IT_IH_IW = IT * IH * IW; @@ -1479,6 +1479,9 @@ void fbgemmGroupwiseConv( int G_together = PackWeightMatrixForGConv:: numOfGroupsTogether(conv_param); + if (SPATIAL_DIM == 1) { + throw std::runtime_error("Groupwise 1D not implemented!"); + } if (SPATIAL_DIM == 2) { // Parallelization: int batch_start = 0; @@ -1558,10 +1561,11 @@ void fbgemmGroupwiseConv( rowOffsetBuf_start_group); const int32_t* inp = out_start_group; - block_type_t block{i * OT_OH_OW + oh_start * OW, - (oh_end - oh_start) * OW, - g * K_per_G, - G_together * K_per_G}; + block_type_t block{ + i * OT_OH_OW + oh_start * OW, + (oh_end - oh_start) * OW, + g * K_per_G, + G_together * K_per_G}; int ld_out = G * K_per_G; int ld_in = G * K_per_G; @@ -1700,10 +1704,11 @@ void fbgemmGroupwiseConv( } const int32_t* inp = out_start_t; - block_type_t block{i * OT_OH_OW + oh_start * OW, - (oh_end - oh_start) * OW, - g * K_per_G, - G_together * K_per_G}; + block_type_t block{ + i * OT_OH_OW + oh_start * OW, + (oh_end - oh_start) * OW, + g * K_per_G, + G_together * K_per_G}; int ld_out = G * K_per_G; int ld_in = G * K_per_G; @@ -1729,9 +1734,9 @@ int rowOffsetBufferSizeGConv(const conv_param_t& conv_param) { // row offset buffer should be a able to hold row offsets for however // number of groups we process at a time. if (cpuinfo_initialize()) { - int OT = SPATIAL_DIM == 2 ? 1 : conv_param.OUT_DIM[SPATIAL_DIM - 3]; - int bufferSize = OT * conv_param.OUT_DIM[SPATIAL_DIM - 2] * - conv_param.OUT_DIM[SPATIAL_DIM - 1]; + int OT = SPATIAL_DIM <= 2 ? 1 : conv_param.OUT_DIM[SPATIAL_DIM - 3]; + int OH = SPATIAL_DIM == 1 ? 1 : conv_param.OUT_DIM[SPATIAL_DIM - 2]; + int bufferSize = OT * OH * conv_param.OUT_DIM[SPATIAL_DIM - 1]; if (fbgemmHasAvx512Support()) { return conv_param.MB * bufferSize * conv_param.G; } else if (fbgemmHasAvx2Support()) { @@ -1746,6 +1751,8 @@ int rowOffsetBufferSizeGConv(const conv_param_t& conv_param) { } } +template FBGEMM_API int rowOffsetBufferSizeGConv<1>( + const conv_param_t<1>& conv_param); template FBGEMM_API int rowOffsetBufferSizeGConv<2>( const conv_param_t<2>& conv_param); template FBGEMM_API int rowOffsetBufferSizeGConv<3>( @@ -1769,6 +1776,7 @@ template FBGEMM_API int rowOffsetBufferSizeGConv<3>( INSTANTIATE_BASE(RELU, Q_GRAN, SPATIAL_DIM, int32_t); #define INSTANTIATE_SPATIAL_DIM(RELU, Q_GRAN) \ + INSTANTIATE_BIAS_T(RELU, Q_GRAN, 1); \ INSTANTIATE_BIAS_T(RELU, Q_GRAN, 2); \ INSTANTIATE_BIAS_T(RELU, Q_GRAN, 3); diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc index 524a216716..0af67d2c06 100644 --- a/src/PackAWithIm2Col.cc +++ b/src/PackAWithIm2Col.cc @@ -45,8 +45,6 @@ PackAWithIm2Col::PackAWithIm2Col( conv_p_(conv_p), sdata_(sdata), a_zero_pt_(a_zero_pt) { - static_assert( - SPATIAL_DIM == 2 || SPATIAL_DIM == 3, "unsupported conv dimension "); if (!cpuinfo_initialize()) { throw std::runtime_error("Failed to initialize cpuinfo!"); } @@ -203,11 +201,12 @@ void pack_a_with_im2col_opt( template void PackAWithIm2Col::pack(const block_type_t& block) { - block_type_t block_p = {block.row_start, - block.row_size, - block.col_start, - (block.col_size + row_interleave_B_ - 1) / - row_interleave_B_ * row_interleave_B_}; + block_type_t block_p = { + block.row_start, + block.row_size, + block.col_start, + (block.col_size + row_interleave_B_ - 1) / row_interleave_B_ * + row_interleave_B_}; BaseType::packedBlock(block_p); T* out = BaseType::getBuf(); // accumulate into row offset? 
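The next hunk adds the SPATIAL_DIM == 1 branch to PackAWithIm2Col::pack(): each output position (n, w) becomes one row of G * KW * (IC/G) columns, and positions that fall in the padding are filled with the activation zero point rather than zero. A standalone sketch of that 1D im2col layout for NWC activations (illustrative helper, not the library API; dilation fixed at 1 for brevity):

#include <cstdint>
#include <cstring>
#include <vector>

// Illustrative 1D im2col: A is NWC (N x IW x IC), Ao is (N*OW) x (G * KW * IC/G).
void im2col_1d_sketch(int N, int IW, int IC, int G, int KW, int stride,
                      int pad_l, int OW, std::uint8_t zero_pt,
                      const std::uint8_t* A, std::uint8_t* Ao) {
  const int C_per_G = IC / G;
  for (int n = 0; n < N; ++n) {
    for (int w = 0; w < OW; ++w) {
      for (int g = 0; g < G; ++g) {
        for (int s = 0; s < KW; ++s) {
          const int w_in = -pad_l + w * stride + s;  // dilation assumed 1
          std::uint8_t* dst = Ao + (((n * OW + w) * G + g) * KW + s) * C_per_G;
          if (w_in < 0 || w_in >= IW) {
            std::memset(dst, zero_pt, C_per_G);  // padding uses the zero point
          } else {
            std::memcpy(dst, A + (n * IW + w_in) * IC + g * C_per_G, C_per_G);
          }
        }
      }
    }
  }
}

int main() {
  // One batch, IW=4, IC=2, G=1, KW=3, stride 1, pad 1 on each side -> OW=4.
  std::vector<std::uint8_t> A = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<std::uint8_t> Ao(4 * 1 * 3 * 2);
  im2col_1d_sketch(1, 4, 2, 1, 3, 1, 1, 4, /*zero_pt=*/0, A.data(), Ao.data());
  return 0;
}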
@@ -307,7 +306,46 @@ void PackAWithIm2Col::pack(const block_type_t& block) { } for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { - if (SPATIAL_DIM == 2) { // static if + if (SPATIAL_DIM == 1) { // static if + int n = i / (conv_p_.OUT_DIM[0]); + int w = i % (conv_p_.OUT_DIM[0]); + for (int j = block.col_start; + j < block.col_start + block.col_size + ic_per_group - 1; + j += ic_per_group) { + int j_blk_id = j / ic_per_group; + // max( j_blk_id * IC, START) -> min( END, (j_blk_id + 1) * IC ) + int j_blk_start = std::max(j_blk_id * ic_per_group, block.col_start); + int j_blk_end = std::min( + (j_blk_id + 1) * ic_per_group, block.col_start + block.col_size); + if (j_blk_start >= j_blk_end) { + break; + } + + int grs = j / ic_per_group; + int s = grs % conv_p_.K[0]; + int g = grs / conv_p_.K[0]; + + int w_in = + -conv_p_.pad[0] + w * conv_p_.stride[0] + s * conv_p_.dilation[0]; + if (w_in < 0 || w_in >= conv_p_.IN_DIM[0]) { + // Please note that padding for convolution should be filled with + // zero_pt + std::memset( + out + (i - block.row_start) * BaseType::blockColSize() + + (j_blk_start - block.col_start), + a_zero_pt_, + sizeof(T) * (j_blk_end - j_blk_start)); + } else { + std::memcpy( + out + (i - block.row_start) * BaseType::blockColSize() + + j_blk_start - block.col_start, + sdata_ + (n * conv_p_.IN_DIM[0] + w_in) * conv_p_.IC + + g * ic_per_group + (j_blk_start % ic_per_group), + sizeof(T) * (j_blk_end - j_blk_start)); + } + } + + } else if (SPATIAL_DIM == 2) { // static if int n = i / (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1]); int hw = i % (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1]); int w = hw % conv_p_.OUT_DIM[1]; @@ -485,8 +523,10 @@ int PackAWithIm2Col::rowOffsetBufferSize( } } -template class PackAWithIm2Col; -template class PackAWithIm2Col; +template class PackAWithIm2Col; +template class PackAWithIm2Col; +template class PackAWithIm2Col; +template class PackAWithIm2Col; template class PackAWithIm2Col; template class PackAWithIm2Col; diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc index c271c4c3d5..4277ed4b83 100644 --- a/src/PackBMatrix.cc +++ b/src/PackBMatrix.cc @@ -334,10 +334,11 @@ template void PackBMatrix::unpack( T* origin_buf, const BlockingFactors* params) { - block_type_t blockB{BaseType::packedRowStart(), - BaseType::numPackedRows(), - BaseType::packedColStart(), - BaseType::numPackedCols()}; + block_type_t blockB{ + BaseType::packedRowStart(), + BaseType::numPackedRows(), + BaseType::packedColStart(), + BaseType::numPackedCols()}; pack_unpack_(blockB, origin_buf, BaseType::getBuf(), false, params); } diff --git a/src/PackMatrix.cc b/src/PackMatrix.cc index d1896d8bed..9fa9e6f723 100644 --- a/src/PackMatrix.cc +++ b/src/PackMatrix.cc @@ -85,6 +85,10 @@ template class PackMatrix< uint8_t, int32_t>; +template class PackMatrix< + PackAWithIm2Col, + uint8_t, + int32_t>; template class PackMatrix, uint8_t, int32_t>; template class PackMatrix< PackAWithIm2Col, @@ -99,6 +103,10 @@ template class PackMatrix< template class PackMatrix, int8_t, int32_t>; // int16 accumulation +template class PackMatrix< + PackAWithIm2Col, + uint8_t, + int16_t>; template class PackMatrix, uint8_t, int16_t>; template class PackMatrix< PackAWithIm2Col, diff --git a/src/PackWeightMatrixForGConv.cc b/src/PackWeightMatrixForGConv.cc index bb09023e91..58aa7199ae 100644 --- a/src/PackWeightMatrixForGConv.cc +++ b/src/PackWeightMatrixForGConv.cc @@ -86,8 +86,8 @@ inline int PackWeightMatrixForGConv::unpacked_index_( bool tr) { // Get the full dimensions // Can't use T as 
varname because T is a template parameter. - int F = SPATIAL_DIM == 2 ? 1 : conv_param_.K[SPATIAL_DIM - 3]; - int R = conv_param_.K[SPATIAL_DIM - 2]; + int F = SPATIAL_DIM <= 2 ? 1 : conv_param_.K[SPATIAL_DIM - 3]; + int R = SPATIAL_DIM == 1 ? 1 : conv_param_.K[SPATIAL_DIM - 2]; int S = conv_param_.K[SPATIAL_DIM - 1]; int G = conv_param_.G; int IC_per_G = conv_param_.IC / G; @@ -118,8 +118,8 @@ inline int PackWeightMatrixForGConv::packed_index_( int c) { // Get the full dimensions // Can't use T as varname because T is a template parameter. - int F = SPATIAL_DIM == 2 ? 1 : conv_param_.K[SPATIAL_DIM - 3]; - int R = conv_param_.K[SPATIAL_DIM - 2]; + int F = SPATIAL_DIM <= 2 ? 1 : conv_param_.K[SPATIAL_DIM - 3]; + int R = SPATIAL_DIM == 1 ? 1 : conv_param_.K[SPATIAL_DIM - 2]; int S = conv_param_.K[SPATIAL_DIM - 1]; int G = conv_param_.G; int IC_per_G = conv_param_.IC / G; @@ -159,8 +159,8 @@ void PackWeightMatrixForGConv::pack_unpack_( T* dst, bool ispack) { // Can't use T as varname because T is a template parameter. - int F = SPATIAL_DIM == 2 ? 1 : conv_param_.K[SPATIAL_DIM - 3]; - int R = conv_param_.K[SPATIAL_DIM - 2]; + int F = SPATIAL_DIM <= 2 ? 1 : conv_param_.K[SPATIAL_DIM - 3]; + int R = SPATIAL_DIM == 1 ? 1 : conv_param_.K[SPATIAL_DIM - 2]; int S = conv_param_.K[SPATIAL_DIM - 1]; int G = conv_param_.G; int IC_per_G = conv_param_.IC / G; @@ -257,6 +257,8 @@ void PackWeightMatrixForGConv::unpack(T* origin_buf) { pack_unpack_(const_cast(pdata_), origin_buf, false); } +template class FBGEMM_API PackWeightMatrixForGConv; +template class FBGEMM_API PackWeightMatrixForGConv; template class FBGEMM_API PackWeightMatrixForGConv; template class FBGEMM_API PackWeightMatrixForGConv; template class FBGEMM_API PackWeightMatrixForGConv; diff --git a/src/PackWeightsForConv.cc b/src/PackWeightsForConv.cc index 3d673e9c63..0d830ddd3c 100644 --- a/src/PackWeightsForConv.cc +++ b/src/PackWeightsForConv.cc @@ -18,15 +18,12 @@ PackWeightsForConv::PackWeightsForConv( const T* sdata, const BlockingFactors* blocking_params) : conv_param_(conv_p) { - static_assert( - SPATIAL_DIM == 2 || SPATIAL_DIM == 3, - "Only 2D and 3D convolutions are supported"); // Note: The following logic should *exactly* match with what we have in // FbgemmConv.cc switch (ConvFastPath(conv_p)) { case optimized_conv_t::depthwise: { - const int kernel_d = SPATIAL_DIM == 2 ? 1 : conv_p.K[0]; - const int kernel_h = conv_p.K[SPATIAL_DIM - 2]; + const int kernel_d = SPATIAL_DIM <= 2 ? 1 : conv_p.K[0]; + const int kernel_h = SPATIAL_DIM == 1 ? 1 : conv_p.K[SPATIAL_DIM - 2]; const int kernel_w = conv_p.K[SPATIAL_DIM - 1]; W_dw_packed_ = std::make_shared( conv_p.OC, kernel_d * kernel_h * kernel_w, sdata); @@ -40,8 +37,8 @@ PackWeightsForConv::PackWeightsForConv( } case optimized_conv_t::pointwise: { const int N = conv_p.OC / conv_p.G; - const int kernel_d = SPATIAL_DIM == 2 ? 1 : conv_p.K[0]; - const int kernel_h = conv_p.K[SPATIAL_DIM - 2]; + const int kernel_d = SPATIAL_DIM <= 2 ? 1 : conv_p.K[0]; + const int kernel_h = SPATIAL_DIM == 1 ? 1 : conv_p.K[SPATIAL_DIM - 2]; const int kernel_w = conv_p.K[SPATIAL_DIM - 1]; const int K = kernel_d * kernel_h * kernel_w * conv_p.IC; W_pointwise_packed_ = std::make_shared>( @@ -55,10 +52,13 @@ PackWeightsForConv::PackWeightsForConv( blocking_params); break; } + case optimized_conv_t::fastpath1d: { + break; + } case optimized_conv_t::im2col: { const int N = conv_p.OC / conv_p.G; - const int kernel_d = SPATIAL_DIM == 2 ? 
1 : conv_p.K[0]; - const int kernel_h = conv_p.K[SPATIAL_DIM - 2]; + const int kernel_d = SPATIAL_DIM <= 2 ? 1 : conv_p.K[0]; + const int kernel_h = SPATIAL_DIM == 1 ? 1 : conv_p.K[SPATIAL_DIM - 2]; const int kernel_w = conv_p.K[SPATIAL_DIM - 1]; const int K = kernel_d * kernel_h * kernel_w * conv_p.IC; W_im2col_packed_ = std::make_shared>( @@ -181,6 +181,7 @@ std::string PackWeightsForConv::mismatchingParams( return msg; } +template class PackWeightsForConv<1, int8_t, int32_t>; template class PackWeightsForConv<2, int8_t, int32_t>; template class PackWeightsForConv<3, int8_t, int32_t>; diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc index 991c752b31..0e7e3c8d0e 100644 --- a/src/RefImplementations.cc +++ b/src/RefImplementations.cc @@ -351,6 +351,50 @@ int32_t clip_16bit(int32_t x) { } } +/* Imitate the Im2Col function + * from caffe2/utils/math_cpu.cc + * NWC StorageOrder/Layout + * A: NWC: NW_0 x C_0 + * Ao: NWC: NW_1 x G RS C_0/G + */ +template <> +FBGEMM_API void im2col_ref( + const conv_param_t<1>& conv_p, + const uint8_t* A, + int32_t A_zero_point, + uint8_t* Ao) { + int IC = conv_p.IC; + int G = conv_p.G; + assert(IC % G == 0); + array IN_DIM = conv_p.IN_DIM; + array OUT_DIM = conv_p.OUT_DIM; + array K = conv_p.K; + + for (int n = 0; n < conv_p.MB; ++n) { + for (int w = 0; w < OUT_DIM[0]; ++w) { + for (int s = 0; s < K[0]; ++s) { + int w_in = + -conv_p.pad[0] + w * conv_p.stride[0] + s * conv_p.dilation[0]; + if (w_in < 0 || w_in >= IN_DIM[0]) { + for (int g = 0; g < G; ++g) { + memset( + Ao + (((n * OUT_DIM[0] + w) * G + g) * K[0] + s) * (IC / G), + A_zero_point, + sizeof(uint8_t) * (IC / G)); + } + } else { + for (int g = 0; g < G; ++g) { + memcpy( + Ao + (((n * OUT_DIM[0] + w) * G + g) * K[0] + s) * (IC / G), + A + (n * IN_DIM[0] + w_in) * IC + g * (IC / G), + sizeof(uint8_t) * (IC / G)); + } + } + } // for each s + } // for each w + } // for each n +} + /* Imitate the Im2Col function * from caffe2/utils/math_cpu.cc * NHWC StorageOrder/Layout @@ -501,6 +545,51 @@ FBGEMM_API void im2col_ref( } // for each n } +// 1D Conv +template <> +FBGEMM_API void conv_ref( + const conv_param_t<1>& conv_p, + const uint8_t* A, + int32_t A_zero_point, + const int8_t* B, + int32_t* C) { + // A is assumed to be (N Lin Cin) + // B is assumed to be (G K Cin/G Cout/G) + // C is assumed to be (N Lout Cout) + int IC = conv_p.IC; + int OC = conv_p.OC; + int G = conv_p.G; + assert(IC % G == 0); + assert(OC % G == 0); + array IN_DIM = conv_p.IN_DIM; + array OUT_DIM = conv_p.OUT_DIM; + array K = conv_p.K; + + for (int n = 0; n < conv_p.MB; ++n) { + for (int w = 0; w < OUT_DIM[0]; ++w) { + for (int g = 0; g < G; ++g) { + for (int m = 0; m < OC / G; ++m) { + int sum = 0; + for (int r = 0; r < K[0]; ++r) { + int w_in = + -conv_p.pad[0] + w * conv_p.stride[0] + r * conv_p.dilation[0]; + for (int c = 0; c < IC / G; ++c) { + int a = w_in < 0 || w_in >= IN_DIM[0] + ? 
A_zero_point + : A[(n * IN_DIM[0] + w_in) * IC + g * (IC / G) + c]; + int b = + B[((g * K[0] + r) * (IC / G) + c) * (OC / G) + + m]; // G K (Cin / G) (Cout / G) after transpose + sum += a * b; + } // for each c + } // for each r + C[(n * OUT_DIM[0] + w) * OC + g * (OC / G) + m] = sum; + } // for each w + } // for each m + } // for each group + } // for each n +} + // 2D Conv template <> FBGEMM_API void conv_ref( @@ -628,9 +717,6 @@ void transposeConvWeights( int IC_per_G = conv_p.IC / conv_p.G; int OC_per_G = conv_p.OC / conv_p.G; - assert( - (SPATIAL_DIM == 3 || SPATIAL_DIM == 2) && - "Only 2D and 3D convolutions are supported"); int filter_prod = std::accumulate( conv_p.K.begin(), conv_p.K.begin() + SPATIAL_DIM, @@ -1192,6 +1278,11 @@ int rowwise_sparse_adagrad_fused_ref( return current == index_size; } +template FBGEMM_API void transposeConvWeights( + const conv_param_t<1>& conv_p, + const std::int8_t* src, + std::int8_t* dest); + template FBGEMM_API void transposeConvWeights( const conv_param_t<2>& conv_p, const std::int8_t* src, diff --git a/test/GConvTest.cc b/test/GConvTest.cc index 737f652416..ae22f108f8 100644 --- a/test/GConvTest.cc +++ b/test/GConvTest.cc @@ -25,8 +25,9 @@ using namespace std; using namespace fbgemm; -vector transposeVals{matrix_op_t::NoTranspose, - matrix_op_t::Transpose}; +vector transposeVals{ + matrix_op_t::NoTranspose, + matrix_op_t::Transpose}; vector qGranularityVals{ QuantizationGranularity::TENSOR, @@ -271,16 +272,16 @@ void runRequantizeTest(matrix_op_t /* unused */, bool a_symmetric, bool b_symmetric) { vector> shapes(GetShapes_()); for (auto conv_p : shapes) { - int T = SPATIAL_DIM == 2 ? 1 : conv_p.K[SPATIAL_DIM - 3]; - int R = conv_p.K[SPATIAL_DIM - 2]; + int T = SPATIAL_DIM <= 2 ? 1 : conv_p.K[SPATIAL_DIM - 3]; + int R = SPATIAL_DIM == 1 ? 1 : conv_p.K[SPATIAL_DIM - 2]; int S = conv_p.K[SPATIAL_DIM - 1]; int G = conv_p.G; int OC = conv_p.OC; - int IT = SPATIAL_DIM == 2 ? 1 : conv_p.IN_DIM[SPATIAL_DIM - 3]; - int IH = conv_p.IN_DIM[SPATIAL_DIM - 2]; + int IT = SPATIAL_DIM <= 2 ? 1 : conv_p.IN_DIM[SPATIAL_DIM - 3]; + int IH = SPATIAL_DIM == 1 ? 1 : conv_p.IN_DIM[SPATIAL_DIM - 2]; int IW = conv_p.IN_DIM[SPATIAL_DIM - 1]; - int OT = SPATIAL_DIM == 2 ? 1 : conv_p.OUT_DIM[SPATIAL_DIM - 3]; - int OH = conv_p.OUT_DIM[SPATIAL_DIM - 2]; + int OT = SPATIAL_DIM <= 2 ? 1 : conv_p.OUT_DIM[SPATIAL_DIM - 3]; + int OH = SPATIAL_DIM == 1 ? 1 : conv_p.OUT_DIM[SPATIAL_DIM - 2]; int OW = conv_p.OUT_DIM[SPATIAL_DIM - 1]; int IC_per_G = conv_p.IC / conv_p.G; int OC_per_G = conv_p.OC / conv_p.G; @@ -591,8 +592,8 @@ void runPackUnpackTest(matrix_op_t btrans) { vector> shapes(GetShapes_()); for (auto conv_p : shapes) { - int T = SPATIAL_DIM == 2 ? 1 : conv_p.K[SPATIAL_DIM - 3]; - int R = conv_p.K[SPATIAL_DIM - 2]; + int T = SPATIAL_DIM <= 2 ? 1 : conv_p.K[SPATIAL_DIM - 3]; + int R = SPATIAL_DIM == 1 ? 
1 : conv_p.K[SPATIAL_DIM - 2]; int S = conv_p.K[SPATIAL_DIM - 1]; int IC_per_G = conv_p.IC / conv_p.G; int OC_per_G = conv_p.OC / conv_p.G; diff --git a/test/UniConvTest.cc b/test/UniConvTest.cc index bfc3e516b9..79348e74ae 100644 --- a/test/UniConvTest.cc +++ b/test/UniConvTest.cc @@ -26,7 +26,24 @@ vector qGranularityVals{ QuantizationGranularity::OUT_CHANNEL}; // clang-format off -static vector> GetShapes_() { +template +static typename std::enable_if>>::type +GetShapes_() { + vector> shapes = { + // MB, IC, OC, {IW}, G, {KW}, {stride_w}, {pad_l,pad_r}, {dilation_w} + // Regular + conv_param_t<1>(1, 16, 16, {30}, 1, {3}, {1}, {1, 1}), + conv_param_t<1>(1, 32, 32, {30}, 1, {3}, {1}, {1, 1}), + conv_param_t<1>(1, 32, 16, {30}, 1, {3}, {1}, {0, 0}, {2}), + }; + return shapes; +} +// clang-format on + +// clang-format off +template +static typename std::enable_if>>::type +GetShapes_() { vector> shapes = { // MB, IC, OC, {IH, IW}, G, {KH, KW}, {stride_h, stride_w}, {pad_t, pad_l, // pad_b, pad_r}, {dilation_h, dilation_w} @@ -130,6 +147,64 @@ TEST_P(uniConvTest, packingTest) { int MB, IC, OC, IT, IH, IW, G, kernel, stride, pad; tie(MB, IC, OC, IT, IH, IW, G, kernel, stride, pad) = GetParam(); + conv_param_t<1> conv_p_1d( + MB, IC, OC, {IW}, G, {kernel}, {stride}, {pad, pad}); + + int kernel_dim_1d = kernel; + aligned_vector Bint8_1d( + kernel_dim_1d * conv_p_1d.IC * (conv_p_1d.OC / conv_p_1d.G)); + PackWeightsForConv<1> packedB_1D(conv_p_1d, Bint8_1d.data()); + + switch (ConvFastPath<1, int32_t>(conv_p_1d)) { + case optimized_conv_t::depthwise: { + ASSERT_EQ(packedB_1D.getPackedWForIm2col(), nullptr) + << "im2col packed matrix should be null"; + ASSERT_EQ(packedB_1D.getPackedWForGroupwise(), nullptr) + << "groupwise packed matrix should be null"; + ASSERT_EQ(packedB_1D.getPackedWForPointwise(), nullptr) + << "pointwise packed matrix should be null"; + ASSERT_NE(packedB_1D.getPackedWForDepthwise(), nullptr) + << "depthwise packed matrix is null"; + break; + } + case optimized_conv_t::groupwise: { + ASSERT_EQ(packedB_1D.getPackedWForIm2col(), nullptr) + << "im2col packed matrix should be null"; + ASSERT_EQ(packedB_1D.getPackedWForDepthwise(), nullptr) + << "depthwise packed matrix should be null"; + ASSERT_EQ(packedB_1D.getPackedWForPointwise(), nullptr) + << "pointwise packed matrix should be null"; + ASSERT_NE(packedB_1D.getPackedWForGroupwise(), nullptr) + << "Groupwise packed matrix is null"; + break; + } + case optimized_conv_t::pointwise: { + ASSERT_EQ(packedB_1D.getPackedWForIm2col(), nullptr) + << "im2col packed matrix should be null"; + ASSERT_EQ(packedB_1D.getPackedWForDepthwise(), nullptr) + << "depthwise packed matrix should null"; + ASSERT_EQ(packedB_1D.getPackedWForGroupwise(), nullptr) + << "Groupwise packed matrix should be null"; + ASSERT_NE(packedB_1D.getPackedWForPointwise(), nullptr) + << "pointwise packed matrix is null"; + break; + } + case optimized_conv_t::fastpath1d: { + break; + } + case optimized_conv_t::im2col: { + ASSERT_EQ(packedB_1D.getPackedWForDepthwise(), nullptr) + << "depthwise packed matrix should be null"; + ASSERT_EQ(packedB_1D.getPackedWForGroupwise(), nullptr) + << "groupwise packed matrix should be null"; + ASSERT_EQ(packedB_1D.getPackedWForPointwise(), nullptr) + << "pointwise packed matrix should be null"; + ASSERT_NE(packedB_1D.getPackedWForIm2col(), nullptr) + << "im2col packed matrix is null"; + break; + } + } + conv_param_t<2> conv_p_2d( MB, IC, @@ -179,6 +254,9 @@ TEST_P(uniConvTest, packingTest) { << "pointwise packed matrix is null"; break; } 
+ case optimized_conv_t::fastpath1d: { + break; + } case optimized_conv_t::im2col: { ASSERT_EQ(packedB_2D.getPackedWForDepthwise(), nullptr) << "depthwise packed matrix should be null"; @@ -241,6 +319,9 @@ TEST_P(uniConvTest, packingTest) { << "pointwise packed matrix is null"; break; } + case optimized_conv_t::fastpath1d: { + break; + } case optimized_conv_t::im2col: { ASSERT_EQ(packedB_3D.getPackedWForDepthwise(), nullptr) << "depthwise packed matrix should be null"; @@ -262,6 +343,23 @@ TEST_P(uniConvTest, packUnpackTest) { int MB, IC, OC, IT, IH, IW, G, kernel, stride, pad; tie(MB, IC, OC, IT, IH, IW, G, kernel, stride, pad) = GetParam(); + conv_param_t<1> conv_p_1d( + MB, IC, OC, {IW}, G, {kernel}, {stride}, {pad, pad}); + + int kernel_dim_1d = kernel; + + aligned_vector Bint8_1d( + kernel_dim_1d * conv_p_1d.IC * (conv_p_1d.OC / conv_p_1d.G)); + aligned_vector Bint8_1d_unpacked( + kernel_dim_1d * conv_p_1d.IC * (conv_p_1d.OC / conv_p_1d.G)); + + PackWeightsForConv<1> packedB_1D(conv_p_1d, Bint8_1d.data()); + + packedB_1D.unpack(Bint8_1d_unpacked.data()); + + ASSERT_EQ(Bint8_1d, Bint8_1d_unpacked) + << "Original and unpacked data elements are not the same [1D]"; + conv_param_t<2> conv_p_2d( MB, IC, @@ -399,27 +497,30 @@ TEST(uniConvTest, cornerCases) { * @brief Unit test for uint8 activations, int8 weights, and 32-bit * accumulation. Output processing: requantization -> nothing */ -TEST_P(UniConvQGranTest, requantizeTest) { - vector> shapes(GetShapes_()); - QuantizationGranularity q_granularity; - bool a_symmetric, b_symmetric; - bool test_bias, test_float_bias; - tie(q_granularity, a_symmetric, b_symmetric, test_bias, test_float_bias) = - GetParam(); + +template +void runRequantizeTest( + QuantizationGranularity q_granularity, + bool a_symmetric, + bool b_symmetric, + bool test_bias, + bool test_float_bias) { + vector> shapes(GetShapes_()); for (auto conv_p : shapes) { - int R = conv_p.K[0]; - int S = conv_p.K[1]; + int R = SPATIAL_DIM == 1 ? 1 : conv_p.K[SPATIAL_DIM - 2]; + int S = conv_p.K[SPATIAL_DIM - 1]; int G = conv_p.G; int OC = conv_p.OC; - int OH = conv_p.OUT_DIM[0]; - int OW = conv_p.OUT_DIM[1]; + int OH = SPATIAL_DIM == 1 ? 1 : conv_p.OUT_DIM[SPATIAL_DIM - 2]; + int OW = conv_p.OUT_DIM[SPATIAL_DIM - 1]; int IC_per_G = conv_p.IC / conv_p.G; int OC_per_G = conv_p.OC / conv_p.G; + int IH = SPATIAL_DIM == 1 ? 1 : conv_p.IN_DIM[SPATIAL_DIM - 2]; + int IW = conv_p.IN_DIM[SPATIAL_DIM - 1]; // activations - aligned_vector Aint8( - conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0); + aligned_vector Aint8(conv_p.MB * IH * IW * conv_p.IC, 0); // weights // The weight matrix is in layout G K/G (R S C/G) @@ -550,7 +651,7 @@ TEST_P(UniConvQGranTest, requantizeTest) { ncols_per_quant_group); } - PackWeightsForConv<2> packedWeights(conv_p, Bint8.data()); + PackWeightsForConv packedWeights(conv_p, Bint8.data()); // TODO: Uncomment once we support multiple threads in fbgemmGroupwiseConv // #ifdef _OPENMP @@ -724,3 +825,16 @@ TEST_P(UniConvQGranTest, requantizeTest) { static_cast(0)); } // for each shape } + +TEST_P(UniConvQGranTest, requantizeTest) { + QuantizationGranularity q_granularity; + bool a_symmetric, b_symmetric; + bool test_bias, test_float_bias; + tie(q_granularity, a_symmetric, b_symmetric, test_bias, test_float_bias) = + GetParam(); + + runRequantizeTest<1>( + q_granularity, a_symmetric, b_symmetric, test_bias, test_float_bias); + runRequantizeTest<2>( + q_granularity, a_symmetric, b_symmetric, test_bias, test_float_bias); +}
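With take1DFastPath() still returning false, the fastpath1d cases added to fbgemmConv and PackWeightsForConv are placeholders: a 1D convolution is routed to the existing depthwise or pointwise paths when those conditions match, and otherwise falls back to the im2col-based implementation. A minimal routing check for the first new 1D test shape (a sketch assuming ConvFastPath is reachable through the public fbgemm/Fbgemm.h header, as the tests use it):

#include <cstdint>
#include <iostream>
#include "fbgemm/Fbgemm.h"

int main() {
  // Same as the first 1D shape in UniConvTest: 16 -> 16 channels, IW = 30,
  // KW = 3, stride 1, pad 1/1, dilation 1.
  fbgemm::conv_param_t<1> p(1, 16, 16, {30}, 1, {3}, {1}, {1, 1}, {1});
  const auto path = fbgemm::ConvFastPath<1, std::int32_t>(p);
  // Not depthwise (G != IC), not pointwise (KW != 1), groupwise is disabled
  // for 1D, and take1DFastPath() is a stub, so this lands on im2col.
  std::cout << (path == fbgemm::optimized_conv_t::im2col ? "im2col" : "other")
            << std::endl;
  return 0;
}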