Add conv_1d (pytorch#369)
Summary:
Pull Request resolved: pytorch#369

1. Add Conv 1D implementation. Currently, 1D still follows the 2D approach, using packing and GEMM ops; only the im2col path is supported for now.
2. Add test case in test/UniConvTest.cc
3. Add test case in bench/ConvUnifiedBenchmark.cc
4. TODO: implement take1DFastPath (a dedicated 1D fast path; it currently always returns false).

Reviewed By: dskhudia

Differential Revision: D21460180

fbshipit-source-id: 992f7b4dc40e9878c8951b4dfd636fe7585c0a8f
Hongzhang Shan authored and facebook-github-bot committed May 15, 2020
1 parent 46981b8 commit 17b31be
Showing 14 changed files with 405 additions and 85 deletions.
37 changes: 32 additions & 5 deletions bench/ConvUnifiedBenchmark.cc
@@ -25,6 +25,17 @@ using namespace std;
using namespace fbgemm;

// clang-format off
// 1D conv shapes
vector<conv_param_t<1>> shapes_1d = {
// MB, IC, OC, IW, G, KW, stride_w, pad_w_left, pad_w_right
// regular
conv_param_t<1>(1, 600, 100, {1}, 1, {3}, {1}, {2, 2}),
conv_param_t<1>(1, 600, 100, {2}, 1, {3}, {1}, {2, 2}),
conv_param_t<1>(1, 600, 100, {3}, 1, {3}, {1}, {2, 2}),
conv_param_t<1>(1, 200, 162, {1}, 1, {3}, {1}, {2, 2}),
conv_param_t<1>(1, 600, 100, {4}, 1, {3}, {1}, {2, 2})
};
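
As a reader-side sanity check (not part of the diff): with G = 1, the im2col lowering the summary describes turns each of these convolutions into a single GEMM. For the last shape above:

// conv_param_t<1>(MB=1, IC=600, OC=100, {IW=4}, G=1, {KW=3}, {stride_w=1}, {pad=2,2})
int OW = (4 + 2 + 2 - 3) / 1 + 1; // output width = 6
int M = 1 * OW;                   // GEMM rows:  MB * OW = 6
int K = 3 * 600;                  // GEMM depth: KW * IC = 1800
int N = 100;                      // GEMM cols:  OC      = 100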

// 2D conv shapes
vector<conv_param_t<2>> shapes_2d = {
// MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w,
@@ -119,23 +130,38 @@ void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) {
if (SPATIAL_DIM == 3) {
header += "IT, ";
}
header += "IH, IW, G, ";
if (SPATIAL_DIM > 1) {
header += "IH, ";
}
header += "IW, G, ";
if (SPATIAL_DIM == 3) {
header += "KT, ";
}
header += "KH, KW, ";
if (SPATIAL_DIM > 1) {
header += "KH, ";
}
header += "KW, ";
if (SPATIAL_DIM == 3) {
header += "stride_t, ";
}
header += "stride_h, stride_w, ";
if (SPATIAL_DIM > 1) {
header += "stride_h, ";
}
header += "stride_w, ";
if (SPATIAL_DIM == 3) {
header += "pad_t, ";
}
header += "pad_h, pad_w, ";
if (SPATIAL_DIM > 1) {
header += "pad_h, ";
}
header += "pad_w, ";
if (SPATIAL_DIM == 3) {
header += "dilation_t, ";
}
header += "dilation_h, dilation_w, ";
if (SPATIAL_DIM > 1) {
header += "dilation_h, ";
}
header += "dilation_w, ";

header += "Type, M, N, K, ";

@@ -375,6 +401,7 @@ int main() {
}
#endif
// performance_test<int16_t>();
performance_test<1, int32_t>(shapes_1d);
performance_test<2, int32_t>(shapes_2d);
performance_test<3, int32_t>(shapes_3d);
return 0;
8 changes: 7 additions & 1 deletion include/fbgemm/Utils.h
@@ -46,7 +46,13 @@ enum class inst_set_t { anyarch, avx2, avx512, avx512_ymm, avx512_vnni };
/**
* @brief Typed enum for optimized paths for convolutions
*/
enum class optimized_conv_t { depthwise, groupwise, pointwise, im2col };
enum class optimized_conv_t {
depthwise,
groupwise,
pointwise,
fastpath1d,
im2col
};

/**
* @brief Typed enum for implementation type.
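
A new enumerator means every switch over optimized_conv_t needs a matching case. A minimal sketch of the dispatch shape (the handler comments are placeholders; the real switch is in src/FbgemmConv.cc below):

switch (ConvFastPath<SPATIAL_DIM, ACC_T>(conv_p)) {
  case optimized_conv_t::depthwise:  /* depthwise kernels */ break;
  case optimized_conv_t::groupwise:  /* groupwise kernels */ break;
  case optimized_conv_t::pointwise:  /* 1x1 conv as plain GEMM */ break;
  case optimized_conv_t::fastpath1d: /* reserved; no 1D kernel yet */ break;
  case optimized_conv_t::im2col:     /* generic fallback */ break;
}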
3 changes: 3 additions & 0 deletions src/ExecuteKernelU8S8.cc
@@ -388,6 +388,7 @@ INSTANTIATE_REQUANT_ACC_T(PackAWithRowOffset);
ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);

#define INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM(ACC_T, RELU) \
INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, 1); \
INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, 2); \
INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, 3);

@@ -449,6 +450,7 @@ INSTANTIATE_REQUANT_FLOAT_RELU(PackAWithQuantRowOffset);
ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);

#define INSTANTIATE_REQUANT_FLOAT_IM2COL_SPATIAL_DIM(ACC_T, RELU) \
INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, 1); \
INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, 2); \
INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, 3);

@@ -546,6 +548,7 @@ INSTANTIATE_MEMCPY_ACC_T(PackAWithRowOffset);
memCopy<>>;

#define INSTANTIATE_MEMCPY_IM2COL_SPATIAL_DIM(ACC_T) \
INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, 1); \
INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, 2); \
INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, 3);

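
These one-line additions are explicit template instantiations: the kernel templates are defined in .cc files, so each SPATIAL_DIM used by callers must be instantiated there, or 1D callers fail at link time. A hypothetical minimal analogue of the pattern (runConv is an illustrative name, not FBGEMM API):

// header: declaration only
template <int SPATIAL_DIM>
void runConv(const conv_param_t<SPATIAL_DIM>& conv_p);

// .cc: definition plus explicit instantiations; before this commit only
// <2> and <3> existed, so conv_param_t<1> callers would not link.
template void runConv<1>(const conv_param_t<1>&);
template void runConv<2>(const conv_param_t<2>&);
template void runConv<3>(const conv_param_t<3>&);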
8 changes: 7 additions & 1 deletion src/Fbgemm.cc
@@ -206,7 +206,9 @@ void fbgemmPacked(

template <int SPATIAL_DIM>
bool fbgemmOptimizedGConv(const conv_param_t<SPATIAL_DIM>& conv_p) {
static_assert(SPATIAL_DIM >= 2, "Unsupported spatial dims");

if (SPATIAL_DIM == 1) return false;

int C_per_G = conv_p.IC / conv_p.G;
int K_per_G = conv_p.OC / conv_p.G;

@@ -247,6 +249,7 @@ bool fbgemmOptimizedGConv(const conv_param_t<SPATIAL_DIM>& conv_p) {
std::bind(areEqual, std::placeholders::_1, 2)));
}

template FBGEMM_API bool fbgemmOptimizedGConv(const conv_param_t<1>& conv_p);
template FBGEMM_API bool fbgemmOptimizedGConv(const conv_param_t<2>& conv_p);
template FBGEMM_API bool fbgemmOptimizedGConv(const conv_param_t<3>& conv_p);

@@ -383,6 +386,7 @@ INSTANTIATE_ACC_T(PackAWithRowOffset);
ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);

#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
INSTANTIATE_Q_GRANS(ACC_T, RELU, 1); \
INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \
INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);

@@ -451,6 +455,7 @@ INSTANTIATE_RELU(PackAWithQuantRowOffset);
ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);

#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
INSTANTIATE_Q_GRANS(ACC_T, RELU, 1); \
INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \
INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);

@@ -588,6 +593,7 @@ INSTANTIATE_ACC_T(PackAWithRowOffset);
const BlockingFactors* blocking_params);

#define INSTANTIATE_SPATIAL_DIM(ACC_T) \
INSTANTIATE_BASE(ACC_T, 1); \
INSTANTIATE_BASE(ACC_T, 2); \
INSTANTIATE_BASE(ACC_T, 3);

20 changes: 16 additions & 4 deletions src/FbgemmConv.cc
@@ -49,6 +49,11 @@ bool takePointWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) {
std::accumulate(conv_p.pad.begin(), conv_p.pad.end(), 0) == 0;
}

template <int SPATIAL_DIM>
bool take1DFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) {
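// Stub for the planned 1D fast path (summary item 4): always false for
// now, so 1D convolutions fall through to the im2col path.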
return false;
}

template <int SPATIAL_DIM, typename ACC_T>
optimized_conv_t ConvFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) {
if (takeDepthWiseFastPath<SPATIAL_DIM, ACC_T>(conv_p)) {
@@ -57,6 +62,8 @@ optimized_conv_t ConvFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) {
return optimized_conv_t::groupwise;
} else if (takePointWiseFastPath<SPATIAL_DIM>(conv_p)) {
return optimized_conv_t::pointwise;
} else if (take1DFastPath<SPATIAL_DIM>(conv_p)) {
return optimized_conv_t::fastpath1d;
} else {
return optimized_conv_t::im2col;
}
@@ -73,10 +80,6 @@ int fbgemmConv(
int thread_id,
int num_threads,
const BlockingFactors* blocking_params) {
static_assert(
SPATIAL_DIM == 2 || SPATIAL_DIM == 3,
"Only 2D and 3D convolutions are supported");

if (!packed_weights.isPackingCompliant(conv_p)) {
std::string msg =
"[FBGEMM_CONV_ERROR] Convolution parameters "
@@ -317,6 +320,9 @@ int fbgemmConv(
blocking_params);
break;
}
case optimized_conv_t::fastpath1d: {
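// No dedicated 1D kernel yet; take1DFastPath() always returns false,
// so this branch is currently unreachable.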
break;
}
case optimized_conv_t::im2col: {
// All other convolutions go through im2col-based implementation
// std::cout << "Im2col path" << std::endl;
@@ -391,6 +397,7 @@ int fbgemmConv(
INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, int32_t);

#define INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, RELU) \
INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 1); \
INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 2); \
INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 3);

@@ -420,10 +427,15 @@ template bool takeDepthWiseFastPath<2, std::int16_t>(
template bool takeDepthWiseFastPath<3, std::int16_t>(
const conv_param_t<3>& conv_p);

template FBGEMM_API optimized_conv_t
ConvFastPath<1, std::int32_t>(const conv_param_t<1>& conv_p);
template FBGEMM_API optimized_conv_t
ConvFastPath<2, std::int32_t>(const conv_param_t<2>& conv_p);
template FBGEMM_API optimized_conv_t
ConvFastPath<3, std::int32_t>(const conv_param_t<3>& conv_p);

template FBGEMM_API optimized_conv_t
ConvFastPath<1, std::int16_t>(const conv_param_t<1>& conv_p);
template FBGEMM_API optimized_conv_t
ConvFastPath<2, std::int16_t>(const conv_param_t<2>& conv_p);
template FBGEMM_API optimized_conv_t
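
With the static_assert gone, a 1D shape now routes through the normal dispatch. A minimal reader-side sketch of what a caller can rely on (assumes ConvFastPath is reachable via the public Fbgemm.h header, and reuses one of the benchmark shapes):

#include "fbgemm/Fbgemm.h"
#include <cassert>
using namespace fbgemm;

conv_param_t<1> conv_p(1, 600, 100, {4}, 1, {3}, {1}, {2, 2});
// G != IC (not depthwise), KW != 1 (not pointwise), and both the groupwise
// check and the 1D fast path return false, so dispatch lands on im2col:
assert(ConvFastPath<1, std::int32_t>(conv_p) == optimized_conv_t::im2col);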
42 changes: 25 additions & 17 deletions src/GroupwiseConvAcc32Avx2.cc
@@ -1453,11 +1453,11 @@ void fbgemmGroupwiseConv(
}

int MB = conv_param.MB;
int OT = SPATIAL_DIM == 2 ? 1 : conv_param.OUT_DIM[SPATIAL_DIM - 3];
int OH = conv_param.OUT_DIM[SPATIAL_DIM - 2];
int OT = SPATIAL_DIM <= 2 ? 1 : conv_param.OUT_DIM[SPATIAL_DIM - 3];
int OH = SPATIAL_DIM == 1 ? 1 : conv_param.OUT_DIM[SPATIAL_DIM - 2];
int OW = conv_param.OUT_DIM[SPATIAL_DIM - 1];
int T = SPATIAL_DIM == 2 ? 1 : conv_param.K[SPATIAL_DIM - 3];
int R = conv_param.K[SPATIAL_DIM - 2];
int T = SPATIAL_DIM <= 2 ? 1 : conv_param.K[SPATIAL_DIM - 3];
int R = SPATIAL_DIM == 1 ? 1 : conv_param.K[SPATIAL_DIM - 2];
int S = conv_param.K[SPATIAL_DIM - 1];
int G = conv_param.G;
int OC = conv_param.OC;
Expand All @@ -1466,8 +1466,8 @@ void fbgemmGroupwiseConv(
int C_per_G = conv_param.IC / G;
int OH_OW = OH * OW;
int OT_OH_OW = OT * OH * OW;
int IT = SPATIAL_DIM == 2 ? 1 : conv_param.IN_DIM[SPATIAL_DIM - 3];
int IH = conv_param.IN_DIM[SPATIAL_DIM - 2];
int IT = SPATIAL_DIM <= 2 ? 1 : conv_param.IN_DIM[SPATIAL_DIM - 3];
int IH = SPATIAL_DIM == 1 ? 1 : conv_param.IN_DIM[SPATIAL_DIM - 2];
int IW = conv_param.IN_DIM[SPATIAL_DIM - 1];
int IH_IW = IH * IW;
int IT_IH_IW = IT * IH * IW;
Expand All @@ -1479,6 +1479,9 @@ void fbgemmGroupwiseConv(
int G_together = PackWeightMatrixForGConv<int8_t, int32_t, SPATIAL_DIM>::
numOfGroupsTogether(conv_param);

if (SPATIAL_DIM == 1) {
throw std::runtime_error("Groupwise 1D not implemented!");
}
if (SPATIAL_DIM == 2) {
// Parallelization:
int batch_start = 0;
@@ -1558,10 +1561,11 @@ void fbgemmGroupwiseConv(
rowOffsetBuf_start_group);

const int32_t* inp = out_start_group;
block_type_t block{i * OT_OH_OW + oh_start * OW,
(oh_end - oh_start) * OW,
g * K_per_G,
G_together * K_per_G};
block_type_t block{
i * OT_OH_OW + oh_start * OW,
(oh_end - oh_start) * OW,
g * K_per_G,
G_together * K_per_G};
int ld_out = G * K_per_G;
int ld_in = G * K_per_G;

@@ -1700,10 +1704,11 @@ void fbgemmGroupwiseConv(
}

const int32_t* inp = out_start_t;
block_type_t block{i * OT_OH_OW + oh_start * OW,
(oh_end - oh_start) * OW,
g * K_per_G,
G_together * K_per_G};
block_type_t block{
i * OT_OH_OW + oh_start * OW,
(oh_end - oh_start) * OW,
g * K_per_G,
G_together * K_per_G};
int ld_out = G * K_per_G;
int ld_in = G * K_per_G;

@@ -1729,9 +1734,9 @@ int rowOffsetBufferSizeGConv(const conv_param_t<SPATIAL_DIM>& conv_param) {
// row offset buffer should be able to hold row offsets for however many
// groups we process at a time.
if (cpuinfo_initialize()) {
int OT = SPATIAL_DIM == 2 ? 1 : conv_param.OUT_DIM[SPATIAL_DIM - 3];
int bufferSize = OT * conv_param.OUT_DIM[SPATIAL_DIM - 2] *
conv_param.OUT_DIM[SPATIAL_DIM - 1];
int OT = SPATIAL_DIM <= 2 ? 1 : conv_param.OUT_DIM[SPATIAL_DIM - 3];
int OH = SPATIAL_DIM == 1 ? 1 : conv_param.OUT_DIM[SPATIAL_DIM - 2];
int bufferSize = OT * OH * conv_param.OUT_DIM[SPATIAL_DIM - 1];
if (fbgemmHasAvx512Support()) {
return conv_param.MB * bufferSize * conv_param.G;
} else if (fbgemmHasAvx2Support()) {
@@ -1746,6 +1751,8 @@
}
}

template FBGEMM_API int rowOffsetBufferSizeGConv<1>(
const conv_param_t<1>& conv_param);
template FBGEMM_API int rowOffsetBufferSizeGConv<2>(
const conv_param_t<2>& conv_param);
template FBGEMM_API int rowOffsetBufferSizeGConv<3>(
@@ -1769,6 +1776,7 @@ template FBGEMM_API int rowOffsetBufferSizeGConv<3>(
INSTANTIATE_BASE(RELU, Q_GRAN, SPATIAL_DIM, int32_t);

#define INSTANTIATE_SPATIAL_DIM(RELU, Q_GRAN) \
INSTANTIATE_BIAS_T(RELU, Q_GRAN, 1); \
INSTANTIATE_BIAS_T(RELU, Q_GRAN, 2); \
INSTANTIATE_BIAS_T(RELU, Q_GRAN, 3);

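
Note that fbgemmGroupwiseConv itself still throws for SPATIAL_DIM == 1; the new rowOffsetBufferSizeGConv<1> instantiation mainly keeps generic callers linking. Its arithmetic degenerates as expected; a worked check on a hypothetical 1D shape (illustrative numbers, not from the diff):

// conv_param_t<1>(MB=1, IC=64, OC=64, {IW=10}, G=8, {KW=3}, {1}, {1, 1})
// OT = 1, OH = 1, OW = (10 + 1 + 1 - 3) / 1 + 1 = 10
// AVX-512 case: MB * (OT * OH * OW) * G = 1 * 10 * 8 = 80 row offsets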
