Skip to content

Commit

Permalink
[PyTorch Edge][QNNPack] Enable Depthwise Specific Conv3d Kernel for K…
Browse files Browse the repository at this point in the history
…ernel Size 3x3x3 (pytorch#69315)

Summary:
Pull Request resolved: pytorch#69315

Uses kernels and setup modifications from earlier diffs in this stack
ghstack-source-id: 146346780

Test Plan:
**Correctness**
- Test using QNNPack Operator-Level Test:
-- Neon Kernel: As in test plan of D32217846, all tests pass
-- SSE2 Kernel: ```buck test xplat/caffe2/aten/src/ATen/native/quantized/cpu/qnnpack:pytorch_qnnpack_test```, all tests pass
- Test by Printing Results of Model-Level Test: D32122020

**Performance**

*Operator Level tests from convolution.cc in D32217846*
||Before (V23 of D32217846, without newly added kernel)|After (V48 of D31966574, with newly added kernel)|
|--|--|--|
|depthwise 3x3x3 static|184 ms|134 ms|
|depthwise 3x3x3 runtime|181 ms|134 ms|
|depthwise 3x3x3s2 static|30 ms|22 ms|
|depthwise 3x3x3s2 runtime|30 ms|23 ms|
|depthwise 3x3x3s1x2 static|97 ms|70 ms|
|depthwise 3x3x3s1x2 runtime|96 ms|70 ms|
|depthwise 3x3x3s2x1 static|53 ms|38 ms|
|depthwise 3x3x3s2x1 runtime|53 ms|38 ms|
|depthwise 3x3x3d2 static|104 ms|74 ms|
|depthwise 3x3x3d2 runtime|103 ms|75 ms|
|depthwise 3x3x3d1x2 static|158 ms|116 ms|
|depthwise 3x3x3d1x2 runtime|157 ms|115 ms|
|depthwise 3x3x3d2x1 static|120 ms|86 ms|
|depthwise 3x3x3d2x1 runtime|120 ms|87 ms|
|depthwise 3x3x3 per channel static|182 ms|134 ms|
|depthwise 3x3x3 per channel runtime|184 ms|134 ms|
|depthwise 3x3x3s2 per channel static|30 ms|22 ms|
|depthwise 3x3x3s2 per channel runtime|31 ms|23 ms|
|depthwise 3x3x3s1x2 per channel static|95 ms|70 ms|
|depthwise 3x3x3s1x2 per channel runtime|95 ms|71 ms|
|depthwise 3x3x3s2x1 per channel static|53 ms|39 ms|
|depthwise 3x3x3s2x1 per channel runtime|55 ms|39 ms|
|depthwise 3x3x3d2 per channel static|105 ms|75 ms|
|depthwise 3x3x3d2 per channel runtime|103 ms|75 ms|
|depthwise 3x3x3d1x2 per channel static|158 ms|116 ms|
|depthwise 3x3x3d1x2 per channel runtime|158 ms|116 ms|
|depthwise 3x3x3d2x1 per channel static|118 ms|87 ms|
|depthwise 3x3x3d2x1 per channel runtime|119 ms|87 ms|

Average Change: -36.96%

(Generated with https://www.internalfb.com/intern/anp/view/?id=1371846&revision_id=291376782898627)

*Model Level Test on Synthesized Conv3d Model*

Model Details:
- 21 channels, input size: 9 x 12 x 7, kernel size: 3x3x3
- Config added in D31928710
- Model generated with https://www.internalfb.com/intern/anp/view/?id=1313660&revision_id=248658657303993

```buck run aibench:run_bench -- -b dw_conv_3d_3x3x3_big_2b.json --platform android/arm64 --framework pytorch --remote --devices Pixel-4a-11-30```

- Before (V23 of D32217846): [0.0935 ms](https://our.intern.facebook.com/intern/aibench/details/768298420366437)
- After (V48 of D31966574): [0.0665 ms](https://our.intern.facebook.com/intern/aibench/details/67271954298132)
(29% faster)

*Model Level Test on Video Model-like Inputs (provided by liyilui)*
- D33000199
- 87.5% faster

Reviewed By: kimishpatel

Differential Revision: D31966574

fbshipit-source-id: 6554a878401c1120054f6b02241456e8fb44b152
  • Loading branch information
salilsdesai authored and facebook-github-bot committed Dec 30, 2021
1 parent 3d4590d commit 9c742be
Show file tree
Hide file tree
Showing 9 changed files with 546 additions and 135 deletions.
56 changes: 56 additions & 0 deletions aten/src/ATen/native/quantized/cpu/qnnpack/src/conv-prepack.cc
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,62 @@ PrePackConvWeights::PrePackConvWeights(
(20 + sizeof(int32_t) / sizeof(uint8_t)) * c_stride,
false);
break;
case 27:
pytorch_pack_q8dw_3d_w_dilation(
kernel_depth,
kernel_height,
kernel_width,
groups,
cr,
0,
kernel_depth,
0,
kernel_height,
0,
1,
kernel,
bias,
packed_weights_,
true);
pytorch_pack_q8dw_3d_w_dilation(
kernel_depth,
kernel_height,
kernel_width,
groups,
cr,
0,
kernel_depth,
0,
kernel_height,
1,
2,
kernel,
bias,
(char*)packed_weights_ +
(kernel_depth * kernel_height +
sizeof(int32_t) / sizeof(uint8_t)) *
c_stride,
false);
pytorch_pack_q8dw_3d_w_dilation(
kernel_depth,
kernel_height,
kernel_width,
groups,
cr,
0,
kernel_depth,
0,
kernel_height,
2,
3,
kernel,
bias,
(char*)packed_weights_ +
(2 * kernel_depth * kernel_height +
sizeof(int32_t) / sizeof(uint8_t)) *
c_stride,
false);
break;
default:
PYTORCH_QNNP_UNREACHABLE;
}
Expand Down
103 changes: 92 additions & 11 deletions aten/src/ATen/native/quantized/cpu/qnnpack/src/conv-run.cc
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ static void compute_sum_rows(
block_start);
}

struct q8dwconv_context {
struct q8dwconv2d_context {
size_t groups;
size_t group_stride;
const uint8_t** indirection_buffer;
Expand All @@ -218,11 +218,29 @@ struct q8dwconv_context {
size_t output_row_stride;
size_t output_col_increment;
union pytorch_qnnp_conv_quantization_params quantization_params;
const pytorch_q8dwconv_up_ukernel_function unipass_ukernel;
const pytorch_q8dwconv_mp_ukernel_function multipass_ukernel;
const pytorch_q8dwconv2d_up_ukernel_function unipass_ukernel;
const pytorch_q8dwconv2d_mp_ukernel_function multipass_ukernel;
};
static void compute_dwconv_unipass(
const struct q8dwconv_context context[1],

// Thread-pool context for the depthwise 3D convolution multipass path.
// One instance is filled in per qnnpackConv call (kernel_size == 27) and
// shared read-only by all (image, output_z) work items dispatched via
// pthreadpool_compute_2d.
struct q8dwconv3d_context {
  size_t groups;
  // Channels rounded up to the ukernel's channel tile; also the length of
  // the per-thread multipass accumulator buffer.
  size_t group_stride;
  // Indirection buffer of input-pixel pointers; indexed per output slice.
  const uint8_t** indirection_buffer;
  // Stride (in pointer entries) between consecutive depth slices of one
  // image within the indirection buffer.
  size_t indirection_buffer_slice_stride;
  // Byte strides passed through to the ukernel for stepping across output
  // rows / columns of the indirection buffer.
  size_t indirection_buffer_row_stride;
  size_t indirection_buffer_col_stride;
  // Prepacked filter weights (+ bias), laid out by conv-prepack.
  const void* packed_weights;
  uint8_t* output;
  size_t output_depth;
  size_t output_height;
  size_t output_width;
  // Byte stride between consecutive output depth slices
  // (output_height * output_width * output_pixel_stride).
  size_t output_slice_stride;
  union pytorch_qnnp_conv_quantization_params quantization_params;
  // 3D depthwise has only a multipass ukernel (no unipass variant, unlike
  // the 2D 3x3 case above).
  const pytorch_q8dwconv3d_mp_ukernel_function multipass_ukernel;
};

static void compute_dwconv2d_unipass(
const struct q8dwconv2d_context context[1],
size_t image,
size_t output_y) {
const size_t output_height = context->output_height;
Expand All @@ -240,8 +258,8 @@ static void compute_dwconv_unipass(
context->output_col_increment,
&context->quantization_params);
}
static void compute_dwconv_multiipass(
const struct q8dwconv_context context[1],
static void compute_dwconv2d_multiipass(
const struct q8dwconv2d_context context[1],
size_t image,
size_t output_y) {
const size_t output_height = context->output_height;
Expand Down Expand Up @@ -271,6 +289,40 @@ static void compute_dwconv_multiipass(
#endif
}

// Computes one output depth slice (output_z) of one image for a depthwise
// 3D convolution, using the multipass ukernel with a per-thread int32
// accumulator. Invoked by pthreadpool_compute_2d over
// (batch_size, output_depth).
// NOTE: "multiipass" spelling matches the existing 2D sibling
// compute_dwconv2d_multiipass.
static void compute_dwconv3d_multiipass(
    const struct q8dwconv3d_context context[1],
    size_t image,
    size_t output_z) {
  const size_t output_depth = context->output_depth;
  // Scratch accumulator, one int32 per (padded) channel. MSVC has no VLAs,
  // so fall back to stack allocation via _malloca there.
  PYTORCH_QNNP_ALIGN(16)
#ifdef _MSC_VER
  int32_t* multipass_acc =
      (int32_t*)_malloca(sizeof(int32_t) * context->group_stride);
#else
  int32_t multipass_acc[context->group_stride];
#endif

  context->multipass_ukernel(
      context->groups,
      context->output_height,
      context->output_width,
      // Start of this slice's input-pointer table: slices are laid out
      // image-major, depth-minor in the indirection buffer.
      context->indirection_buffer +
          (image * output_depth + output_z) *
              context->indirection_buffer_slice_stride,
      context->packed_weights,
      multipass_acc,
      // Corresponding output slice, same image/depth linearization.
      context->output +
          (image * output_depth + output_z) * context->output_slice_stride,
      context->indirection_buffer_row_stride,
      context->indirection_buffer_col_stride,
      // Output column increment is 0 here — presumably the 3D ukernel
      // derives column stepping internally; confirm against ukernel impl.
      0,
      &context->quantization_params);

#ifdef _MSC_VER
  _freea(multipass_acc);
#endif
}

struct QnnpackDeleter {
void operator()(pytorch_qnnp_operator_t op) {
pytorch_qnnp_delete_operator(op);
Expand Down Expand Up @@ -366,7 +418,7 @@ enum pytorch_qnnp_status qnnpackConv(

switch (kernel_size) {
case 9: {
struct q8dwconv_context context = {
struct q8dwconv2d_context context = {
.groups = groups,
.group_stride = group_stride,
.indirection_buffer =
Expand All @@ -392,14 +444,14 @@ enum pytorch_qnnp_status qnnpackConv(
};
pthreadpool_compute_2d(
threadpool,
(pthreadpool_function_2d_t)compute_dwconv_unipass,
(pthreadpool_function_2d_t)compute_dwconv2d_unipass,
&context,
batch_size,
convolution->output_height);
break;
}
case 25: {
struct q8dwconv_context context = {
struct q8dwconv2d_context context = {
.groups = groups,
.group_stride = group_stride,
.indirection_buffer =
Expand All @@ -425,12 +477,41 @@ enum pytorch_qnnp_status qnnpackConv(
};
pthreadpool_compute_2d(
threadpool,
(pthreadpool_function_2d_t)compute_dwconv_multiipass,
(pthreadpool_function_2d_t)compute_dwconv2d_multiipass,
&context,
batch_size,
convolution->output_height);
break;
}
case 27: {
struct q8dwconv3d_context context = {
.groups = groups,
.group_stride = group_stride,
.indirection_buffer =
(const uint8_t**)convolution->indirection_buffer,
.indirection_buffer_slice_stride =
step_height * convolution->output_height,
.indirection_buffer_row_stride = step_height * sizeof(void*),
.indirection_buffer_col_stride =
kernel_height * kernel_depth * step_width * sizeof(void*),
.packed_weights = packed_weights,
.output = output,
.output_depth = convolution->output_depth,
.output_height = convolution->output_height,
.output_width = convolution->output_width,
.output_slice_stride = convolution->output_height *
convolution->output_width * output_pixel_stride,
.quantization_params = conv_quantization_params,
.multipass_ukernel = pytorch_qnnp_params.q8dw27.mpdw,
};
pthreadpool_compute_2d(
threadpool,
(pthreadpool_function_2d_t)compute_dwconv3d_multiipass,
&context,
batch_size,
convolution->output_depth);
break;
}
default:
PYTORCH_QNNP_UNREACHABLE;
}
Expand Down
Loading

0 comments on commit 9c742be

Please sign in to comment.