Skip to content

Commit

Permalink
[PyTorch Edge][QNNPack] Enable Depthwise Specific Conv3d Kernel for K…
Browse files Browse the repository at this point in the history
…ernel Size 3x3x3 (pytorch#69315)

Summary:
Pull Request resolved: pytorch#69315

Uses kernels and setup modifications from earlier diffs in this stack
ghstack-source-id: 146346780

Test Plan:
**Correctness**
- Test using QNNPack Operator-Level Test:
-- Neon Kernel: As in test plan of D32217846, all tests pass
-- SSE2 Kernel: ```buck test xplat/caffe2/aten/src/ATen/native/quantized/cpu/qnnpack:pytorch_qnnpack_test```, all tests pass
- Test by Printing Results of Model-Level Test: D32122020

**Performance**

*Operator Level tests from convolution.cc in D32217846*
||Before (V23 of D32217846, without newly added kernel)|After (V48 of D31966574, with newly added kernel)|
|--|--|--|
|depthwise 3x3x3 static|184 ms|134 ms|
|depthwise 3x3x3 runtime|181 ms|134 ms|
|depthwise 3x3x3s2 static|30 ms|22 ms|
|depthwise 3x3x3s2 runtime|30 ms|23 ms|
|depthwise 3x3x3s1x2 static|97 ms|70 ms|
|depthwise 3x3x3s1x2 runtime|96 ms|70 ms|
|depthwise 3x3x3s2x1 static|53 ms|38 ms|
|depthwise 3x3x3s2x1 runtime|53 ms|38 ms|
|depthwise 3x3x3d2 static|104 ms|74 ms|
|depthwise 3x3x3d2 runtime|103 ms|75 ms|
|depthwise 3x3x3d1x2 static|158 ms|116 ms|
|depthwise 3x3x3d1x2 runtime|157 ms|115 ms|
|depthwise 3x3x3d2x1 static|120 ms|86 ms|
|depthwise 3x3x3d2x1 runtime|120 ms|87 ms|
|depthwise 3x3x3 per channel static|182 ms|134 ms|
|depthwise 3x3x3 per channel runtime|184 ms|134 ms|
|depthwise 3x3x3s2 per channel static|30 ms|22 ms|
|depthwise 3x3x3s2 per channel runtime|31 ms|23 ms|
|depthwise 3x3x3s1x2 per channel static|95 ms|70 ms|
|depthwise 3x3x3s1x2 per channel runtime|95 ms|71 ms|
|depthwise 3x3x3s2x1 per channel static|53 ms|39 ms|
|depthwise 3x3x3s2x1 per channel runtime|55 ms|39 ms|
|depthwise 3x3x3d2 per channel static|105 ms|75 ms|
|depthwise 3x3x3d2 per channel runtime|103 ms|75 ms|
|depthwise 3x3x3d1x2 per channel static|158 ms|116 ms|
|depthwise 3x3x3d1x2 per channel runtime|158 ms|116 ms|
|depthwise 3x3x3d2x1 per channel static|118 ms|87 ms|
|depthwise 3x3x3d2x1 per channel runtime|119 ms|87 ms|

Average Change: -36.96%

(Generated with https://www.internalfb.com/intern/anp/view/?id=1371846&revision_id=291376782898627)

*Model Level Test on Synthesized Conv3d Model*

Model Details:
- 21 channels, input size: 9 x 12 x 7, kernel size: 3x3x3
- Config added in D31928710
- Model generated with https://www.internalfb.com/intern/anp/view/?id=1313660&revision_id=248658657303993

```buck run aibench:run_bench -- -b dw_conv_3d_3x3x3_big_2b.json --platform android/arm64 --framework pytorch --remote --devices Pixel-4a-11-30```

- Before (V23 of D32217846): [0.0935 ms](https://our.intern.facebook.com/intern/aibench/details/768298420366437)
- After (V48 of D31966574): [0.0665 ms](https://our.intern.facebook.com/intern/aibench/details/67271954298132)
(29% faster)

*Model Level Test on Video Model-like Inputs (provided by liyilui)*
- D33000199
- 87.5% faster

Reviewed By: kimishpatel

Differential Revision: D31966574

fbshipit-source-id: 6554a878401c1120054f6b02241456e8fb44b152
  • Loading branch information
salilsdesai authored and facebook-github-bot committed Dec 30, 2021
1 parent 3d4590d commit 9c742be
Show file tree
Hide file tree
Showing 9 changed files with 546 additions and 135 deletions.
56 changes: 56 additions & 0 deletions aten/src/ATen/native/quantized/cpu/qnnpack/src/conv-prepack.cc
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,62 @@ PrePackConvWeights::PrePackConvWeights(
(20 + sizeof(int32_t) / sizeof(uint8_t)) * c_stride,
false);
break;
case 27:
pytorch_pack_q8dw_3d_w_dilation(
kernel_depth,
kernel_height,
kernel_width,
groups,
cr,
0,
kernel_depth,
0,
kernel_height,
0,
1,
kernel,
bias,
packed_weights_,
true);
pytorch_pack_q8dw_3d_w_dilation(
kernel_depth,
kernel_height,
kernel_width,
groups,
cr,
0,
kernel_depth,
0,
kernel_height,
1,
2,
kernel,
bias,
(char*)packed_weights_ +
(kernel_depth * kernel_height +
sizeof(int32_t) / sizeof(uint8_t)) *
c_stride,
false);
pytorch_pack_q8dw_3d_w_dilation(
kernel_depth,
kernel_height,
kernel_width,
groups,
cr,
0,
kernel_depth,
0,
kernel_height,
2,
3,
kernel,
bias,
(char*)packed_weights_ +
(2 * kernel_depth * kernel_height +
sizeof(int32_t) / sizeof(uint8_t)) *
c_stride,
false);
break;
default:
PYTORCH_QNNP_UNREACHABLE;
}
Expand Down
103 changes: 92 additions & 11 deletions aten/src/ATen/native/quantized/cpu/qnnpack/src/conv-run.cc
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ static void compute_sum_rows(
block_start);
}

struct q8dwconv_context {
struct q8dwconv2d_context {
size_t groups;
size_t group_stride;
const uint8_t** indirection_buffer;
Expand All @@ -218,11 +218,29 @@ struct q8dwconv_context {
size_t output_row_stride;
size_t output_col_increment;
union pytorch_qnnp_conv_quantization_params quantization_params;
const pytorch_q8dwconv_up_ukernel_function unipass_ukernel;
const pytorch_q8dwconv_mp_ukernel_function multipass_ukernel;
const pytorch_q8dwconv2d_up_ukernel_function unipass_ukernel;
const pytorch_q8dwconv2d_mp_ukernel_function multipass_ukernel;
};
static void compute_dwconv_unipass(
const struct q8dwconv_context context[1],

// Thread-pool context for the depthwise 3D convolution multipass path.
// One instance is filled in per qnnpackConv call (kernel_size == 27) and
// shared read-only by all (image, output_z) work items dispatched via
// pthreadpool_compute_2d.
struct q8dwconv3d_context {
  size_t groups;
  // Channels rounded up to the ukernel's channel tile; also the length of
  // the per-thread multipass accumulator buffer.
  size_t group_stride;
  // Indirection buffer of input-pixel pointers; indexed per output slice.
  const uint8_t** indirection_buffer;
  // Stride (in pointer entries) between consecutive depth slices of one
  // image within the indirection buffer.
  size_t indirection_buffer_slice_stride;
  // Byte strides passed through to the ukernel for stepping across output
  // rows / columns of the indirection buffer.
  size_t indirection_buffer_row_stride;
  size_t indirection_buffer_col_stride;
  // Prepacked filter weights (+ bias), laid out by conv-prepack.
  const void* packed_weights;
  uint8_t* output;
  size_t output_depth;
  size_t output_height;
  size_t output_width;
  // Byte stride between consecutive output depth slices
  // (output_height * output_width * output_pixel_stride).
  size_t output_slice_stride;
  union pytorch_qnnp_conv_quantization_params quantization_params;
  // 3D depthwise has only a multipass ukernel (no unipass variant, unlike
  // the 2D 3x3 case above).
  const pytorch_q8dwconv3d_mp_ukernel_function multipass_ukernel;
};

static void compute_dwconv2d_unipass(
const struct q8dwconv2d_context context[1],
size_t image,
size_t output_y) {
const size_t output_height = context->output_height;
Expand All @@ -240,8 +258,8 @@ static void compute_dwconv_unipass(
context->output_col_increment,
&context->quantization_params);
}
static void compute_dwconv_multiipass(
const struct q8dwconv_context context[1],
static void compute_dwconv2d_multiipass(
const struct q8dwconv2d_context context[1],
size_t image,
size_t output_y) {
const size_t output_height = context->output_height;
Expand Down Expand Up @@ -271,6 +289,40 @@ static void compute_dwconv_multiipass(
#endif
}

// Computes one output depth slice (output_z) of one image for a depthwise
// 3D convolution, using the multipass ukernel with a per-thread int32
// accumulator. Invoked by pthreadpool_compute_2d over
// (batch_size, output_depth).
// NOTE: "multiipass" spelling matches the existing 2D sibling
// compute_dwconv2d_multiipass.
static void compute_dwconv3d_multiipass(
    const struct q8dwconv3d_context context[1],
    size_t image,
    size_t output_z) {
  const size_t output_depth = context->output_depth;
  // Scratch accumulator, one int32 per (padded) channel. MSVC has no VLAs,
  // so fall back to stack allocation via _malloca there.
  PYTORCH_QNNP_ALIGN(16)
#ifdef _MSC_VER
  int32_t* multipass_acc =
      (int32_t*)_malloca(sizeof(int32_t) * context->group_stride);
#else
  int32_t multipass_acc[context->group_stride];
#endif

  context->multipass_ukernel(
      context->groups,
      context->output_height,
      context->output_width,
      // Start of this slice's input-pointer table: slices are laid out
      // image-major, depth-minor in the indirection buffer.
      context->indirection_buffer +
          (image * output_depth + output_z) *
              context->indirection_buffer_slice_stride,
      context->packed_weights,
      multipass_acc,
      // Corresponding output slice, same image/depth linearization.
      context->output +
          (image * output_depth + output_z) * context->output_slice_stride,
      context->indirection_buffer_row_stride,
      context->indirection_buffer_col_stride,
      // Output column increment is 0 here — presumably the 3D ukernel
      // derives column stepping internally; confirm against ukernel impl.
      0,
      &context->quantization_params);

#ifdef _MSC_VER
  _freea(multipass_acc);
#endif
}

struct QnnpackDeleter {
void operator()(pytorch_qnnp_operator_t op) {
pytorch_qnnp_delete_operator(op);
Expand Down Expand Up @@ -366,7 +418,7 @@ enum pytorch_qnnp_status qnnpackConv(

switch (kernel_size) {
case 9: {
struct q8dwconv_context context = {
struct q8dwconv2d_context context = {
.groups = groups,
.group_stride = group_stride,
.indirection_buffer =
Expand All @@ -392,14 +444,14 @@ enum pytorch_qnnp_status qnnpackConv(
};
pthreadpool_compute_2d(
threadpool,
(pthreadpool_function_2d_t)compute_dwconv_unipass,
(pthreadpool_function_2d_t)compute_dwconv2d_unipass,
&context,
batch_size,
convolution->output_height);
break;
}
case 25: {
struct q8dwconv_context context = {
struct q8dwconv2d_context context = {
.groups = groups,
.group_stride = group_stride,
.indirection_buffer =
Expand All @@ -425,12 +477,41 @@ enum pytorch_qnnp_status qnnpackConv(
};
pthreadpool_compute_2d(
threadpool,
(pthreadpool_function_2d_t)compute_dwconv_multiipass,
(pthreadpool_function_2d_t)compute_dwconv2d_multiipass,
&context,
batch_size,
convolution->output_height);
break;
}
case 27: {
struct q8dwconv3d_context context = {
.groups = groups,
.group_stride = group_stride,
.indirection_buffer =
(const uint8_t**)convolution->indirection_buffer,
.indirection_buffer_slice_stride =
step_height * convolution->output_height,
.indirection_buffer_row_stride = step_height * sizeof(void*),
.indirection_buffer_col_stride =
kernel_height * kernel_depth * step_width * sizeof(void*),
.packed_weights = packed_weights,
.output = output,
.output_depth = convolution->output_depth,
.output_height = convolution->output_height,
.output_width = convolution->output_width,
.output_slice_stride = convolution->output_height *
convolution->output_width * output_pixel_stride,
.quantization_params = conv_quantization_params,
.multipass_ukernel = pytorch_qnnp_params.q8dw27.mpdw,
};
pthreadpool_compute_2d(
threadpool,
(pthreadpool_function_2d_t)compute_dwconv3d_multiipass,
&context,
batch_size,
convolution->output_depth);
break;
}
default:
PYTORCH_QNNP_UNREACHABLE;
}
Expand Down
Loading

0 comments on commit 9c742be

Please sign in to comment.