optimize UpSampleNearest 1d 2d and 3d performance on CPU (pytorch#31452)
Summary:
This PR improves `UpSample` performance with `mode='nearest'` for 1D, 2D, and 3D inputs; both inference and training are covered. The current ATen implementation is not parallelized.

1. Single-socket inference speedup for 1d, 2d, and 3d: **63x, 57x, 46x**.
2. Single-core inference speedup for 1d, 2d, and 3d: **5.9x, 4.6x, 3.4x**.
3. Dual-socket training speedup for 1d, 2d, and 3d: **38x, 33x, 65x**.
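The harness behind these numbers is not included in the commit. For orientation, a minimal libtorch timing sketch along the following lines measures the same operation; the shape, output size, and iteration count are illustrative assumptions, not the configuration behind the numbers above:

```cpp
#include <torch/torch.h>
#include <chrono>
#include <iostream>

// Minimal timing sketch for nearest-neighbor upsampling on CPU.
// Shape and iteration count are illustrative only.
int main() {
  namespace F = torch::nn::functional;
  torch::NoGradGuard no_grad;
  auto input = torch::randn({32, 128, 64, 64});  // NCHW, the 2d case

  auto run = [&] {
    return F::interpolate(
        input,
        F::InterpolateFuncOptions()
            .size(std::vector<int64_t>{128, 128})
            .mode(torch::kNearest));
  };

  for (int i = 0; i < 10; ++i) run();  // warm-up

  constexpr int iters = 100;
  auto t0 = std::chrono::steady_clock::now();
  for (int i = 0; i < iters; ++i) run();
  auto t1 = std::chrono::steady_clock::now();

  std::cout << std::chrono::duration<double, std::milli>(t1 - t0).count() / iters
            << " ms/iter with " << at::get_num_threads() << " threads\n";
}
```

Running such a harness with `OMP_NUM_THREADS=1` versus all cores is one way to separate the single-core gain from the parallelization gain.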
Pull Request resolved: pytorch#31452

Differential Revision: D20077828

Pulled By: VitalyFedyunin

fbshipit-source-id: a7815cf2ae344696067d2ec63bd4f4e858eaafff
mingfeima authored and facebook-github-bot committed Mar 4, 2020
1 parent 112cecc commit 39f78db
Showing 5 changed files with 352 additions and 492 deletions.
12 changes: 12 additions & 0 deletions aten/src/ATen/native/UpSample.h
```diff
@@ -2,6 +2,7 @@
 
 #include <ATen/ATen.h>
 #include <ATen/TensorUtils.h>
+#include <ATen/native/DispatchStub.h>
 
 
 /**
@@ -44,6 +45,17 @@
 namespace at {
 namespace native {
 
+using scale_t = c10::optional<double>;
+using upsampling_1d = void(*)(Tensor& output, const Tensor& input, scale_t scales_w);
+using upsampling_2d = void(*)(Tensor& output, const Tensor& input, scale_t scales_h, scale_t scales_w);
+using upsampling_3d = void(*)(Tensor& output, const Tensor& input, scale_t scales_d, scale_t scales_h, scale_t scales_w);
+DECLARE_DISPATCH(upsampling_1d, upsample_nearest1d_kernel);
+DECLARE_DISPATCH(upsampling_2d, upsample_nearest2d_kernel);
+DECLARE_DISPATCH(upsampling_3d, upsample_nearest3d_kernel);
+DECLARE_DISPATCH(upsampling_1d, upsample_nearest1d_backward_kernel);
+DECLARE_DISPATCH(upsampling_2d, upsample_nearest2d_backward_kernel);
+DECLARE_DISPATCH(upsampling_3d, upsample_nearest3d_backward_kernel);
+
 static inline void upsample_1d_shape_check(
     const Tensor& input,
     const Tensor& grad_output,
```
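The declarations above use ATen's DispatchStub mechanism, which is what allows the new kernels to be compiled per CPU capability and resolved at runtime. As a schematic reminder of the pattern (the `*_impl` function below is illustrative, not code from this PR): a stub is declared in a header, defined once in a generic translation unit, and filled in by a registration in a kernel file; call sites then invoke the stub with a device type.

```cpp
// Header: declare the stub's function-pointer type and the stub itself.
using upsampling_1d = void (*)(Tensor& output, const Tensor& input,
                               c10::optional<double> scales_w);
DECLARE_DISPATCH(upsampling_1d, upsample_nearest1d_kernel);

// Generic .cpp, compiled once: give the stub its definition.
DEFINE_DISPATCH(upsample_nearest1d_kernel);

// Kernel .cpp, compiled per CPU capability (e.g. AVX2):
namespace {
void upsample_nearest1d_kernel_impl(
    Tensor& output, const Tensor& input, c10::optional<double> scales_w) {
  // ... the parallelized nearest-neighbor loop lives here ...
}
} // namespace
REGISTER_DISPATCH(upsample_nearest1d_kernel, &upsample_nearest1d_kernel_impl);

// Call site: the first argument selects the device type.
// upsample_nearest1d_kernel(kCPU, output, input, scales_w);
```

The `upsample_nearest1d_kernel(kCPU, ...)` calls in the UpSampleNearest1d.cpp diff below are such call sites, and the matching `DEFINE_DISPATCH` lines appear at the bottom of that file.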
138 changes: 12 additions & 126 deletions aten/src/ATen/native/UpSampleNearest1d.cpp
```diff
@@ -6,95 +6,9 @@ namespace at {
 namespace native {
 namespace {
 
-template <typename scalar_t>
-static void upsample_nearest1d_out_frame(
-    scalar_t* odata,
-    scalar_t* idata,
-    int64_t input_width,
-    int64_t output_width,
-    int64_t nbatch,
-    int64_t channels,
-    c10::optional<double> scales) {
-  const float scale = compute_scales_value<float>(scales, input_width, output_width);
-  channels = channels * nbatch;
-
-  // special case: just copy
-  if (input_width == output_width) {
-    for (int64_t w2 = 0; w2 < output_width; ++w2) {
-      const int64_t w1 = w2;
-      const scalar_t* pos1 = &idata[w1];
-      scalar_t* pos2 = &odata[w2];
-
-      for (int64_t c = 0; c < channels; ++c) {
-        pos2[0] = pos1[0];
-        pos1 += input_width;
-        pos2 += output_width;
-      }
-    }
-    return;
-  }
-
-  for (int64_t w2 = 0; w2 < output_width; ++w2) {
-    const scalar_t src_x =
-        nearest_neighbor_compute_source_index(scale, w2, input_width);
-    const int64_t w1 = src_x;
-    const scalar_t* pos1 = &idata[w1];
-    scalar_t* pos2 = &odata[w2];
-
-    for (int64_t c = 0; c < channels; ++c) {
-      pos2[0] = pos1[0];
-      pos1 += input_width;
-      pos2 += output_width;
-    }
-  }
-}
-
-template <typename scalar_t>
-static void upsample_nearest1d_backward_out_frame(
-    scalar_t* odata,
-    scalar_t* idata,
-    int64_t input_width,
-    int64_t output_width,
-    int64_t nbatch,
-    int64_t channels,
-    c10::optional<double> scales) {
-  const float scale = compute_scales_value<float>(scales, input_width, output_width);
-  channels = channels * nbatch;
-
-  // special case: same-size matching grids
-  if (input_width == output_width) {
-    for (int64_t w2 = 0; w2 < output_width; ++w2) {
-      const int64_t w1 = w2;
-      scalar_t* pos1 = &idata[w1];
-      const scalar_t* pos2 = &odata[w2];
-
-      for (int64_t c = 0; c < channels; ++c) {
-        pos1[0] += pos2[0];
-        pos1 += input_width;
-        pos2 += output_width;
-      }
-    }
-    return;
-  }
-
-  for (int64_t w2 = 0; w2 < output_width; ++w2) {
-    const int64_t w1 =
-        nearest_neighbor_compute_source_index(scale, w2, input_width);
-
-    scalar_t* pos1 = &idata[w1];
-    const scalar_t* pos2 = &odata[w2];
-
-    for (int64_t c = 0; c < channels; ++c) {
-      pos1[0] += pos2[0];
-      pos1 += input_width;
-      pos2 += output_width;
-    }
-  }
-}
-
 static void upsample_nearest1d_out_cpu_template(
     Tensor& output,
-    const Tensor& input_,
+    const Tensor& input,
     IntArrayRef output_size,
     c10::optional<double> scales) {
   TORCH_CHECK(
@@ -104,43 +18,27 @@ static void upsample_nearest1d_out_cpu_template(
 
   int64_t output_width = output_size[0];
 
-  int64_t nbatch = input_.size(0);
-  int64_t channels = input_.size(1);
-  int64_t input_width = input_.size(2);
+  int64_t nbatch = input.size(0);
+  int64_t channels = input.size(1);
+  int64_t input_width = input.size(2);
 
   upsample_1d_shape_check(
-      input_,
+      input,
       Tensor(),
       nbatch,
       channels,
       input_width,
       output_width);
 
-  auto input = input_.contiguous();
-
   output.resize_({nbatch, channels, output_width});
-  output.zero_();
 
   AT_ASSERT(input_width > 0 && output_width > 0);
-
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "upsample_nearest1d", [&] {
-    auto* idata = input.data_ptr<scalar_t>();
-    auto* odata = output.data_ptr<scalar_t>();
-
-    upsample_nearest1d_out_frame<scalar_t>(
-        odata,
-        idata,
-        input_width,
-        output_width,
-        nbatch,
-        channels,
-        scales);
-  });
+  upsample_nearest1d_kernel(kCPU, output, input, scales);
 }
 
 static void upsample_nearest1d_backward_out_cpu_template(
     Tensor& grad_input,
-    const Tensor& grad_output_,
+    const Tensor& grad_output,
     IntArrayRef output_size,
     IntArrayRef input_size,
     c10::optional<double> scales) {
@@ -162,31 +60,16 @@ static void upsample_nearest1d_backward_out_cpu_template(
 
   upsample_1d_shape_check(
       Tensor(),
-      grad_output_,
+      grad_output,
       nbatch,
       channels,
       input_width,
       output_width);
 
-  auto grad_output = grad_output_.contiguous();
-
   grad_input.resize_({nbatch, channels, input_width});
   grad_input.zero_();
 
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      grad_output.scalar_type(), "upsample_nearest1d_backward", [&] {
-        scalar_t* idata = grad_input.data_ptr<scalar_t>();
-        scalar_t* odata = grad_output.data_ptr<scalar_t>();
-
-        upsample_nearest1d_backward_out_frame<scalar_t>(
-            odata,
-            idata,
-            input_width,
-            output_width,
-            nbatch,
-            channels,
-            scales);
-      });
+  upsample_nearest1d_backward_kernel(kCPU, grad_input, grad_output, scales);
 }
 } // namespace
 
@@ -227,5 +110,8 @@ Tensor upsample_nearest1d_backward_cpu(
   return grad_input;
 }
 
+DEFINE_DISPATCH(upsample_nearest1d_kernel);
+DEFINE_DISPATCH(upsample_nearest1d_backward_kernel);
+
 } // namespace native
 } // namespace at
```
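The registered kernel implementations, where the parallelization actually happens, live in the remaining changed files, which this snapshot does not display. As a loose sketch of the technique the summary describes (the function name and simplified indexing here are assumptions, not the PR's actual kernel), a parallelized nearest-neighbor 1d forward loop built on `at::parallel_for` could look like:

```cpp
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <algorithm>

// Sketch only: a nearest-1d forward kernel parallelized over the flattened
// batch*channel dimension. Each output row is written by exactly one thread,
// so no synchronization is needed.
template <typename scalar_t>
void upsample_nearest1d_sketch(
    scalar_t* odata,
    const scalar_t* idata,
    int64_t channels,      // nbatch * channels, flattened
    int64_t input_width,
    int64_t output_width,
    float scale) {
  at::parallel_for(0, channels, /*grain_size=*/0, [&](int64_t begin, int64_t end) {
    for (int64_t c = begin; c < end; ++c) {
      const scalar_t* src = idata + c * input_width;
      scalar_t* dst = odata + c * output_width;
      for (int64_t w2 = 0; w2 < output_width; ++w2) {
        // Same rule as nearest_neighbor_compute_source_index: truncate
        // w2 * scale and clamp to the last valid input column.
        const int64_t w1 =
            std::min(static_cast<int64_t>(w2 * scale), input_width - 1);
        dst[w2] = src[w1];
      }
    }
  });
}
```

The backward pass can parallelize over the same batch×channel dimension: several output columns may accumulate into one input column, but those collisions stay within a single row, and each row is owned by exactly one thread.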