optimize UpSampleNearest 1d 2d and 3d performance on CPU (pytorch#31452)
Summary:
This PR improves `UpSample` performance with `mode='nearest'` for 1D, 2D, and 3D inputs; both inference and training are covered. The current ATen implementation is not parallelized.

1. Single-socket inference speedup for 1d, 2d, and 3d: **63x, 57x, 46x**.
2. Single-core inference speedup for 1d, 2d, and 3d: **5.9x, 4.6x, 3.4x**.
3. Dual-socket training speedup for 1d, 2d, and 3d: **38x, 33x, 65x**.
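The harness behind these numbers is not included in the commit. For orientation, a minimal libtorch timing sketch along the following lines measures the same operation; the shape, output size, and iteration count are illustrative assumptions, not the configuration behind the numbers above:

```cpp
#include <torch/torch.h>
#include <chrono>
#include <iostream>

// Minimal timing sketch for nearest-neighbor upsampling on CPU.
// Shape and iteration count are illustrative only.
int main() {
  namespace F = torch::nn::functional;
  torch::NoGradGuard no_grad;
  auto input = torch::randn({32, 128, 64, 64});  // NCHW, the 2d case

  auto run = [&] {
    return F::interpolate(
        input,
        F::InterpolateFuncOptions()
            .size(std::vector<int64_t>{128, 128})
            .mode(torch::kNearest));
  };

  for (int i = 0; i < 10; ++i) run();  // warm-up

  constexpr int iters = 100;
  auto t0 = std::chrono::steady_clock::now();
  for (int i = 0; i < iters; ++i) run();
  auto t1 = std::chrono::steady_clock::now();

  std::cout << std::chrono::duration<double, std::milli>(t1 - t0).count() / iters
            << " ms/iter with " << at::get_num_threads() << " threads\n";
}
```

Running such a harness with `OMP_NUM_THREADS=1` versus all cores is one way to separate the single-core gain from the parallelization gain.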
Pull Request resolved: pytorch#31452

Differential Revision: D20077828

Pulled By: VitalyFedyunin

fbshipit-source-id: a7815cf2ae344696067d2ec63bd4f4e858eaafff
mingfeima authored and facebook-github-bot committed Mar 4, 2020
1 parent 112cecc commit 39f78db
Showing 5 changed files with 352 additions and 492 deletions.
12 changes: 12 additions & 0 deletions aten/src/ATen/native/UpSample.h
```diff
@@ -2,6 +2,7 @@
 
 #include <ATen/ATen.h>
 #include <ATen/TensorUtils.h>
+#include <ATen/native/DispatchStub.h>
 
 
 /**
@@ -44,6 +45,17 @@
 namespace at {
 namespace native {
 
+using scale_t = c10::optional<double>;
+using upsampling_1d = void(*)(Tensor& output, const Tensor& input, scale_t scales_w);
+using upsampling_2d = void(*)(Tensor& output, const Tensor& input, scale_t scales_h, scale_t scales_w);
+using upsampling_3d = void(*)(Tensor& output, const Tensor& input, scale_t scales_d, scale_t scales_h, scale_t scales_w);
+DECLARE_DISPATCH(upsampling_1d, upsample_nearest1d_kernel);
+DECLARE_DISPATCH(upsampling_2d, upsample_nearest2d_kernel);
+DECLARE_DISPATCH(upsampling_3d, upsample_nearest3d_kernel);
+DECLARE_DISPATCH(upsampling_1d, upsample_nearest1d_backward_kernel);
+DECLARE_DISPATCH(upsampling_2d, upsample_nearest2d_backward_kernel);
+DECLARE_DISPATCH(upsampling_3d, upsample_nearest3d_backward_kernel);
+
 static inline void upsample_1d_shape_check(
     const Tensor& input,
     const Tensor& grad_output,
```
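The declarations above use ATen's DispatchStub mechanism, which is what allows the new kernels to be compiled per CPU capability and resolved at runtime. As a schematic reminder of the pattern (the `*_impl` function below is illustrative, not code from this PR): a stub is declared in a header, defined once in a generic translation unit, and filled in by a registration in a kernel file; call sites then invoke the stub with a device type.

```cpp
// Header: declare the stub's function-pointer type and the stub itself.
using upsampling_1d = void (*)(Tensor& output, const Tensor& input,
                               c10::optional<double> scales_w);
DECLARE_DISPATCH(upsampling_1d, upsample_nearest1d_kernel);

// Generic .cpp, compiled once: give the stub its definition.
DEFINE_DISPATCH(upsample_nearest1d_kernel);

// Kernel .cpp, compiled per CPU capability (e.g. AVX2):
namespace {
void upsample_nearest1d_kernel_impl(
    Tensor& output, const Tensor& input, c10::optional<double> scales_w) {
  // ... the parallelized nearest-neighbor loop lives here ...
}
} // namespace
REGISTER_DISPATCH(upsample_nearest1d_kernel, &upsample_nearest1d_kernel_impl);

// Call site: the first argument selects the device type.
// upsample_nearest1d_kernel(kCPU, output, input, scales_w);
```

The `upsample_nearest1d_kernel(kCPU, ...)` calls in the UpSampleNearest1d.cpp diff below are such call sites, and the matching `DEFINE_DISPATCH` lines appear at the bottom of that file.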
138 changes: 12 additions & 126 deletions aten/src/ATen/native/UpSampleNearest1d.cpp
```diff
@@ -6,95 +6,9 @@ namespace at {
 namespace native {
 namespace {
 
-template <typename scalar_t>
-static void upsample_nearest1d_out_frame(
-    scalar_t* odata,
-    scalar_t* idata,
-    int64_t input_width,
-    int64_t output_width,
-    int64_t nbatch,
-    int64_t channels,
-    c10::optional<double> scales) {
-  const float scale = compute_scales_value<float>(scales, input_width, output_width);
-  channels = channels * nbatch;
-
-  // special case: just copy
-  if (input_width == output_width) {
-    for (int64_t w2 = 0; w2 < output_width; ++w2) {
-      const int64_t w1 = w2;
-      const scalar_t* pos1 = &idata[w1];
-      scalar_t* pos2 = &odata[w2];
-
-      for (int64_t c = 0; c < channels; ++c) {
-        pos2[0] = pos1[0];
-        pos1 += input_width;
-        pos2 += output_width;
-      }
-    }
-    return;
-  }
-
-  for (int64_t w2 = 0; w2 < output_width; ++w2) {
-    const scalar_t src_x =
-        nearest_neighbor_compute_source_index(scale, w2, input_width);
-    const int64_t w1 = src_x;
-    const scalar_t* pos1 = &idata[w1];
-    scalar_t* pos2 = &odata[w2];
-
-    for (int64_t c = 0; c < channels; ++c) {
-      pos2[0] = pos1[0];
-      pos1 += input_width;
-      pos2 += output_width;
-    }
-  }
-}
-
-template <typename scalar_t>
-static void upsample_nearest1d_backward_out_frame(
-    scalar_t* odata,
-    scalar_t* idata,
-    int64_t input_width,
-    int64_t output_width,
-    int64_t nbatch,
-    int64_t channels,
-    c10::optional<double> scales) {
-  const float scale = compute_scales_value<float>(scales, input_width, output_width);
-  channels = channels * nbatch;
-
-  // special case: same-size matching grids
-  if (input_width == output_width) {
-    for (int64_t w2 = 0; w2 < output_width; ++w2) {
-      const int64_t w1 = w2;
-      scalar_t* pos1 = &idata[w1];
-      const scalar_t* pos2 = &odata[w2];
-
-      for (int64_t c = 0; c < channels; ++c) {
-        pos1[0] += pos2[0];
-        pos1 += input_width;
-        pos2 += output_width;
-      }
-    }
-    return;
-  }
-
-  for (int64_t w2 = 0; w2 < output_width; ++w2) {
-    const int64_t w1 =
-        nearest_neighbor_compute_source_index(scale, w2, input_width);
-
-    scalar_t* pos1 = &idata[w1];
-    const scalar_t* pos2 = &odata[w2];
-
-    for (int64_t c = 0; c < channels; ++c) {
-      pos1[0] += pos2[0];
-      pos1 += input_width;
-      pos2 += output_width;
-    }
-  }
-}
-
 static void upsample_nearest1d_out_cpu_template(
     Tensor& output,
-    const Tensor& input_,
+    const Tensor& input,
     IntArrayRef output_size,
     c10::optional<double> scales) {
   TORCH_CHECK(
@@ -104,43 +18,27 @@ static void upsample_nearest1d_out_cpu_template(
 
   int64_t output_width = output_size[0];
 
-  int64_t nbatch = input_.size(0);
-  int64_t channels = input_.size(1);
-  int64_t input_width = input_.size(2);
+  int64_t nbatch = input.size(0);
+  int64_t channels = input.size(1);
+  int64_t input_width = input.size(2);
 
   upsample_1d_shape_check(
-      input_,
+      input,
       Tensor(),
       nbatch,
       channels,
       input_width,
       output_width);
 
-  auto input = input_.contiguous();
-
   output.resize_({nbatch, channels, output_width});
-  output.zero_();
 
   AT_ASSERT(input_width > 0 && output_width > 0);
-
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "upsample_nearest1d", [&] {
-    auto* idata = input.data_ptr<scalar_t>();
-    auto* odata = output.data_ptr<scalar_t>();
-
-    upsample_nearest1d_out_frame<scalar_t>(
-        odata,
-        idata,
-        input_width,
-        output_width,
-        nbatch,
-        channels,
-        scales);
-  });
+  upsample_nearest1d_kernel(kCPU, output, input, scales);
 }
 
 static void upsample_nearest1d_backward_out_cpu_template(
     Tensor& grad_input,
-    const Tensor& grad_output_,
+    const Tensor& grad_output,
     IntArrayRef output_size,
     IntArrayRef input_size,
     c10::optional<double> scales) {
@@ -162,31 +60,16 @@ static void upsample_nearest1d_backward_out_cpu_template(
 
   upsample_1d_shape_check(
       Tensor(),
-      grad_output_,
+      grad_output,
       nbatch,
       channels,
       input_width,
       output_width);
 
-  auto grad_output = grad_output_.contiguous();
-
   grad_input.resize_({nbatch, channels, input_width});
   grad_input.zero_();
 
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      grad_output.scalar_type(), "upsample_nearest1d_backward", [&] {
-        scalar_t* idata = grad_input.data_ptr<scalar_t>();
-        scalar_t* odata = grad_output.data_ptr<scalar_t>();
-
-        upsample_nearest1d_backward_out_frame<scalar_t>(
-            odata,
-            idata,
-            input_width,
-            output_width,
-            nbatch,
-            channels,
-            scales);
-      });
+  upsample_nearest1d_backward_kernel(kCPU, grad_input, grad_output, scales);
 }
 } // namespace
 
@@ -227,5 +110,8 @@ Tensor upsample_nearest1d_backward_cpu(
   return grad_input;
 }
 
+DEFINE_DISPATCH(upsample_nearest1d_kernel);
+DEFINE_DISPATCH(upsample_nearest1d_backward_kernel);
+
 } // namespace native
 } // namespace at
```
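The registered kernel implementations, where the parallelization actually happens, live in the remaining changed files, which this snapshot does not display. As a loose sketch of the technique the summary describes (the function name and simplified indexing here are assumptions, not the PR's actual kernel), a parallelized nearest-neighbor 1d forward loop built on `at::parallel_for` could look like:

```cpp
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <algorithm>

// Sketch only: a nearest-1d forward kernel parallelized over the flattened
// batch*channel dimension. Each output row is written by exactly one thread,
// so no synchronization is needed.
template <typename scalar_t>
void upsample_nearest1d_sketch(
    scalar_t* odata,
    const scalar_t* idata,
    int64_t channels,      // nbatch * channels, flattened
    int64_t input_width,
    int64_t output_width,
    float scale) {
  at::parallel_for(0, channels, /*grain_size=*/0, [&](int64_t begin, int64_t end) {
    for (int64_t c = begin; c < end; ++c) {
      const scalar_t* src = idata + c * input_width;
      scalar_t* dst = odata + c * output_width;
      for (int64_t w2 = 0; w2 < output_width; ++w2) {
        // Same rule as nearest_neighbor_compute_source_index: truncate
        // w2 * scale and clamp to the last valid input column.
        const int64_t w1 =
            std::min(static_cast<int64_t>(w2 * scale), input_width - 1);
        dst[w2] = src[w1];
      }
    }
  });
}
```

The backward pass can parallelize over the same batch×channel dimension: several output columns may accumulate into one input column, but those collisions stay within a single row, and each row is owned by exactly one thread.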