From a3106d424b32b6c6b810f05db5790909a347f539 Mon Sep 17 00:00:00 2001
From: YashasSamaga
Date: Sun, 2 Aug 2020 12:44:35 +0530
Subject: [PATCH] add MVNOp

---
 modules/dnn/src/cuda/mvn.cu                 | 145 ++++++++++++++++++++
 modules/dnn/src/cuda4dnn/kernels/mvn.hpp    |  31 +++++
 modules/dnn/src/cuda4dnn/primitives/mvn.hpp | 134 ++++++++++++++++++
 modules/dnn/src/layers/mvn_layer.cpp        |  33 ++++-
 4 files changed, 342 insertions(+), 1 deletion(-)
 create mode 100644 modules/dnn/src/cuda/mvn.cu
 create mode 100644 modules/dnn/src/cuda4dnn/kernels/mvn.hpp
 create mode 100644 modules/dnn/src/cuda4dnn/primitives/mvn.hpp

diff --git a/modules/dnn/src/cuda/mvn.cu b/modules/dnn/src/cuda/mvn.cu
new file mode 100644
index 000000000000..adf997c0b0c6
--- /dev/null
+++ b/modules/dnn/src/cuda/mvn.cu
@@ -0,0 +1,145 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "math.hpp"
+#include "types.hpp"
+#include "atomics.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+namespace raw {
+    template <class T>
+    __global__ void reduce_mean(Span<float> means, View<T> input, size_type inner_size) {
+        for (auto idx : grid_stride_range(input.size())) {
+            const index_type outer_idx = idx / inner_size;
+            atomicAdd(&means[outer_idx], static_cast<float>(input[idx]) / inner_size);
+        }
+    }
+
+    template <class T>
+    __global__ void reduce_mean_sqr_sum(Span<float> means, Span<float> sum_sqrs, View<T> input, size_type inner_size) {
+        for (auto idx : grid_stride_range(input.size())) {
+            const index_type outer_idx = idx / inner_size;
+            auto x = static_cast<float>(input[idx]);
+            atomicAdd(&means[outer_idx], x / inner_size);
+            atomicAdd(&sum_sqrs[outer_idx], x * x);
+        }
+    }
+
+    __global__ void compute_normalization_scale(Span<float> scale, View<float> means, View<float> sums_sqr, size_type inner_size, float eps) {
+        for (auto idx : grid_stride_range(scale.size())) {
+            auto mean = means[idx];
+            auto var = sums_sqr[idx] / inner_size - mean * mean;
+            using device::rsqrt;
+            scale[idx] = rsqrt(eps + var);
+        }
+    }
+
+    template <class T>
+    __global__ void normalize_mean(Span<T> output, View<T> input, View<float> means, size_type inner_size) {
+        for (auto idx : grid_stride_range(output.size())) {
+            const index_type outer_idx = idx / inner_size;
+            output[idx] = static_cast<float>(input[idx]) - means[outer_idx];
+        }
+    }
+
+    template <class T>
+    __global__ void normalize_mean_variance(Span<T> output, View<T> input, View<float> means, View<float> scale, size_type inner_size) {
+        for (auto idx : grid_stride_range(output.size())) {
+            const index_type outer_idx = idx / inner_size;
+            output[idx] = (static_cast<float>(input[idx]) - means[outer_idx]) * scale[outer_idx];
+        }
+    }
+}
+
+template <class T>
+void reduce_mean(const Stream& stream, Span<float> means, View<T> input, std::size_t inner_size)
+{
+    CV_Assert(input.size() / inner_size == means.size());
+
+    auto kernel = raw::reduce_mean<T>;
+    auto policy = make_policy(kernel, input.size(), 0, stream);
+    launch_kernel(kernel, policy, means, input, inner_size);
+}
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
+template void reduce_mean(const Stream&, Span<float>, View<__half>, std::size_t);
+#endif
+template void reduce_mean(const Stream&, Span<float>, View<float>, std::size_t);
+
+template <class T>
+void reduce_mean_sqr_sum(const Stream& stream, Span<float> means, Span<float> sum_sqrs, View<T> input, std::size_t inner_size)
+{
+    CV_Assert(input.size() / inner_size == means.size());
+    CV_Assert(input.size() / inner_size == sum_sqrs.size());
+
+    auto kernel = raw::reduce_mean_sqr_sum<T>;
+    auto policy = make_policy(kernel, input.size(), 0, stream);
+    launch_kernel(kernel, policy, means, sum_sqrs, input, inner_size);
+}
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
+template void reduce_mean_sqr_sum(const Stream&, Span<float>, Span<float>, View<__half>, std::size_t);
+#endif
+template void reduce_mean_sqr_sum(const Stream&, Span<float>, Span<float>, View<float>, std::size_t);
+
+void compute_normalization_scale(const Stream& stream, Span<float> scale, View<float> means, View<float> sum_sqrs, std::size_t inner_size, float eps)
+{
+    CV_Assert(scale.size() == means.size());
+    CV_Assert(scale.size() == sum_sqrs.size());
+
+    auto kernel = raw::compute_normalization_scale;
+    auto policy = make_policy(kernel, scale.size(), 0, stream);
+    launch_kernel(kernel, policy, scale, means, sum_sqrs, inner_size, eps);
+}
+
+template <class T>
+void normalize_mean(const Stream& stream, Span<T> output, View<T> input, View<float> means, std::size_t inner_size)
+{
+    CV_Assert(output.size() == input.size());
+    CV_Assert(input.size() / inner_size == means.size());
+
+    auto kernel = raw::normalize_mean<T>;
+    auto policy = make_policy(kernel, output.size(), 0, stream);
+    launch_kernel(kernel, policy, output, input, means, inner_size);
+}
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
+template void normalize_mean(const Stream&, Span<__half>, View<__half>, View<float>, std::size_t);
+#endif
+template void normalize_mean(const Stream&, Span<float>, View<float>, View<float>, std::size_t);
+
+template <class T>
+void normalize_mean_variance(const Stream& stream, Span<T> output, View<T> input, View<float> means, View<float> scale, std::size_t inner_size)
+{
+    CV_Assert(input.size() == output.size());
+    CV_Assert(input.size() / inner_size == means.size());
+    CV_Assert(input.size() / inner_size == scale.size());
+
+    auto kernel = raw::normalize_mean_variance<T>;
+    auto policy = make_policy(kernel, output.size(), 0, stream);
+    launch_kernel(kernel, policy, output, input, means, scale, inner_size);
+}
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
+template void normalize_mean_variance(const Stream&, Span<__half>, View<__half>, View<float>, View<float>, std::size_t);
+#endif
+template void normalize_mean_variance(const Stream&, Span<float>, View<float>, View<float>, View<float>, std::size_t);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
diff --git a/modules/dnn/src/cuda4dnn/kernels/mvn.hpp b/modules/dnn/src/cuda4dnn/kernels/mvn.hpp
new file mode 100644
index 000000000000..b5a573e92122
--- /dev/null
+++ b/modules/dnn/src/cuda4dnn/kernels/mvn.hpp
@@ -0,0 +1,31 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MVN_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MVN_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+template <class T>
+void reduce_mean(const csl::Stream& stream, csl::Span<float> means, csl::View<T> input, std::size_t inner_size);
+
+template <class T>
+void reduce_mean_sqr_sum(const csl::Stream& stream, csl::Span<float> means, csl::Span<float> sum_sqrs, csl::View<T> input, std::size_t inner_size);
+
+void compute_normalization_scale(const csl::Stream& stream, csl::Span<float> scale, csl::View<float> means, csl::View<float> sum_sqrs, std::size_t inner_size, float eps);
+
+template <class T>
+void normalize_mean(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, csl::View<float> means, std::size_t inner_size);
+
+template <class T>
+void normalize_mean_variance(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, csl::View<float> means, csl::View<float> scale, std::size_t inner_size);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MVN_HPP */
diff --git a/modules/dnn/src/cuda4dnn/primitives/mvn.hpp b/modules/dnn/src/cuda4dnn/primitives/mvn.hpp
new file mode 100644
index 000000000000..a077521ab3bb
--- /dev/null
+++ b/modules/dnn/src/cuda4dnn/primitives/mvn.hpp
@@ -0,0 +1,134 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MVN_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MVN_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/workspace.hpp"
+
+#include "../kernels/fill_copy.hpp"
+#include "../kernels/mvn.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    struct MVNConfiguration {
+        std::vector<std::vector<std::size_t>> input_shapes;
+
+        /*
+         * [0, split_axis) = outer range
+         * [split_axis, -1] = inner range
+         *
+         * for each location in the outer range, all the values in the inner range are normalized as a group
+         */
+        std::size_t split_axis;
+
+        /* The group (described above) is always centered. The following parameter controls whether the variance
+         * is also normalized.
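+         *
+         * e.g. with split_axis = 2 and an input of shape [N, C, H, W], there are N x C groups and each
+         * group of H x W values is centered; when normalize_variance is true, each group is also scaled
+         * by the reciprocal of its standard deviation (stabilized by epsilon).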
+         */
+        bool normalize_variance;
+        float epsilon;
+    };
+
+    template <class T>
+    class MVNOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        MVNOp(csl::Stream stream_, const MVNConfiguration& config)
+            : stream(std::move(stream_))
+        {
+            split_axis = config.split_axis;
+            normalize_variance = config.normalize_variance;
+            epsilon = config.epsilon;
+
+            std::size_t max_outer_size = 0;
+            const auto& input_shapes = config.input_shapes;
+            for (int i = 0; i < input_shapes.size(); i++)
+            {
+                std::size_t outer_size = 1;
+                for (int j = 0; j < split_axis; j++)
+                    outer_size *= input_shapes[i][j];
+                max_outer_size = std::max(max_outer_size, outer_size);
+            }
+
+            csl::WorkspaceBuilder builder;
+            builder.require<float>(max_outer_size);
+            if (normalize_variance)
+                builder.require<float>(max_outer_size);
+            scratch_mem_in_bytes = builder.required_workspace_size();
+        }
+
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == outputs.size());
+
+            for (int i = 0; i < inputs.size(); i++)
+            {
+                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+                auto input = input_wrapper->getView();
+
+                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+                auto output = output_wrapper->getSpan();
+
+                auto outer_size = input.size_range(0, split_axis);
+                auto inner_size = input.size_range(split_axis, input.rank());
+                if (inner_size == 1)
+                {
+                    kernels::fill<T>(stream, output, 0.0f);
+                    return;
+                }
+                else
+                {
+                    auto ws_allocator = csl::WorkspaceAllocator(workspace);
+
+                    auto means = ws_allocator.get_span<float>(outer_size);
+                    kernels::fill<float>(stream, means, 0);
+
+                    if (normalize_variance)
+                    {
+                        auto scales = ws_allocator.get_span<float>(outer_size);
+                        kernels::fill<float>(stream, scales, 0);
+
+                        kernels::reduce_mean_sqr_sum<T>(stream, means, scales, input, inner_size);
+                        kernels::compute_normalization_scale(stream, scales, means, scales, inner_size, epsilon);
+                        kernels::normalize_mean_variance<T>(stream, output, input, means, scales, inner_size);
+                    }
+                    else
+                    {
+                        kernels::reduce_mean<T>(stream, means, input, inner_size);
+                        kernels::normalize_mean<T>(stream, output, input, means, inner_size);
+                    }
+                }
+            }
+        }
+
+        std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }
+
+    private:
+        csl::Stream stream;
+
+        bool normalize_variance;
+        float epsilon;
+        std::size_t split_axis;
+
+        std::size_t scratch_mem_in_bytes;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MVN_HPP */
diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp
index 386446e18f8f..4a096ce19cad 100644
--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@@ -44,6 +44,7 @@
 #include "layers_common.hpp"
 #include "../op_inf_engine.hpp"
 #include "../ie_ngraph.hpp"
+#include "../op_cuda.hpp"
 
 #include <opencv2/dnn/shape_utils.hpp>
 
@@ -52,6 +53,11 @@
 #include "opencl_kernels_dnn.hpp"
 #endif
 
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/mvn.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
 namespace cv
 {
 namespace dnn
@@ -127,7 +133,7 @@ class MVNLayerImpl CV_FINAL : public MVNLayer
             return true;
 #endif
         {
-            return backendId == DNN_BACKEND_OPENCV;
+            return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
         }
     }
 
@@ -399,6 +405,31 @@ class MVNLayerImpl CV_FINAL : public MVNLayer
     }
 #endif // HAVE_DNN_NGRAPH
 
+#ifdef HAVE_CUDA
+    Ptr<BackendNode> initCUDA(
+        void *context_,
+        const std::vector<Ptr<BackendWrapper>>& inputs,
+        const std::vector<Ptr<BackendWrapper>>& outputs
+    ) override
+    {
+        auto context = reinterpret_cast<csl::CSLContext*>(context_);
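+        // the opaque context pointer is a csl::CSLContext; its CUDA stream is handed to the node below
+        // and is the stream on which every kernel of this MVN node is launched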
+
+        cuda4dnn::MVNConfiguration config;
+        config.split_axis = acrossChannels ? 1 : 2;
+        config.normalize_variance = normVariance;
+        config.epsilon = eps;
+        config.input_shapes.resize(inputs.size());
+        for (int i = 0; i < inputs.size(); i++)
+        {
+            auto wrapper = inputs[i].dynamicCast<CUDABackendWrapper>();
+            auto shape = wrapper->getShape();
+            config.input_shapes[i].assign(std::begin(shape), std::end(shape));
+        }
+
+        return make_cuda_node<cuda4dnn::MVNOp>(preferableTarget, std::move(context->stream), config);
+    }
+#endif
+
     virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                            const std::vector<MatShape> &outputs) const CV_OVERRIDE
     {