Merge pull request opencv#25630 from fengyuentau:nary-multi-thread

dnn: parallelize nary elementwise forward implementation & enable related conformance tests opencv#25630 This PR introduces the following changes: - [x] Parallelize binary forward impl - [x] Parallelize ternary forward impl (Where) - [x] Parallelize nary (Operator that can take >=1 operands) - [x] Enable conformance tests if workable ## Performance ### i7-12700K, RAM 64GB, Ubuntu 22.04 ``` Geometric mean (ms) Name of Test opencv opencv opencv perf perf perf core.x64.0606 core.x64.0606 core.x64.0606 vs opencv perf core.x64.0606 (x-factor) NCHW_C_sum::Layer_NaryEltwise::OCV/CPU 16.116 11.161 1.44 NCHW_NCHW_add::Layer_NaryEltwise::OCV/CPU 17.469 11.446 1.53 NCHW_NCHW_div::Layer_NaryEltwise::OCV/CPU 17.531 11.469 1.53 NCHW_NCHW_equal::Layer_NaryEltwise::OCV/CPU 28.653 13.682 2.09 NCHW_NCHW_greater::Layer_NaryEltwise::OCV/CPU 21.899 13.422 1.63 NCHW_NCHW_less::Layer_NaryEltwise::OCV/CPU 21.738 13.185 1.65 NCHW_NCHW_max::Layer_NaryEltwise::OCV/CPU 16.172 11.473 1.41 NCHW_NCHW_mean::Layer_NaryEltwise::OCV/CPU 16.309 11.565 1.41 NCHW_NCHW_min::Layer_NaryEltwise::OCV/CPU 16.166 11.454 1.41 NCHW_NCHW_mul::Layer_NaryEltwise::OCV/CPU 16.157 11.443 1.41 NCHW_NCHW_pow::Layer_NaryEltwise::OCV/CPU 163.459 15.234 10.73 NCHW_NCHW_ref_div::Layer_NaryEltwise::OCV/CPU 10.880 10.868 1.00 NCHW_NCHW_ref_max::Layer_NaryEltwise::OCV/CPU 10.947 11.058 0.99 NCHW_NCHW_ref_min::Layer_NaryEltwise::OCV/CPU 10.948 10.910 1.00 NCHW_NCHW_ref_mul::Layer_NaryEltwise::OCV/CPU 10.874 10.871 1.00 NCHW_NCHW_ref_sum::Layer_NaryEltwise::OCV/CPU 10.971 10.920 1.00 NCHW_NCHW_sub::Layer_NaryEltwise::OCV/CPU 17.546 11.462 1.53 NCHW_NCHW_sum::Layer_NaryEltwise::OCV/CPU 16.175 11.475 1.41 NHWC_C::Layer_NaryEltwise::OCV/CPU 11.339 11.333 1.00 NHWC_H::Layer_NaryEltwise::OCV/CPU 16.154 11.102 1.46 ``` ### Apple M1, RAM 16GB, macOS 14.4.1 ``` Geometric mean (ms) Name of Test opencv opencv opencv perf perf perf core.m1.0606 core.m1.0606.patch core.m1.0606.patch vs opencv perf core.m1.0606 (x-factor) NCHW_C_sum::Layer_NaryEltwise::OCV/CPU 28.418 3.768 7.54 NCHW_NCHW_add::Layer_NaryEltwise::OCV/CPU 6.942 5.679 1.22 NCHW_NCHW_div::Layer_NaryEltwise::OCV/CPU 5.822 5.653 1.03 NCHW_NCHW_equal::Layer_NaryEltwise::OCV/CPU 5.751 5.628 1.02 NCHW_NCHW_greater::Layer_NaryEltwise::OCV/CPU 5.797 5.599 1.04 NCHW_NCHW_less::Layer_NaryEltwise::OCV/CPU 7.272 5.578 1.30 NCHW_NCHW_max::Layer_NaryEltwise::OCV/CPU 5.777 5.562 1.04 NCHW_NCHW_mean::Layer_NaryEltwise::OCV/CPU 5.819 5.559 1.05 NCHW_NCHW_min::Layer_NaryEltwise::OCV/CPU 5.830 5.574 1.05 NCHW_NCHW_mul::Layer_NaryEltwise::OCV/CPU 5.759 5.567 1.03 NCHW_NCHW_pow::Layer_NaryEltwise::OCV/CPU 342.260 74.655 4.58 NCHW_NCHW_ref_div::Layer_NaryEltwise::OCV/CPU 8.338 8.280 1.01 NCHW_NCHW_ref_max::Layer_NaryEltwise::OCV/CPU 8.359 8.309 1.01 NCHW_NCHW_ref_min::Layer_NaryEltwise::OCV/CPU 8.412 8.295 1.01 NCHW_NCHW_ref_mul::Layer_NaryEltwise::OCV/CPU 8.380 8.297 1.01 NCHW_NCHW_ref_sum::Layer_NaryEltwise::OCV/CPU 8.356 8.323 1.00 NCHW_NCHW_sub::Layer_NaryEltwise::OCV/CPU 6.818 5.561 1.23 NCHW_NCHW_sum::Layer_NaryEltwise::OCV/CPU 5.805 5.570 1.04 NHWC_C::Layer_NaryEltwise::OCV/CPU 3.834 4.817 0.80 NHWC_H::Layer_NaryEltwise::OCV/CPU 28.402 3.771 7.53 ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
jiayu-123 · Jul 3, 2024 · a7fd944 · a7fd944
1 parent a8d1373
commit a7fd944
Show file tree

Hide file tree

Showing 14 changed files with 563 additions and 404 deletions.
diff --git a/modules/dnn/src/cuda/eltwise_ops.cu b/modules/dnn/src/cuda/eltwise_ops.cu
@@ -350,6 +350,11 @@ void eltwise_fmod_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x,
     eltwise_op<T, FModFunctor<T>>(stream, output, x, y);
 }
 
+template <class T>
+void eltwise_pow_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
+    eltwise_op<T, PowFunctor<T>>(stream, output, x, y);
+}
+
 #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
     template void eltwise_mod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
     template void eltwise_fmod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
@@ -360,6 +365,7 @@ void eltwise_fmod_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x,
     template void eltwise_sum_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
     template void eltwise_max_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
     template void eltwise_min_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
+    template void eltwise_pow_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
 #endif
     template void eltwise_mod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
     template void eltwise_fmod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
@@ -370,5 +376,6 @@ void eltwise_fmod_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x,
     template void eltwise_sum_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
     template void eltwise_max_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
     template void eltwise_min_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
+    template void eltwise_pow_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
 
 }}}} /* namespace cv::dnn::cuda4dnn::kernels */
diff --git a/modules/dnn/src/cuda/functors.hpp b/modules/dnn/src/cuda/functors.hpp
@@ -833,6 +833,21 @@ struct FModFunctor {
     }
 };
 
+template <class T>
+struct PowFunctor {
+    struct Params {
+        CUDA4DNN_HOST_DEVICE Params() {}
+    };
+
+    CUDA4DNN_DEVICE PowFunctor() { }
+    CUDA4DNN_DEVICE PowFunctor(const Params& params) { }
+
+    CUDA4DNN_DEVICE T operator()(T x, T y) {
+        using csl::device::pow;
+        return pow(x, y);
+    }
+};
+
 }}}} /* namespace cv::dnn::cuda4dnn::kernels */
 
 #endif /* OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP */
diff --git a/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp b/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp
@@ -39,6 +39,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     template <class T>
     void eltwise_fmod_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);
 
+    template <class T>
+    void eltwise_pow_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);
+
 }}}} /* namespace cv::dnn::cuda4dnn::kernels */
 
 #endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP */
diff --git a/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp b/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp
@@ -30,6 +30,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
         SUB,
         MOD,
         FMOD,
+        POW,
     };
 
     class EltwiseOpBase : public CUDABackendNode {
@@ -62,7 +63,6 @@ namespace cv { namespace dnn { namespace cuda4dnn {
             const std::vector<cv::Ptr<BackendWrapper>>& outputs,
             csl::Workspace& workspace) override
         {
-            CV_Assert(inputs.size() >= 2);
             CV_Assert(outputs.size() == 1);
 
             CV_Assert(coeffs.size() == 0 || op == EltwiseOpType::SUM);
@@ -94,9 +94,13 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                 case EltwiseOpType::SUB: kernels::eltwise_sub_2<T>(stream, output, input_x, input_y); break;
                 case EltwiseOpType::MOD: kernels::eltwise_mod_2<T>(stream, output, input_x, input_y); break;
                 case EltwiseOpType::FMOD: kernels::eltwise_fmod_2<T>(stream, output, input_x, input_y); break;
+                case EltwiseOpType::POW: kernels::eltwise_pow_2<T>(stream, output, input_x, input_y); break;
                 }
-            }
-            else
+            } else if (inputs.size() == 1) {
+                auto input_wrapper_0 = inputs[0].dynamicCast<wrapper_type>();
+                auto input_0 = input_wrapper_0->getView();
+                csl::tensor_ops::copy(stream, output, input_0);
+            } else
             {
                 auto input_wrapper_0 = inputs[0].dynamicCast<wrapper_type>();
                 auto input_0 = input_wrapper_0->getView();
@@ -128,6 +132,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                     case EltwiseOpType::SUB: kernels::eltwise_sub_2<T>(stream, output, output, input); break;
                     case EltwiseOpType::MOD: kernels::eltwise_mod_2<T>(stream, output, output, input); break;
                     case EltwiseOpType::FMOD: kernels::eltwise_fmod_2<T>(stream, output, output, input); break;
+                    case EltwiseOpType::POW: kernels::eltwise_pow_2<T>(stream, output, output, input); break;
                     }
                 }
             }