Skip to content

Commit

Permalink
Merge pull request opencv#25630 from fengyuentau:nary-multi-thread
Browse files Browse the repository at this point in the history
dnn: parallelize nary elementwise forward implementation & enable related conformance tests opencv#25630

This PR introduces the following changes:

- [x] Parallelize binary forward impl
- [x] Parallelize ternary forward impl (Where)
- [x] Parallelize nary forward impl (operators that can take >= 1 operands)
- [x] Enable conformance tests if workable

## Performance

### i7-12700K, RAM 64GB, Ubuntu 22.04

```
Geometric mean (ms)

                Name of Test                     opencv        opencv        opencv
                                                  perf          perf          perf
                                              core.x64.0606 core.x64.0606 core.x64.0606
                                                                               vs
                                                                             opencv
                                                                              perf
                                                                          core.x64.0606
                                                                           (x-factor)
NCHW_C_sum::Layer_NaryEltwise::OCV/CPU           16.116        11.161         1.44
NCHW_NCHW_add::Layer_NaryEltwise::OCV/CPU        17.469        11.446         1.53
NCHW_NCHW_div::Layer_NaryEltwise::OCV/CPU        17.531        11.469         1.53
NCHW_NCHW_equal::Layer_NaryEltwise::OCV/CPU      28.653        13.682         2.09
NCHW_NCHW_greater::Layer_NaryEltwise::OCV/CPU    21.899        13.422         1.63
NCHW_NCHW_less::Layer_NaryEltwise::OCV/CPU       21.738        13.185         1.65
NCHW_NCHW_max::Layer_NaryEltwise::OCV/CPU        16.172        11.473         1.41
NCHW_NCHW_mean::Layer_NaryEltwise::OCV/CPU       16.309        11.565         1.41
NCHW_NCHW_min::Layer_NaryEltwise::OCV/CPU        16.166        11.454         1.41
NCHW_NCHW_mul::Layer_NaryEltwise::OCV/CPU        16.157        11.443         1.41
NCHW_NCHW_pow::Layer_NaryEltwise::OCV/CPU        163.459       15.234         10.73
NCHW_NCHW_ref_div::Layer_NaryEltwise::OCV/CPU    10.880        10.868         1.00
NCHW_NCHW_ref_max::Layer_NaryEltwise::OCV/CPU    10.947        11.058         0.99
NCHW_NCHW_ref_min::Layer_NaryEltwise::OCV/CPU    10.948        10.910         1.00
NCHW_NCHW_ref_mul::Layer_NaryEltwise::OCV/CPU    10.874        10.871         1.00
NCHW_NCHW_ref_sum::Layer_NaryEltwise::OCV/CPU    10.971        10.920         1.00
NCHW_NCHW_sub::Layer_NaryEltwise::OCV/CPU        17.546        11.462         1.53
NCHW_NCHW_sum::Layer_NaryEltwise::OCV/CPU        16.175        11.475         1.41
NHWC_C::Layer_NaryEltwise::OCV/CPU               11.339        11.333         1.00
NHWC_H::Layer_NaryEltwise::OCV/CPU               16.154        11.102         1.46
```

### Apple M1, RAM 16GB, macOS 14.4.1

```
Geometric mean (ms)

                Name of Test                     opencv          opencv             opencv      
                                                  perf            perf               perf       
                                              core.m1.0606 core.m1.0606.patch core.m1.0606.patch
                                                                                      vs        
                                                                                    opencv      
                                                                                     perf       
                                                                                 core.m1.0606   
                                                                                  (x-factor)    
NCHW_C_sum::Layer_NaryEltwise::OCV/CPU           28.418          3.768               7.54       
NCHW_NCHW_add::Layer_NaryEltwise::OCV/CPU        6.942           5.679               1.22       
NCHW_NCHW_div::Layer_NaryEltwise::OCV/CPU        5.822           5.653               1.03       
NCHW_NCHW_equal::Layer_NaryEltwise::OCV/CPU      5.751           5.628               1.02       
NCHW_NCHW_greater::Layer_NaryEltwise::OCV/CPU    5.797           5.599               1.04       
NCHW_NCHW_less::Layer_NaryEltwise::OCV/CPU       7.272           5.578               1.30       
NCHW_NCHW_max::Layer_NaryEltwise::OCV/CPU        5.777           5.562               1.04       
NCHW_NCHW_mean::Layer_NaryEltwise::OCV/CPU       5.819           5.559               1.05       
NCHW_NCHW_min::Layer_NaryEltwise::OCV/CPU        5.830           5.574               1.05       
NCHW_NCHW_mul::Layer_NaryEltwise::OCV/CPU        5.759           5.567               1.03       
NCHW_NCHW_pow::Layer_NaryEltwise::OCV/CPU       342.260          74.655              4.58       
NCHW_NCHW_ref_div::Layer_NaryEltwise::OCV/CPU    8.338           8.280               1.01       
NCHW_NCHW_ref_max::Layer_NaryEltwise::OCV/CPU    8.359           8.309               1.01       
NCHW_NCHW_ref_min::Layer_NaryEltwise::OCV/CPU    8.412           8.295               1.01       
NCHW_NCHW_ref_mul::Layer_NaryEltwise::OCV/CPU    8.380           8.297               1.01       
NCHW_NCHW_ref_sum::Layer_NaryEltwise::OCV/CPU    8.356           8.323               1.00       
NCHW_NCHW_sub::Layer_NaryEltwise::OCV/CPU        6.818           5.561               1.23       
NCHW_NCHW_sum::Layer_NaryEltwise::OCV/CPU        5.805           5.570               1.04       
NHWC_C::Layer_NaryEltwise::OCV/CPU               3.834           4.817               0.80       
NHWC_H::Layer_NaryEltwise::OCV/CPU               28.402          3.771               7.53
```

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under the GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There are accuracy tests, performance tests and test data in the opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
  • Loading branch information
fengyuentau authored Jul 3, 2024
1 parent a8d1373 commit a7fd944
Show file tree
Hide file tree
Showing 14 changed files with 563 additions and 404 deletions.
7 changes: 7 additions & 0 deletions modules/dnn/src/cuda/eltwise_ops.cu
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,11 @@ void eltwise_fmod_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x,
eltwise_op<T, FModFunctor<T>>(stream, output, x, y);
}

template <class T>
void eltwise_pow_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
eltwise_op<T, PowFunctor<T>>(stream, output, x, y);
}

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void eltwise_mod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
template void eltwise_fmod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
Expand All @@ -360,6 +365,7 @@ void eltwise_fmod_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x,
template void eltwise_sum_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
template void eltwise_max_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
template void eltwise_min_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
template void eltwise_pow_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
#endif
template void eltwise_mod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
template void eltwise_fmod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
Expand All @@ -370,5 +376,6 @@ void eltwise_fmod_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x,
template void eltwise_sum_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
template void eltwise_max_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
template void eltwise_min_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
template void eltwise_pow_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
15 changes: 15 additions & 0 deletions modules/dnn/src/cuda/functors.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -833,6 +833,21 @@ struct FModFunctor {
}
};

/* Binary functor returning pow(x, y) for a pair of elements of type T.
 * The functor has no data members; Params is an empty type that only
 * exists so the functor can be constructed from a Params object.
 */
template <class T>
struct PowFunctor {
    /* Empty parameter type: nothing to configure for this functor. */
    struct Params {
        CUDA4DNN_HOST_DEVICE Params() {}
    };

    CUDA4DNN_DEVICE PowFunctor() {}
    CUDA4DNN_DEVICE PowFunctor(const Params& params) {}

    /* Computes x raised to the power y. The using-declaration makes
     * csl::device::pow visible to the unqualified call below.
     */
    CUDA4DNN_DEVICE T operator()(T x, T y) {
        using csl::device::pow;
        const T result = pow(x, y);
        return result;
    }
};

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP */
3 changes: 3 additions & 0 deletions modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template <class T>
void eltwise_fmod_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);

/* Binary element-wise power of x and y, written into `output` on `stream`.
 * Declaration only; the definition and the explicit instantiations
 * (float, and __half where the architecture guard allows it) are in
 * modules/dnn/src/cuda/eltwise_ops.cu.
 */
template <class T>
void eltwise_pow_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP */
11 changes: 8 additions & 3 deletions modules/dnn/src/cuda4dnn/primitives/eltwise.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
SUB,
MOD,
FMOD,
POW,
};

class EltwiseOpBase : public CUDABackendNode {
Expand Down Expand Up @@ -62,7 +63,6 @@ namespace cv { namespace dnn { namespace cuda4dnn {
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
csl::Workspace& workspace) override
{
CV_Assert(inputs.size() >= 2);
CV_Assert(outputs.size() == 1);

CV_Assert(coeffs.size() == 0 || op == EltwiseOpType::SUM);
Expand Down Expand Up @@ -94,9 +94,13 @@ namespace cv { namespace dnn { namespace cuda4dnn {
case EltwiseOpType::SUB: kernels::eltwise_sub_2<T>(stream, output, input_x, input_y); break;
case EltwiseOpType::MOD: kernels::eltwise_mod_2<T>(stream, output, input_x, input_y); break;
case EltwiseOpType::FMOD: kernels::eltwise_fmod_2<T>(stream, output, input_x, input_y); break;
case EltwiseOpType::POW: kernels::eltwise_pow_2<T>(stream, output, input_x, input_y); break;
}
}
else
} else if (inputs.size() == 1) {
auto input_wrapper_0 = inputs[0].dynamicCast<wrapper_type>();
auto input_0 = input_wrapper_0->getView();
csl::tensor_ops::copy(stream, output, input_0);
} else
{
auto input_wrapper_0 = inputs[0].dynamicCast<wrapper_type>();
auto input_0 = input_wrapper_0->getView();
Expand Down Expand Up @@ -128,6 +132,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
case EltwiseOpType::SUB: kernels::eltwise_sub_2<T>(stream, output, output, input); break;
case EltwiseOpType::MOD: kernels::eltwise_mod_2<T>(stream, output, output, input); break;
case EltwiseOpType::FMOD: kernels::eltwise_fmod_2<T>(stream, output, output, input); break;
case EltwiseOpType::POW: kernels::eltwise_pow_2<T>(stream, output, output, input); break;
}
}
}
Expand Down
Loading

0 comments on commit a7fd944

Please sign in to comment.