Merge pull request opencv#25872 from fengyuentau:core/v_erf

core: add v_erf opencv#25872

This patch adds v_erf, which is needed by opencv#25147.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
jiayu-123 · Jul 5, 2024 · d30b945 · d30b945
1 parent 88b28ee
commit d30b945
Show file tree

Hide file tree

Showing 3 changed files with 97 additions and 1 deletion.
diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -263,7 +263,8 @@ Most of these operations return only one value.
 
 ### Other math
 
-- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude, @ref v_exp
+- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude, @ref v_exp,
+                            @ref v_erf
 - Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs
 
 ### Conversions
@@ -761,6 +762,13 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
 OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
 #define OPENCV_HAL_MATH_HAVE_LOG 1
 
+/**
+ * @brief Error function erf(x), instantiated here from std::erf via OPENCV_HAL_IMPL_MATH_FUNC.
+ *
+ * @note Only FP32 precision is supported for now.
+ */
+OPENCV_HAL_IMPL_MATH_FUNC(v_erf, std::erf, _Tp)
+
 //! @cond IGNORED
 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
 #define OPENCV_HAL_MATH_HAVE_SIN 1

diff --git a/modules/core/include/opencv2/core/hal/intrin_math.hpp b/modules/core/include/opencv2/core/hal/intrin_math.hpp
@@ -418,5 +418,50 @@ namespace CV__SIMD_NAMESPACE {
 #define OPENCV_HAL_MATH_HAVE_LOG 1
 //! @}
 #endif
+
+/* This implementation is derived from PyTorch's approximation of the error function (erf):
+   https://github.com/pytorch/pytorch/blob/9c50ecc84b9a6e699a7f058891b889aafbf976c7/aten/src/ATen/cpu/vec/vec512/vec512_float.h#L189-L220
+*/
+
+#ifndef OPENCV_HAL_MATH_HAVE_ERF
+
+//! @name Error Function
+//! @{
+
+    inline v_float32 v_erf(v_float32 v) { // Lane-wise FP32 erf(v): evaluates 1 - r(t) * t * exp(-v^2) on |v|, then re-applies sign(v)
+        const v_float32 coef0 = vx_setall_f32(0.3275911f),   // scale of |v| in t = 1 / (1 + coef0 * |v|); constants match Abramowitz & Stegun eq. 7.1.26
+                        coef1 = vx_setall_f32(1.061405429f), // coef1..coef5: polynomial coefficients, highest power of t first
+                        coef2 = vx_setall_f32(-1.453152027f),
+                        coef3 = vx_setall_f32(1.421413741f),
+                        coef4 = vx_setall_f32(-0.284496736f),
+                        coef5 = vx_setall_f32(0.254829592f),
+                        ones = vx_setall_f32(1.0f),
+                        neg_zeros = vx_setall_f32(-0.f); // -0.f: used as a sign-bit mask for the v_and / v_xor calls below
+        v_float32 t = v_abs(v); // |v|
+        // Sign bit of each lane of v; XOR-ed back into the result at the end.
+        v_float32 sign_mask = v_and(neg_zeros, v);
+
+        t = v_div(ones, v_fma(coef0, t, ones)); // t = 1 / (1 + coef0 * |v|), with v_fma(a, b, c) taken as a * b + c
+        v_float32 r = v_fma(coef1, t, coef2); // Horner scheme: r = coef1 * t + coef2
+        r = v_fma(r, t, coef3); // r = r * t + coef3
+        r = v_fma(r, t, coef4); // r = r * t + coef4
+        r = v_fma(r, t, coef5); // r = r * t + coef5
+        // -(v * v): square each lane, then flip the sign bit.
+        v_float32 pow_2 = v_mul(v, v);
+        v_float32 neg_pow_2 = v_xor(neg_zeros, pow_2);
+        // -exp(-(v * v)): v_exp of the negated square, then flip the sign bit again.
+        v_float32 exp = v_exp(neg_pow_2);
+        v_float32 neg_exp = v_xor(neg_zeros, exp);
+        v_float32 res = v_mul(t, neg_exp); // res = -t * exp(-v^2)
+        res = v_fma(r, res, ones); // res = 1 - r * t * exp(-v^2), the approximation of erf(|v|)
+        return v_xor(sign_mask, res); // restore the sign of v, so negative inputs give negated results
+    }
+
+#define OPENCV_HAL_MATH_HAVE_ERF 1
+//! @}
+
+#endif // OPENCV_HAL_MATH_HAVE_ERF
+
+
 }
 #endif  // OPENCV_HAL_INTRIN_HPP
diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
@@ -1864,6 +1864,48 @@ template<typename R> struct TheTest
 #endif
         return *this;
     }
+
+    TheTest &test_erf_fp32() { // Checks v_erf against std::erf on random inputs mixed with +/-INFINITY and NAN; returns *this for chaining
+        int n = VTraits<R>::vlanes(); // number of lanes filled and checked per iteration
+
+        constexpr int num_loops = 10000;
+        const std::vector<LaneType> singular_inputs{INFINITY, -INFINITY, NAN};
+        constexpr double insert_singular_input_probability = 0.1; // chance that a lane receives one of singular_inputs
+        cv::RNG_MT19937 rng;
+
+        for (int i = 0; i < num_loops; i++) {
+            Data<R> inputs;
+            for (int j = 0; j < n; j++) {
+                if (rng.uniform(0.f, 1.f) <= insert_singular_input_probability) {
+                    int singular_input_index = rng.uniform(0, int(singular_inputs.size())); // presumably drawn from [0, size) -- confirm upper bound is exclusive
+                    inputs[j] = singular_inputs[singular_input_index];
+                } else {
+                    // std::exp(float) overflows at about 88.0f, and v_erf evaluates exp(-(input*input)).
+                    // The test range is therefore limited to about [-sqrt(88.0f), sqrt(88.0f)], i.e. +/-9.4f.
+                    inputs[j] = (LaneType) rng.uniform(-9.4f, 9.4f);
+                }
+            }
+
+            Data<R> outputs = v_erf(R(inputs));
+            for (int j = 0; j < n; j++) {
+                SCOPED_TRACE(cv::format("Random test value: %f", inputs[j]));
+                if (std::isinf(inputs[j])) {
+                    if (inputs[j] < 0) {
+                        EXPECT_EQ(-1, outputs[j]); // -INFINITY must map to exactly -1
+                    } else {
+                        EXPECT_EQ(1, outputs[j]); // +INFINITY must map to exactly 1
+                    }
+                } else if (std::isnan(inputs[j])) {
+                    EXPECT_TRUE(std::isnan(outputs[j])); // NaN input must produce NaN output
+                } else {
+                    LaneType ref_output = std::erf(inputs[j]); // scalar reference value
+                    EXPECT_LT(std::abs(outputs[j] - ref_output), 1e-3f * (std::abs(ref_output) + FLT_MIN * 1e4f)); // relative tolerance 1e-3; the FLT_MIN * 1e4f term keeps the bound non-zero when ref_output == 0
+                }
+            }
+        }
+
+        return *this;
+    }
 };
 
 #define DUMP_ENTRY(type) printf("SIMD%d: %s\n", 8*VTraits<v_uint8>::vlanes(), CV__TRACE_FUNCTION);
@@ -2179,6 +2221,7 @@ void test_hal_intrin_float32()
         .test_pack_triplets()
         .test_exp_fp32()
         .test_log_fp32()
+        .test_erf_fp32()
 #if CV_SIMD_WIDTH == 32
         .test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
         .test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>()