Merge remote-tracking branch 'upstream/3.4' into merge-3.4

tigertong · Sep 25, 2019 · e2a5a6a · e2a5a6a
2 parents 9f4f900 + 7ce9428
commit e2a5a6a
Show file tree

Hide file tree

Showing 50 changed files with 5,957 additions and 163 deletions.
diff --git a/doc/js_tutorials/js_setup/js_setup/js_setup.markdown b/doc/js_tutorials/js_setup/js_setup/js_setup.markdown
@@ -107,6 +107,90 @@ Building OpenCV.js from Source
     @note
     It requires `node` installed in your development environment.
 
+-#  [optional] To build `opencv.js` with threads optimization, append `--threads` option.
+
+    For example:
+    @code{.bash}
+    python ./platforms/js/build_js.py build_js --build_wasm --threads
+    @endcode
+
+    The default threads number is the logic core number of your device. You can use `cv.parallel_pthreads_set_threads_num(number)` to set threads number by yourself and use `cv.parallel_pthreads_get_threads_num()` to get the current threads number.
+
+    @note
+    You should build wasm version of `opencv.js` if you want to enable this optimization. And the threads optimization only works in browser, not in node.js. You need to enable the `WebAssembly threads support` feature first with your browser. For example, if you use Chrome, please enable this flag in chrome://flags.
+
+-#  [optional] To build `opencv.js` with wasm simd optimization, append `--simd` option.
+
+    For example:
+    @code{.bash}
+    python ./platforms/js/build_js.py build_js --build_wasm --simd
+    @endcode
+
+    The simd optimization is experimental as wasm simd is still in development.
+
+    @note
+    Now only emscripten LLVM upstream backend supports wasm simd, refering to https://emscripten.org/docs/porting/simd.html. So you need to setup upstream backend environment with the following command first:
+    @code{.bash}
+    ./emsdk update
+    ./emsdk install latest-upstream
+    ./emsdk activate latest-upstream
+    source ./emsdk_env.sh
+    @endcode
+
+    @note
+    You should build wasm version of `opencv.js` if you want to enable this optimization. For browser, you need to enable the `WebAssembly SIMD support` feature first. For example, if you use Chrome, please enable this flag in chrome://flags. For Node.js, you need to run script with flag `--experimental-wasm-simd`.
+
+    @note
+    The simd version of `opencv.js` built by latest LLVM upstream may not work with the stable browser or old version of Node.js. Please use the latest version of unstable browser or Node.js to get new features, like `Chrome Dev`.
+
+-#  [optional] To build wasm intrinsics tests, append `--build_wasm_intrin_test` option.
+
+    For example:
+    @code{.bash}
+    python ./platforms/js/build_js.py build_js --build_wasm --simd --build_wasm_intrin_test
+    @endcode
+
+    For wasm intrinsics tests, you can use the following function to test all the cases:
+    @code{.js}
+    cv.test_hal_intrin_all()
+    @endcode
+
+    And the failed cases will be logged in the JavaScript debug console.
+
+    If you only want to test single data type of wasm intrinsics, you can use the following functions:
+    @code{.js}
+    cv.test_hal_intrin_uint8()
+    cv.test_hal_intrin_int8()
+    cv.test_hal_intrin_uint16()
+    cv.test_hal_intrin_int16()
+    cv.test_hal_intrin_uint32()
+    cv.test_hal_intrin_int32()
+    cv.test_hal_intrin_uint64()
+    cv.test_hal_intrin_int64()
+    cv.test_hal_intrin_float32()
+    cv.test_hal_intrin_float64()
+    @endcode
+
+-#  [optional] To build performance tests, append `--build_perf` option.
+
+    For example:
+    @code{.bash}
+    python ./platforms/js/build_js.py build_js --build_perf
+    @endcode
+
+    To run performance tests, launch a local web server in \<build_dir\>/bin folder. For example, node http-server which serves on `localhost:8080`.
+
+    There are some kernels now in the performance test like `cvtColor`, `resize` and `threshold`. For example, if you want to test `threshold`, please navigate the web browser to `http://localhost:8080/perf/perf_imgproc/perf_threshold.html`. You need to input the test parameter like `(1920x1080, CV_8UC1, THRESH_BINARY)`, and then click the `Run` button to run the case. And if you don't input the parameter, it will run all the cases of this kernel.
+
+    You can also run tests using Node.js.
+
+    For example, run `threshold` with parameter `(1920x1080, CV_8UC1, THRESH_BINARY)`:
+    @code{.sh}
+    cd bin/perf
+    npm install
+    node perf_threshold.js --test_param_filter="(1920x1080, CV_8UC1, THRESH_BINARY)"
+    @endcode
+
 Building OpenCV.js with Docker
 ---------------------------------------
 

diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
@@ -157,6 +157,11 @@
 #  define CV_MSA 1
 #endif
 
+#ifdef __EMSCRIPTEN__
+#  define CV_WASM_SIMD 1
+#  include <wasm_simd128.h>
+#endif
+
 #endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__
 
 #if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX
@@ -328,3 +333,7 @@ struct VZeroUpperGuard {
 #ifndef CV_MSA
 #  define CV_MSA 0
 #endif
+
+#ifndef CV_WASM_SIMD
+#  define CV_WASM_SIMD 0
+#endif
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
@@ -695,6 +695,7 @@ __CV_ENUM_FLAGS_BITWISE_XOR_EQ   (EnumType, EnumType)
 #endif
 
 #define CV_CXX_MOVE_SEMANTICS 1
+#define CV_CXX_MOVE(x) std::move(x)
 #define CV_CXX_STD_ARRAY 1
 #include <array>
 #ifndef CV_OVERRIDE

diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -168,7 +168,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 #   undef CV_MSA
 #endif
 
-#if CV_SSE2 || CV_NEON || CV_VSX || CV_MSA
+#if CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD
 #define CV__SIMD_FORWARD 128
 #include "opencv2/core/hal/intrin_forward.hpp"
 #endif
@@ -190,6 +190,9 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 
 #include "opencv2/core/hal/intrin_msa.hpp"
 
+#elif CV_WASM_SIMD
+#include "opencv2/core/hal/intrin_wasm.hpp"
+
 #else
 
 #define CV_SIMD128_CPP 1

diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -1241,6 +1241,11 @@ inline int v_signmask(const v_int32x8& a)
 inline int v_signmask(const v_uint32x8& a)
 { return v_signmask(v_reinterpret_as_f32(a)); }
 
+inline int v_signmask(const v_int64x4& a)
+{ return v_signmask(v_reinterpret_as_f64(a)); }
+inline int v_signmask(const v_uint64x4& a)
+{ return v_signmask(v_reinterpret_as_f64(a)); }
+
 inline int v_scan_forward(const v_int8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
 inline int v_scan_forward(const v_uint8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
 inline int v_scan_forward(const v_int16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
@@ -1253,40 +1258,23 @@ inline int v_scan_forward(const v_uint64x4& a) { return trailingZeros32(v_signma
 inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
 
 /** Checks **/
-#define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, and_op, allmask)  \
-    inline bool v_check_all(const _Tpvec& a)                \
-    {                                                       \
-        int mask = v_signmask(v_reinterpret_as_s8(a));      \
-        return and_op(mask, allmask) == allmask;            \
-    }                                                       \
-    inline bool v_check_any(const _Tpvec& a)                \
-    {                                                       \
-        int mask = v_signmask(v_reinterpret_as_s8(a));      \
-        return and_op(mask, allmask) != 0;                  \
-    }
-
-OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32,  OPENCV_HAL_1ST, -1)
-OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32,   OPENCV_HAL_1ST, -1)
-OPENCV_HAL_IMPL_AVX_CHECK(v_uint16x16, OPENCV_HAL_AND, (int)0xaaaaaaaa)
-OPENCV_HAL_IMPL_AVX_CHECK(v_int16x16,  OPENCV_HAL_AND, (int)0xaaaaaaaa)
-OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8,  OPENCV_HAL_AND, (int)0x88888888)
-OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8,   OPENCV_HAL_AND, (int)0x88888888)
-
-#define OPENCV_HAL_IMPL_AVX_CHECK_FLT(_Tpvec, allmask) \
-    inline bool v_check_all(const _Tpvec& a)           \
-    {                                                  \
-        int mask = v_signmask(a);                      \
-        return mask == allmask;                        \
-    }                                                  \
-    inline bool v_check_any(const _Tpvec& a)           \
-    {                                                  \
-        int mask = v_signmask(a);                      \
-        return mask != 0;                              \
-    }
-
-OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float32x8, 255)
-OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float64x4, 15)
-
+#define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, allmask) \
+    inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \
+    inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }
+OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32, -1)
+OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32, -1)
+OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8, 255)
+OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8, 255)
+OPENCV_HAL_IMPL_AVX_CHECK(v_uint64x4, 15)
+OPENCV_HAL_IMPL_AVX_CHECK(v_int64x4, 15)
+OPENCV_HAL_IMPL_AVX_CHECK(v_float32x8, 255)
+OPENCV_HAL_IMPL_AVX_CHECK(v_float64x4, 15)
+
+#define OPENCV_HAL_IMPL_AVX_CHECK_SHORT(_Tpvec)  \
+    inline bool v_check_all(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) == 0xaaaaaaaa; } \
+    inline bool v_check_any(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) != 0; }
+OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_uint16x16)
+OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16)
 
 ////////// Other math /////////
 

diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -1080,7 +1080,7 @@ Returned value is a bit mask with bits set to 1 on places corresponding to negat
 v_int32x4 r; // set to {-1, -1, 1, 1}
 int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011
 @endcode
-For all types except 64-bit. */
+*/
 template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
 {
     int mask = 0;
@@ -1109,7 +1109,7 @@ template <typename _Tp, int n> inline int v_scan_forward(const v_reg<_Tp, n>& a)
 /** @brief Check if all packed values are less than zero
 
 Unsigned values will be casted to signed: `uchar 254 => char -2`.
-For all types except 64-bit. */
+*/
 template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
 {
     for( int i = 0; i < n; i++ )
@@ -1121,7 +1121,7 @@ template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
 /** @brief Check if any of packed values is less than zero
 
 Unsigned values will be casted to signed: `uchar 254 => char -2`.
-For all types except 64-bit. */
+*/
 template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
 {
     for( int i = 0; i < n; i++ )

diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -1139,9 +1139,17 @@ inline bool v_check_any(const v_##_Tpvec& a) \
 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
-#if CV_SIMD128_64F
-OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint64x2, u64, 63)
-#endif
+
+inline bool v_check_all(const v_uint64x2& a)
+{
+    uint64x2_t v0 = vshrq_n_u64(a.val, 63);
+    return (vgetq_lane_u64(v0, 0) & vgetq_lane_u64(v0, 1)) == 1;
+}
+inline bool v_check_any(const v_uint64x2& a)
+{
+    uint64x2_t v0 = vshrq_n_u64(a.val, 63);
+    return (vgetq_lane_u64(v0, 0) | vgetq_lane_u64(v0, 1)) != 0;
+}
 
 inline bool v_check_all(const v_int8x16& a)
 { return v_check_all(v_reinterpret_as_u8(a)); }
@@ -1161,13 +1169,13 @@ inline bool v_check_any(const v_int32x4& a)
 inline bool v_check_any(const v_float32x4& a)
 { return v_check_any(v_reinterpret_as_u32(a)); }
 
-#if CV_SIMD128_64F
 inline bool v_check_all(const v_int64x2& a)
 { return v_check_all(v_reinterpret_as_u64(a)); }
-inline bool v_check_all(const v_float64x2& a)
-{ return v_check_all(v_reinterpret_as_u64(a)); }
 inline bool v_check_any(const v_int64x2& a)
 { return v_check_any(v_reinterpret_as_u64(a)); }
+#if CV_SIMD128_64F
+inline bool v_check_all(const v_float64x2& a)
+{ return v_check_all(v_reinterpret_as_u64(a)); }
 inline bool v_check_any(const v_float64x2& a)
 { return v_check_any(v_reinterpret_as_u64(a)); }
 #endif

diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -1591,31 +1591,25 @@ inline v_uint32x4 v_popcount(const v_int32x4& a)
 inline v_uint64x2 v_popcount(const v_int64x2& a)
 { return v_popcount(v_reinterpret_as_u64(a)); }
 
-#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
-inline int v_signmask(const _Tpvec& a) \
-{ \
-    return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \
-} \
-inline bool v_check_all(const _Tpvec& a) \
-{ return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \
-inline bool v_check_any(const _Tpvec& a) \
-{ return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; }
-
-#define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a)
-inline __m128i v_packq_epi32(__m128i a)
-{
-    __m128i b = _mm_packs_epi32(a, a);
-    return _mm_packs_epi16(b, b);
-}
-
-OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
-OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
-OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
-OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
-OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
-OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
-OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15)
-OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3)
+#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, cast_op, allmask) \
+inline int v_signmask(const _Tpvec& a)   { return _mm_movemask_##suffix(cast_op(a.val)); } \
+inline bool v_check_all(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) == allmask; } \
+inline bool v_check_any(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) != 0; }
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, 65535)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, 65535)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, ps, _mm_castsi128_ps, 15)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, ps, _mm_castsi128_ps, 15)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint64x2, pd, _mm_castsi128_pd, 3)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int64x2, pd, _mm_castsi128_pd, 3)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, 15)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, 3)
+
+#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(_Tpvec) \
+inline int v_signmask(const _Tpvec& a) { return _mm_movemask_epi8(_mm_packs_epi16(a.val, a.val)) & 255; } \
+inline bool v_check_all(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) == 0xaaaa; } \
+inline bool v_check_any(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) != 0; }
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_uint16x8)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_int16x8)
 
 inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
 inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }

diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -899,6 +899,8 @@ inline bool v_check_all(const v_uint16x8& a)
 { return v_check_all(v_reinterpret_as_s16(a)); }
 inline bool v_check_all(const v_uint32x4& a)
 { return v_check_all(v_reinterpret_as_s32(a)); }
+inline bool v_check_all(const v_uint64x2& a)
+{ return v_check_all(v_reinterpret_as_s64(a)); }
 inline bool v_check_all(const v_float32x4& a)
 { return v_check_all(v_reinterpret_as_s32(a)); }
 inline bool v_check_all(const v_float64x2& a)
@@ -913,6 +915,8 @@ inline bool v_check_any(const v_uint16x8& a)
 { return v_check_any(v_reinterpret_as_s16(a)); }
 inline bool v_check_any(const v_uint32x4& a)
 { return v_check_any(v_reinterpret_as_s32(a)); }
+inline bool v_check_any(const v_uint64x2& a)
+{ return v_check_any(v_reinterpret_as_s64(a)); }
 inline bool v_check_any(const v_float32x4& a)
 { return v_check_any(v_reinterpret_as_s32(a)); }
 inline bool v_check_any(const v_float64x2& a)