
Commit 751cee8

Merge pull request opencv#9907 from seiko2plus:vsxFixesImproves
2 parents: 0608227 + 2dc76d5

File tree: 6 files changed, +555 −261 lines

modules/core/include/opencv2/core/hal/intrin_vsx.hpp (+107 −57)
@@ -523,24 +523,25 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
 OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)
 
 /** Bitwise shifts **/
-#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpuvec, splfunc) \
-inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
-{ return _Tpuvec(vec_sl(a.val, splfunc(imm))); } \
-inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
-{ return _Tpuvec(vec_sr(a.val, splfunc(imm))); } \
-template<int imm> inline _Tpuvec v_shl(const _Tpuvec& a) \
-{ return _Tpuvec(vec_sl(a.val, splfunc(imm))); } \
-template<int imm> inline _Tpuvec v_shr(const _Tpuvec& a) \
-{ return _Tpuvec(vec_sr(a.val, splfunc(imm))); }
-
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_uchar16_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_uchar16_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_ushort8_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_ushort8_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_uint4_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_uint4_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_udword2_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_udword2_sp)
+#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
+inline _Tpvec operator << (const _Tpvec& a, int imm) \
+{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
+inline _Tpvec operator >> (const _Tpvec& a, int imm) \
+{ return _Tpvec(shr(a.val, splfunc(imm))); } \
+template<int imm> inline _Tpvec v_shl(const _Tpvec& a) \
+{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
+template<int imm> inline _Tpvec v_shr(const _Tpvec& a) \
+{ return _Tpvec(shr(a.val, splfunc(imm))); }
+
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_sr, vec_uchar16_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_sr, vec_ushort8_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_sr, vec_uint4_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_sr, vec_udword2_sp)
+// algebraic right shift
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_sra, vec_uchar16_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_sra, vec_ushort8_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_sra, vec_uint4_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp)
 
 /** Bitwise logic **/
 #define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \
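
The reworked shift macro takes the right-shift builtin as a parameter: unsigned vectors keep the logical shift (vec_sr), while signed vectors now use the algebraic shift (vec_sra), which propagates the sign bit. A minimal sketch of the visible effect at the universal-intrinsics level (illustrative only, not taken from the patch; assumes a VSX-enabled build of these headers):

    #include <opencv2/core/hal/intrin.hpp>

    void shift_demo()
    {
        cv::v_int16x8  s = cv::v_setall_s16(-16);     // every lane holds 0xFFF0
        cv::v_uint16x8 u = cv::v_setall_u16(0xFFF0);

        cv::v_int16x8  sr = s >> 2;  // algebraic shift: every lane becomes -4
        cv::v_uint16x8 ur = u >> 2;  // logical shift: every lane becomes 0x3FFC
        (void)sr; (void)ur;
    }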
@@ -605,6 +606,64 @@ OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
 OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
 OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)
 
+/** Rotate **/
+#define OPENCV_IMPL_VSX_ROTATE(_Tpvec, suffix, shf, cast) \
+template<int imm> \
+inline _Tpvec v_rotate_##suffix(const _Tpvec& a) \
+{ \
+    const int wd = imm * sizeof(typename _Tpvec::lane_type); \
+    if (wd > 15) \
+        return _Tpvec(); \
+    return _Tpvec((cast)shf(vec_uchar16_c(a.val), vec_uchar16_sp(wd << 3))); \
+}
+
+#define OPENCV_IMPL_VSX_ROTATE_LR(_Tpvec, cast) \
+OPENCV_IMPL_VSX_ROTATE(_Tpvec, left, vec_slo, cast) \
+OPENCV_IMPL_VSX_ROTATE(_Tpvec, right, vec_sro, cast)
+
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint8x16, vec_uchar16)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int8x16, vec_char16)
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8, vec_short8)
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4, vec_int4)
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2, vec_dword2)
+
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
+{
+    const int wd = imm * sizeof(typename _Tpvec::lane_type);
+    if (wd == 0)
+        return a;
+    return _Tpvec(vec_sld(b.val, a.val, 16 - wd));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
+{
+    const int wd = imm * sizeof(typename _Tpvec::lane_type);
+    if (wd == 16)
+        return b;
+    return _Tpvec(vec_sld(a.val, b.val, wd));
+}
+
+#define OPENCV_IMPL_VSX_ROTATE_64(_Tpvec, suffix, rg1, rg2) \
+template<int imm> \
+inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    if (imm == 1) \
+        return _Tpvec(vec_permi(rg1.val, rg2.val, 2)); \
+    return imm ? b : a; \
+}
+
+OPENCV_IMPL_VSX_ROTATE_64(v_int64x2, right, a, b)
+OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, right, a, b)
+
+OPENCV_IMPL_VSX_ROTATE_64(v_int64x2, left, b, a)
+OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, left, b, a)
+
 ////////// Reduce and mask /////////
 
 /** Reduce **/
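
The new rotate block provides VSX implementations of the v_rotate_left / v_rotate_right universal intrinsics: the single-operand forms shift whole lanes and fill the vacated lanes with zeros (byte-wise vec_slo / vec_sro), the two-operand forms pull the incoming lanes from a second vector (vec_sld), and the 64-bit two-operand specializations use vec_permi. A usage sketch, assuming the usual universal-intrinsics lane semantics (illustrative only, not part of the patch):

    #include <opencv2/core/hal/intrin.hpp>

    void rotate_demo(const cv::v_int32x4& a, const cv::v_int32x4& b)
    {
        // a = {a0, a1, a2, a3}, b = {b0, b1, b2, b3}
        cv::v_int32x4 r  = cv::v_rotate_right<1>(a);    // {a1, a2, a3, 0}
        cv::v_int32x4 l  = cv::v_rotate_left<1>(a);     // {0, a0, a1, a2}
        cv::v_int32x4 rb = cv::v_rotate_right<1>(a, b); // {a1, a2, a3, b0}
        (void)r; (void)l; (void)rb;
    }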
@@ -726,7 +785,7 @@ inline int v_signmask(const v_float32x4& a)
 
 inline int v_signmask(const v_int64x2& a)
 {
-    const vec_dword2 sv = vec_sr(a.val, vec_udword2_sp(63));
+    VSX_UNUSED(const vec_dword2) sv = vec_sr(a.val, vec_udword2_sp(63));
     return (int)vec_extract(sv, 0) | (int)vec_extract(sv, 1) << 1;
 }
 inline int v_signmask(const v_uint64x2& a)
@@ -812,66 +871,47 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int64x2, v_uint64x2, vec_udword2_c, v_absdiff, v
 
 /** Rounding **/
 inline v_int32x4 v_round(const v_float32x4& a)
-{ return v_int32x4(vec_cts(vec_round(a.val), 0)); }
+{ return v_int32x4(vec_cts(vec_round(a.val))); }
 
 inline v_int32x4 v_round(const v_float64x2& a)
-{
-    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
-    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_round(a.val)), perm));
-}
+{ return v_int32x4(vec_mergesqo(vec_cts(vec_round(a.val)), vec_int4_z)); }
 
 inline v_int32x4 v_floor(const v_float32x4& a)
-{ return v_int32x4(vec_cts(vec_floor(a.val), 0)); }
+{ return v_int32x4(vec_cts(vec_floor(a.val))); }
 
 inline v_int32x4 v_floor(const v_float64x2& a)
-{
-    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
-    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_floor(a.val)), perm));
-}
+{ return v_int32x4(vec_mergesqo(vec_cts(vec_floor(a.val)), vec_int4_z)); }
 
 inline v_int32x4 v_ceil(const v_float32x4& a)
-{ return v_int32x4(vec_cts(vec_ceil(a.val), 0)); }
+{ return v_int32x4(vec_cts(vec_ceil(a.val))); }
 
 inline v_int32x4 v_ceil(const v_float64x2& a)
-{
-    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
-    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_ceil(a.val)), perm));
-}
+{ return v_int32x4(vec_mergesqo(vec_cts(vec_ceil(a.val)), vec_int4_z)); }
 
 inline v_int32x4 v_trunc(const v_float32x4& a)
-{ return v_int32x4(vec_cts(a.val, 0)); }
+{ return v_int32x4(vec_cts(a.val)); }
 
 inline v_int32x4 v_trunc(const v_float64x2& a)
-{
-    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
-    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(a.val), perm));
-}
+{ return v_int32x4(vec_mergesqo(vec_cts(a.val), vec_int4_z)); }
 
 /** To float **/
 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
-{ return v_float32x4(vec_ctf(a.val, 0)); }
+{ return v_float32x4(vec_ctf(a.val)); }
 
 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
-{
-    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
-    return v_float32x4(vec_perm(vec_float4_z, vec_cvf(a.val), perm));
-}
+{ return v_float32x4(vec_mergesqo(vec_cvf(a.val), vec_float4_z)); }
+
 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
-{
-    return v_float64x2(vec_ctd(vec_mergeh(a.val, a.val), 0));
-}
+{ return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); }
+
 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
-{
-    return v_float64x2(vec_ctd(vec_mergel(a.val, a.val), 0));
-}
+{ return v_float64x2(vec_ctdo(vec_mergel(a.val, a.val))); }
+
 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
-{
-    return v_float64x2(vec_cvf(vec_mergeh(a.val, a.val)));
-}
+{ return v_float64x2(vec_cvfo(vec_mergeh(a.val, a.val))); }
+
 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
-{
-    return v_float64x2(vec_cvf(vec_mergel(a.val, a.val)));
-}
+{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
 
 /** Reinterpret **/
 /** its up there with load and store operations **/
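
The rewritten rounding and float-conversion helpers replace the hand-built vec_perm packing with merge/odd-element conversion wrappers (vec_mergesqo, single-argument vec_cts/vec_ctf, vec_ctdo, vec_cvfo) provided by OpenCV's VSX support code; the observable behaviour at the universal-intrinsics level should be unchanged. A small usage sketch of those intrinsics (illustrative only, not from the patch):

    #include <opencv2/core/hal/intrin.hpp>

    void convert_demo()
    {
        cv::v_float32x4 f = cv::v_setall_f32(2.5f);
        cv::v_int32x4   r = cv::v_round(f);   // round to nearest
        cv::v_int32x4   t = cv::v_trunc(f);   // truncate toward zero
        (void)r; (void)t;

    #if CV_SIMD128_64F
        cv::v_float64x2 lo = cv::v_cvt_f64(f);       // low two lanes widened to double
        cv::v_float64x2 hi = cv::v_cvt_f64_high(f);  // high two lanes widened to double
        cv::v_int32x4   rd = cv::v_round(lo);        // results in lanes 0-1, zeros in lanes 2-3
        (void)hi; (void)rd;
    #endif
    }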
@@ -888,10 +928,20 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
     const vec_float4 v0 = vec_splat(v.val, 0);
     const vec_float4 v1 = vec_splat(v.val, 1);
     const vec_float4 v2 = vec_splat(v.val, 2);
-    const vec_float4 v3 = vec_splat(v.val, 3);
+    VSX_UNUSED(const vec_float4) v3 = vec_splat(v.val, 3);
     return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, vec_mul(v3, m3.val)))));
 }
 
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2,
+                               const v_float32x4& a)
+{
+    const vec_float4 v0 = vec_splat(v.val, 0);
+    const vec_float4 v1 = vec_splat(v.val, 1);
+    const vec_float4 v2 = vec_splat(v.val, 2);
+    return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, a.val))));
+}
+
 #define OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(_Tpvec, _Tpvec2) \
 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
                            const _Tpvec& a2, const _Tpvec& a3, \
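
The newly added v_matmuladd computes v[0]*m0 + v[1]*m1 + v[2]*m2 + a (lane 3 of v is ignored), i.e. a matrix-vector product with m0..m2 as columns plus an additive vector, saving one splat and multiply compared with v_matmul. A small usage sketch (the wrapper name and parameters are illustrative, not from the patch):

    #include <opencv2/core/hal/intrin.hpp>

    // result = p.x*col0 + p.y*col1 + p.z*col2 + translation
    cv::v_float32x4 transform_point(const cv::v_float32x4& p,
                                    const cv::v_float32x4& col0,
                                    const cv::v_float32x4& col1,
                                    const cv::v_float32x4& col2,
                                    const cv::v_float32x4& translation)
    {
        return cv::v_matmuladd(p, col0, col1, col2, translation);
    }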
