@@ -523,24 +523,25 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)

/** Bitwise shifts **/
-#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpuvec, splfunc) \
-inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
-{ return _Tpuvec(vec_sl(a.val, splfunc(imm))); } \
-inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
-{ return _Tpuvec(vec_sr(a.val, splfunc(imm))); } \
-template<int imm> inline _Tpuvec v_shl(const _Tpuvec& a) \
-{ return _Tpuvec(vec_sl(a.val, splfunc(imm))); } \
-template<int imm> inline _Tpuvec v_shr(const _Tpuvec& a) \
-{ return _Tpuvec(vec_sr(a.val, splfunc(imm))); }
-
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_uchar16_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_uchar16_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_ushort8_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_ushort8_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_uint4_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_uint4_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_udword2_sp)
-OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_udword2_sp)
+#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
+inline _Tpvec operator << (const _Tpvec& a, int imm) \
+{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
+inline _Tpvec operator >> (const _Tpvec& a, int imm) \
+{ return _Tpvec(shr(a.val, splfunc(imm))); } \
+template<int imm> inline _Tpvec v_shl(const _Tpvec& a) \
+{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
+template<int imm> inline _Tpvec v_shr(const _Tpvec& a) \
+{ return _Tpvec(shr(a.val, splfunc(imm))); }
+
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_sr, vec_uchar16_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_sr, vec_ushort8_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_sr, vec_uint4_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_sr, vec_udword2_sp)
+// algebraic right shift
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_sra, vec_uchar16_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_sra, vec_ushort8_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_sra, vec_uint4_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp)

/** Bitwise logic **/
#define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \
@@ -605,6 +606,64 @@ OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)

+/** Rotate **/
+#define OPENCV_IMPL_VSX_ROTATE(_Tpvec, suffix, shf, cast) \
+template<int imm> \
+inline _Tpvec v_rotate_##suffix(const _Tpvec& a) \
+{ \
+    const int wd = imm * sizeof(typename _Tpvec::lane_type); \
+    if (wd > 15) \
+        return _Tpvec(); \
+    return _Tpvec((cast)shf(vec_uchar16_c(a.val), vec_uchar16_sp(wd << 3))); \
+}
+
+#define OPENCV_IMPL_VSX_ROTATE_LR(_Tpvec, cast) \
+OPENCV_IMPL_VSX_ROTATE(_Tpvec, left, vec_slo, cast) \
+OPENCV_IMPL_VSX_ROTATE(_Tpvec, right, vec_sro, cast)
+
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint8x16, vec_uchar16)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int8x16, vec_char16)
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8, vec_short8)
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4, vec_int4)
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2, vec_dword2)
+
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
+{
+    const int wd = imm * sizeof(typename _Tpvec::lane_type);
+    if (wd == 0)
+        return a;
+    return _Tpvec(vec_sld(b.val, a.val, 16 - wd));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
+{
+    const int wd = imm * sizeof(typename _Tpvec::lane_type);
+    if (wd == 16)
+        return b;
+    return _Tpvec(vec_sld(a.val, b.val, wd));
+}
+
+#define OPENCV_IMPL_VSX_ROTATE_64(_Tpvec, suffix, rg1, rg2) \
+template<int imm> \
+inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    if (imm == 1) \
+        return _Tpvec(vec_permi(rg1.val, rg2.val, 2)); \
+    return imm ? b : a; \
+}
+
+OPENCV_IMPL_VSX_ROTATE_64(v_int64x2, right, a, b)
+OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, right, a, b)
+
+OPENCV_IMPL_VSX_ROTATE_64(v_int64x2, left, b, a)
+OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, left, b, a)
+
////////// Reduce and mask /////////

/** Reduce **/
@@ -726,7 +785,7 @@ inline int v_signmask(const v_float32x4& a)

inline int v_signmask(const v_int64x2& a)
{
-    const vec_dword2 sv = vec_sr(a.val, vec_udword2_sp(63));
+    VSX_UNUSED(const vec_dword2) sv = vec_sr(a.val, vec_udword2_sp(63));
    return (int)vec_extract(sv, 0) | (int)vec_extract(sv, 1) << 1;
}
inline int v_signmask(const v_uint64x2& a)
@@ -812,66 +871,47 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int64x2, v_uint64x2, vec_udword2_c, v_absdiff, v

/** Rounding **/
inline v_int32x4 v_round(const v_float32x4& a)
-{ return v_int32x4(vec_cts(vec_round(a.val), 0)); }
+{ return v_int32x4(vec_cts(vec_round(a.val))); }

inline v_int32x4 v_round(const v_float64x2& a)
-{
-    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
-    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_round(a.val)), perm));
-}
+{ return v_int32x4(vec_mergesqo(vec_cts(vec_round(a.val)), vec_int4_z)); }

inline v_int32x4 v_floor(const v_float32x4& a)
-{ return v_int32x4(vec_cts(vec_floor(a.val), 0)); }
+{ return v_int32x4(vec_cts(vec_floor(a.val))); }

inline v_int32x4 v_floor(const v_float64x2& a)
-{
-    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
-    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_floor(a.val)), perm));
-}
+{ return v_int32x4(vec_mergesqo(vec_cts(vec_floor(a.val)), vec_int4_z)); }

inline v_int32x4 v_ceil(const v_float32x4& a)
-{ return v_int32x4(vec_cts(vec_ceil(a.val), 0)); }
+{ return v_int32x4(vec_cts(vec_ceil(a.val))); }

inline v_int32x4 v_ceil(const v_float64x2& a)
-{
-    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
-    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_ceil(a.val)), perm));
-}
+{ return v_int32x4(vec_mergesqo(vec_cts(vec_ceil(a.val)), vec_int4_z)); }

inline v_int32x4 v_trunc(const v_float32x4& a)
-{ return v_int32x4(vec_cts(a.val, 0)); }
+{ return v_int32x4(vec_cts(a.val)); }

inline v_int32x4 v_trunc(const v_float64x2& a)
-{
-    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
-    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(a.val), perm));
-}
+{ return v_int32x4(vec_mergesqo(vec_cts(a.val), vec_int4_z)); }

/** To float **/
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
-{ return v_float32x4(vec_ctf(a.val, 0)); }
+{ return v_float32x4(vec_ctf(a.val)); }

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
-{
-    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
-    return v_float32x4(vec_perm(vec_float4_z, vec_cvf(a.val), perm));
-}
+{ return v_float32x4(vec_mergesqo(vec_cvf(a.val), vec_float4_z)); }
+
inline v_float64x2 v_cvt_f64(const v_int32x4& a)
-{
-    return v_float64x2(vec_ctd(vec_mergeh(a.val, a.val), 0));
-}
+{ return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); }
+
inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
-{
-    return v_float64x2(vec_ctd(vec_mergel(a.val, a.val), 0));
-}
+{ return v_float64x2(vec_ctdo(vec_mergel(a.val, a.val))); }
+
inline v_float64x2 v_cvt_f64(const v_float32x4& a)
-{
-    return v_float64x2(vec_cvf(vec_mergeh(a.val, a.val)));
-}
+{ return v_float64x2(vec_cvfo(vec_mergeh(a.val, a.val))); }
+
inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
-{
-    return v_float64x2(vec_cvf(vec_mergel(a.val, a.val)));
-}
+{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }

/** Reinterpret **/
/** its up there with load and store operations **/
@@ -888,10 +928,20 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
    const vec_float4 v0 = vec_splat(v.val, 0);
    const vec_float4 v1 = vec_splat(v.val, 1);
    const vec_float4 v2 = vec_splat(v.val, 2);
-    const vec_float4 v3 = vec_splat(v.val, 3);
+    VSX_UNUSED(const vec_float4) v3 = vec_splat(v.val, 3);
    return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, vec_mul(v3, m3.val)))));
}

+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2,
+                               const v_float32x4& a)
+{
+    const vec_float4 v0 = vec_splat(v.val, 0);
+    const vec_float4 v1 = vec_splat(v.val, 1);
+    const vec_float4 v2 = vec_splat(v.val, 2);
+    return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, a.val))));
+}
+
#define OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(_Tpvec, _Tpvec2) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
                           const _Tpvec& a2, const _Tpvec& a3, \
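// ---------------------------------------------------------------------------
// Illustrative usage sketch, NOT part of the patch above. It shows how the
// v_matmuladd universal intrinsic introduced in the last hunk would typically
// be called from portable OpenCV code; the function name and argument order
// come from the diff, while the surrounding wrapper, header path, and names
// are hypothetical example code.
// ---------------------------------------------------------------------------
#include <opencv2/core/hal/intrin.hpp>

// Affine transform of a point p = (x, y, z, *):
//   result = m0*x + m1*y + m2*z + a
// which is exactly what the VSX implementation computes with three chained
// vec_madd calls (the fourth matrix column of v_matmul is replaced by an
// unconditional add of `a`).
static cv::v_float32x4 transform_point(const cv::v_float32x4& p,
                                       const cv::v_float32x4& m0,
                                       const cv::v_float32x4& m1,
                                       const cv::v_float32x4& m2,
                                       const cv::v_float32x4& a)
{
    return cv::v_matmuladd(p, m0, m1, m2, a);
}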