Fixes warning C4310 from Visual Studio (intel#26)
* Fixed warnings in Visual Studio.

* Fixed warnings in Visual Studio regarding conversion from double to float, or float to double.

* Added comments with the constants in hexadecimal to help with code readability.
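For illustration, here is a minimal standalone sketch (not code from NEON_2_SSE.h; the function names are made up) of the two warning patterns the notes above describe, assuming MSVC: the *_before variants reproduce the warnings, the *_after variants use the replacements applied in this commit.

#include <emmintrin.h>  /* SSE2 intrinsics */

static __m128i set_bias_before(void)
{
    /* 0x80 is an int constant; casting it to int8_t truncates the value,
       which MSVC reports as warning C4310. */
    return _mm_set1_epi8((int8_t)0x80);
}

static __m128i set_bias_after(void)
{
    /* Same bit pattern written as a signed value, hex kept as a comment. */
    return _mm_set1_epi8(-128); /* (int8_t)0x80 */
}

static float scale_before(float r)
{
    /* 256.0 and 0.5 are double literals, so the arithmetic runs in double
       and the implicit conversion back to float draws a warning (likely C4244). */
    int s = (int)(256.0 * r + 0.5);
    r = (float)s / 256.0; /* double result assigned to float r */
    return r;
}

static float scale_after(float r)
{
    /* Float suffixes keep the whole expression in single precision. */
    int s = (int)(256.0f * r + 0.5f);
    r = (float)s / 256.0f;
    return r;
}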
Bizonu authored and Zvictoria committed Feb 27, 2019
1 parent 6516a61 commit 91c5962
Showing 1 changed file with 32 additions and 32 deletions.
64 changes: 32 additions & 32 deletions NEON_2_SSE.h
@@ -3147,7 +3147,7 @@ _NEON2SSE_INLINE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0
{
//no signed average in x86 SIMD, go to unsigned
__m128i c128, au, bu, sum;
c128 = _mm_set1_epi8((int8_t)0x80); //-128
c128 = _mm_set1_epi8(-128); //(int8_t)0x80
au = _mm_sub_epi8(a, c128); //add 128
bu = _mm_sub_epi8(b, c128); //add 128
sum = _mm_avg_epu8(au, bu);
@@ -3159,7 +3159,7 @@ _NEON2SSE_INLINE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16
{
//no signed average in x86 SIMD, go to unsigned
__m128i cx8000, au, bu, sum;
cx8000 = _mm_set1_epi16((int16_t)0x8000); // - 32768
cx8000 = _mm_set1_epi16(-32768); //(int16_t)0x8000
au = _mm_sub_epi16(a, cx8000); //add 32768
bu = _mm_sub_epi16(b, cx8000); //add 32768
sum = _mm_avg_epu16(au, bu);
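For readers of the two vrhaddq hunks above, a scalar sketch (an illustration, not the header's code) of why the signed rounding-halving add can be built from the unsigned x86 average: biasing both inputs by 128 moves them into unsigned range, the unsigned average is taken, and the bias is removed afterwards.

#include <stdint.h>
#include <stdio.h>

static int8_t rhadd_s8_reference(int8_t a, int8_t b)
{
    /* NEON VRHADD.S8 semantics: (a + b + 1) >> 1 without intermediate overflow. */
    return (int8_t)(((int16_t)a + (int16_t)b + 1) >> 1);
}

static int8_t rhadd_s8_via_unsigned(int8_t a, int8_t b)
{
    uint8_t au = (uint8_t)(a + 128); /* adding 128 == subtracting -128 with 8-bit wraparound */
    uint8_t bu = (uint8_t)(b + 128);
    uint8_t avg = (uint8_t)(((uint16_t)au + bu + 1) >> 1); /* what _mm_avg_epu8 does per lane */
    return (int8_t)(avg - 128); /* undo the bias */
}

int main(void)
{
    for (int a = -128; a < 128; ++a)
        for (int b = -128; b < 128; ++b)
            if (rhadd_s8_reference((int8_t)a, (int8_t)b) !=
                rhadd_s8_via_unsigned((int8_t)a, (int8_t)b))
                printf("mismatch at %d,%d\n", a, b);
    return 0;
}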
@@ -4774,9 +4774,9 @@ _NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b)
_NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0
{
// //need to deal with the possibility of internal overflow
//need to deal with the possibility of internal overflow
__m128i c128, au,bu;
c128 = _mm_set1_epi8((int8_t)128);
c128 = _mm_set1_epi8(-128); //(int8_t)0x80
au = _mm_add_epi8( a, c128);
bu = _mm_add_epi8( b, c128);
return vhsubq_u8(au,bu);
@@ -4787,7 +4787,7 @@ _NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,
{
//need to deal with the possibility of internal overflow
__m128i c8000, au,bu;
c8000 = _mm_set1_epi16((int16_t)0x8000);
c8000 = _mm_set1_epi16(-32768); //(int16_t)0x8000
au = _mm_add_epi16( a, c8000);
bu = _mm_add_epi16( b, c8000);
return vhsubq_u16(au,bu);
@@ -6312,7 +6312,7 @@ _NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d
uint16x4_t res64;
__m128i c32767, cfffe, as, bs, res;
c32767 = _mm_set1_epi16 (32767);
cfffe = _mm_set1_epi16 ((int16_t)0xfffe);
cfffe = _mm_set1_epi16 (-2); //(int16_t)0xfffe
as = _mm_sub_epi16 (_pM128i(a), c32767);
bs = _mm_sub_epi16 (_pM128i(b), c32767);
res = _mm_hadd_epi16 (as, bs);
@@ -6798,11 +6798,11 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t
res.m64_u32[i] = 0xffffffff;
}else{
resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
r = (float)(1.0 / (((float)q + 0.5) / 512.0)); /* reciprocal r */
s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
r = (float)s / 256.0;
res.m64_u32[i] = r * (uint32_t)(1 << 31);
q = (int)(resf * 512.0f); /* a in units of 1/512 rounded down */
r = (float)(1.0f / (((float)q + 0.5f) / 512.0f)); /* reciprocal r */
s = (int)(256.0f * r + 0.5f); /* r in units of 1/256 rounded to nearest */
r = (float)s / 256.0f;
res.m64_u32[i] = (uint32_t)(r * (uint32_t)(1 << 31));
}
}
return res;
@@ -6827,10 +6827,10 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t
zero = _mm_setzero_si128();
for (i =0; i<4; i++){
resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31))); // 2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31))
q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
r = (float)s / 256.0;
q = (int)(resf * 512.0f); /* a in units of 1/512 rounded down */
r = 1.0f / (((float)q + 0.5f) / 512.0f); /* reciprocal r */
s = (int)(256.0f * r + 0.5f); /* r in units of 1/256 rounded to nearest */
r = (float)s / 256.0f;
res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
}
res128 = _mm_load_si128((__m128i*)res);
@@ -6868,14 +6868,14 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t
res.m64_u32[i] = 0xffffffff;
}else{
resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
coeff = (resf < 0.5f)? 512.0f : 256.0f ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
r = ((float)q0 + 0.5) / coeff;
r = ((float)q0 + 0.5f) / coeff;
tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
_mm_store_ss(&r, tmp);
s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
r = (float)(s / 256.0);
res.m64_u32[i] = r * (((uint32_t)1) << 31);
s = (int)(256.0f * r + 0.5f); /* r in units of 1/256 rounded to nearest */
r = (float)(s / 256.0f);
res.m64_u32[i] = (uint32_t)(r * (((uint32_t)1) << 31));
}
}
return res;
@@ -6899,13 +6899,13 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_
zero = _mm_setzero_si128();
for (i =0; i<4; i++){
resf = (float) (atmp[i] * (0.5f / (uint32_t)(1 << 31)));
coeff = (float)((resf < 0.5)? 512.0 : 256.0); /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
coeff = (float)((resf < 0.5f)? 512.0f : 256.0f); /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
r = ((float)q0 + 0.5) / coeff;
r = ((float)q0 + 0.5f) / coeff;
tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
_mm_store_ss(&r, tmp);
s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
r = (float)s / 256.0;
s = (int)(256.0f * r + 0.5f); /* r in units of 1/256 rounded to nearest */
r = (float)s / 256.0f;
res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
}
res128 = _mm_load_si128((__m128i*)res);
@@ -6955,8 +6955,8 @@ _NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQ
_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
{
__m128 f3, f05, mul;
f3 = _mm_set1_ps(3.);
f05 = _mm_set1_ps(0.5);
f3 = _mm_set1_ps(3.f);
f05 = _mm_set1_ps(0.5f);
mul = _mm_mul_ps(a,b);
f3 = _mm_sub_ps(f3,mul);
return _mm_mul_ps (f3, f05);
@@ -8393,7 +8393,7 @@ _NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b)
// manual saturation solution looks more optimal than 32 bits conversion one
__m128i cb, c8000, a_signed, saturation_mask, shift_res;
cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
c8000 = _mm_set1_epi16 ((int16_t)0x8000);
c8000 = _mm_set1_epi16 (-32768); // (int16_t)0x8000
//no unsigned shorts comparison in SSE, only signed available, so need the trick
a_signed = _mm_sub_epi16(a, c8000); //go to signed
saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
@@ -14692,7 +14692,7 @@ _NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
_NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a) // VQABS.S8 q0,q0
{
__m128i c_128, abs, abs_cmp;
c_128 = _mm_set1_epi8 ((int8_t)0x80); //-128
c_128 = _mm_set1_epi8 (-128); //(int8_t)0x80
abs = _mm_abs_epi8 (a);
abs_cmp = _mm_cmpeq_epi8 (abs, c_128);
return _mm_xor_si128 (abs, abs_cmp);
@@ -14702,7 +14702,7 @@ _NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
_NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a) // VQABS.S16 q0,q0
{
__m128i c_32768, abs, abs_cmp;
c_32768 = _mm_set1_epi16 ((int16_t)0x8000); //-32768
c_32768 = _mm_set1_epi16 (-32768); //(int16_t)0x8000
abs = _mm_abs_epi16 (a);
abs_cmp = _mm_cmpeq_epi16 (abs, c_32768);
return _mm_xor_si128 (abs, abs_cmp);
@@ -15015,7 +15015,7 @@ _NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a)
{
__m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb;
cff = _mm_cmpeq_epi8 (a,a); //0xff
c80 = _mm_set1_epi8((int8_t)0x80);
c80 = _mm_set1_epi8(-128); //(int8_t)0x80
c1 = _mm_set1_epi8(1);
a_mask = _mm_and_si128(a, c80);
a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive
@@ -16711,8 +16711,8 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float64x2_t vrndnq_f64(float64x2_
{
_NEON2SSE_ALIGN_16 float64_t res[2];
_mm_store_pd(res, a);
res[0] = nearbyintf(res[0]);
res[1] = nearbyintf(res[1]);
res[0] = nearbyint(res[0]);
res[1] = nearbyint(res[1]);
return _mm_load_pd(res);
}
#endif
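The last hunk swaps nearbyintf for nearbyint because res[] holds float64_t values, so the float-only call forced a double-to-float round trip that MSVC flags as a lossy conversion. A tiny sketch (illustrative, not from the header) of the difference:

#include <math.h>
#include <stdio.h>

int main(void)
{
    double v = 16777217.0;               /* 2^24 + 1, not exactly representable as float */
    double via_f = nearbyintf((float)v); /* precision already lost: prints 16777216 */
    double via_d = nearbyint(v);         /* stays in double: prints 16777217 */
    printf("%.1f %.1f\n", via_f, via_d);
    return 0;
}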
