Add files via upload

fix bug with load_bits under AVX
lacrymose · Mar 6, 2021 · ff7450a · ff7450a
1 parent 46a5841
commit ff7450a
Showing 1 changed file with 16 additions and 18 deletions.
diff --git a/vectorf256.h b/vectorf256.h
@@ -1,7 +1,7 @@
 /****************************  vectorf256.h   *******************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2020-03-26
+* Last modified: 2021-03-06
 * Version:       2.01.02
 * Project:       vector class library
 * Description:
@@ -18,7 +18,7 @@
 * Each vector object is represented internally in the CPU as a 256-bit register.
 * This header file defines operators and functions for these vectors.
 *
-* (c) Copyright 2012-2020 Agner Fog.
+* (c) Copyright 2012-2021 Agner Fog.
 * Apache License version 2.0 or later.
 *****************************************************************************/
 
@@ -158,12 +158,11 @@ class Vec8fb {
         return *this;
     }
     // Member function to change a bitfield to a boolean vector
-    // AVX version. Use float instructions, treating integers as subnormal values
+    // AVX version. Avoid float compares: the isolated bits are subnormal values, which read as zero when subnormals are disabled
     Vec8fb & load_bits(uint8_t a) {
-        __m256 b1 = _mm256_castsi256_ps(_mm256_set1_epi32((int32_t)a));  // broadcast a
-        __m256 m2 = constant8f<1,2,4,8,0x10,0x20,0x40,0x80>();
-        __m256 d1 = _mm256_and_ps(b1, m2); // isolate one bit in each dword
-        ymm = _mm256_cmp_ps(d1, _mm256_setzero_ps(), 4);  // compare subnormal values with 0
+        Vec4fb y0 = Vec4fb().load_bits(a);
+        Vec4fb y1 = Vec4fb().load_bits(uint8_t(a >> 4u));
+        *this = Vec8fb(y0, y1);
         return *this;
     }
     // Type cast operator to convert to type Vec8ib used as Boolean for integer vectors
@@ -408,12 +407,11 @@ class Vec4db {
         return Vec4q(_mm_castpd_si128(get_low()), _mm_castpd_si128(get_high()));
     }
     // Member function to change a bitfield to a boolean vector
-    // AVX version. Use float instructions, treating integers as subnormal values
+    // AVX version. Avoid float compares: the isolated bits are subnormal values, which read as zero when subnormals are disabled
     Vec4db & load_bits(uint8_t a) {
-        __m256d b1 = _mm256_castsi256_pd(_mm256_set1_epi32((int32_t)a));  // broadcast a
-        __m256d m2 = _mm256_castps_pd(constant8f<1,0,2,0,4,0,8,0>());
-        __m256d d1 = _mm256_and_pd(b1, m2); // isolate one bit in each dword
-        ymm = _mm256_cmp_pd(d1, _mm256_setzero_pd(), 4);  // compare subnormal values with 0
+        Vec2db a0 = Vec2db().load_bits(a);
+        Vec2db a1 = Vec2db().load_bits(uint8_t(a>>2u));
+        *this = Vec4db(a0, a1);
         return *this;
     }
 #endif // AVX2
@@ -1043,7 +1041,7 @@ static inline Vec8f sign_combine(Vec8f const a, Vec8f const b) {
 
 // Categorization functions
 
-// Function is_finite: gives true for elements that are normal, denormal or zero,
+// Function is_finite: gives true for elements that are normal, subnormal or zero,
 // false for INF and NAN
 // (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
 static inline Vec8fb is_finite(Vec8f const a) {
@@ -1103,7 +1101,7 @@ static inline Vec8fb is_nan(Vec8f const a) {
 #endif
 
 
-// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// Function is_subnormal: gives true for elements that are subnormal (denormal)
 // false for finite numbers, zero, NAN and INF
 static inline Vec8fb is_subnormal(Vec8f const a) {
 #if INSTRSET >= 10  // compact boolean vectors
@@ -1411,7 +1409,7 @@ static inline Vec8f fraction(Vec8f const a) {
 // n  =    0 gives 1.0f
 // n >=  128 gives +INF
 // n <= -127 gives 0.0f
-// This function will never produce denormals, and never raise exceptions
+// This function will never produce subnormals, and never raise exceptions
 static inline Vec8f exp2(Vec8i const n) {
 #if INSTRSET >= 8  // 256 bit integer vectors are available, AVX2
     Vec8i t1 = max(n,  -0x7F);         // limit to allowed range
@@ -1874,7 +1872,7 @@ static inline Vec4d sign_combine(Vec4d const a, Vec4d const b) {
 #endif
 }
 
-// Function is_finite: gives true for elements that are normal, denormal or zero,
+// Function is_finite: gives true for elements that are normal, subnormal or zero,
 // false for INF and NAN
 static inline Vec4db is_finite(Vec4d const a) {
 #if INSTRSET >= 10  // compact boolean vectors
@@ -1935,7 +1933,7 @@ static inline Vec4db is_nan(Vec4d const a) {
 #endif
 
 
-// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// Function is_subnormal: gives true for elements that are subnormal (denormal)
 // false for finite numbers, zero, NAN and INF
 static inline Vec4db is_subnormal(Vec4d const a) {
 #if INSTRSET >= 10  // compact boolean vectors
@@ -2244,7 +2242,7 @@ static inline Vec4d fraction(Vec4d const a) {
 // n  =     0 gives 1.0
 // n >=  1024 gives +INF
 // n <= -1023 gives 0.0
-// This function will never produce denormals, and never raise exceptions
+// This function will never produce subnormals, and never raise exceptions
 static inline Vec4d exp2(Vec4q const n) {
 #if INSTRSET >= 8  // 256 bit integer vectors are available
     Vec4q t1 = max(n,  -0x3FF);        // limit to allowed range