Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
fix bug with load_bits under AVX
  • Loading branch information
AgnerF authored Mar 6, 2021
1 parent 46a5841 commit ff7450a
Showing 1 changed file with 16 additions and 18 deletions.
34 changes: 16 additions & 18 deletions vectorf256.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**************************** vectorf256.h *******************************
* Author: Agner Fog
* Date created: 2012-05-30
* Last modified: 2020-03-26
* Last modified: 2021-03-06
* Version: 2.01.02
* Project: vector class library
* Description:
Expand All @@ -18,7 +18,7 @@
* Each vector object is represented internally in the CPU as a 256-bit register.
* This header file defines operators and functions for these vectors.
*
* (c) Copyright 2012-2020 Agner Fog.
* (c) Copyright 2012-2021 Agner Fog.
* Apache License version 2.0 or later.
*****************************************************************************/

Expand Down Expand Up @@ -158,12 +158,11 @@ class Vec8fb {
return *this;
}
// Member function to change a bitfield to a boolean vector
// AVX version. Float instructions that treat the integers as subnormal values
// cannot be used here, because they fail if subnormals are disabled
// Convert the 8-bit bitfield a into this boolean vector and return *this.
// The vector is assembled from two 4-element boolean vectors: one built from
// a itself and one built from a shifted right by 4 bits (presumably the lower
// and upper halves, respectively - confirm against the Vec8fb(Vec4fb, Vec4fb)
// constructor).
// The earlier approach masked one bit into each dword and compared the result
// with zero using a float compare. Those masked values are subnormal when
// viewed as floats, so the compare cannot be relied on if subnormals are
// disabled. That code has been removed; its result was overwritten anyway.
Vec8fb & load_bits(uint8_t a) {
    Vec4fb y0 = Vec4fb().load_bits(a);                 // built from a (low bits)
    Vec4fb y1 = Vec4fb().load_bits(uint8_t(a >> 4u));  // built from bits 4 and up
    *this = Vec8fb(y0, y1);
    return *this;
}
// Type cast operator to convert to type Vec8ib used as Boolean for integer vectors
Expand Down Expand Up @@ -408,12 +407,11 @@ class Vec4db {
return Vec4q(_mm_castpd_si128(get_low()), _mm_castpd_si128(get_high()));
}
// Member function to change a bitfield to a boolean vector
// AVX version. Float instructions that treat the integers as subnormal values
// cannot be used here, because they fail if subnormals are disabled
// Convert the bitfield a into this boolean vector and return *this.
// The vector is assembled from two 2-element boolean vectors: one built from
// a itself and one built from a shifted right by 2 bits (presumably the lower
// and upper halves, respectively - confirm against the Vec4db(Vec2db, Vec2db)
// constructor).
// The earlier approach masked one bit into each element and compared the
// result with zero using a floating point compare. Those masked values are
// subnormal when viewed as doubles, so the compare cannot be relied on if
// subnormals are disabled. That code has been removed; its result was
// overwritten anyway.
Vec4db & load_bits(uint8_t a) {
    Vec2db a0 = Vec2db().load_bits(a);                 // built from a (low bits)
    Vec2db a1 = Vec2db().load_bits(uint8_t(a >> 2u));  // built from bits 2 and up
    *this = Vec4db(a0, a1);
    return *this;
}
#endif // AVX2
Expand Down Expand Up @@ -1043,7 +1041,7 @@ static inline Vec8f sign_combine(Vec8f const a, Vec8f const b) {

// Categorization functions

// Function is_finite: gives true for elements that are normal, denormal or zero,
// Function is_finite: gives true for elements that are normal, subnormal or zero,
// false for INF and NAN
// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
static inline Vec8fb is_finite(Vec8f const a) {
Expand Down Expand Up @@ -1103,7 +1101,7 @@ static inline Vec8fb is_nan(Vec8f const a) {
#endif


// Function is_subnormal: gives true for elements that are denormal (subnormal)
// Function is_subnormal: gives true for elements that are subnormal (denormal)
// false for finite numbers, zero, NAN and INF
static inline Vec8fb is_subnormal(Vec8f const a) {
#if INSTRSET >= 10 // compact boolean vectors
Expand Down Expand Up @@ -1411,7 +1409,7 @@ static inline Vec8f fraction(Vec8f const a) {
// n = 0 gives 1.0f
// n >= 128 gives +INF
// n <= -127 gives 0.0f
// This function will never produce denormals, and never raise exceptions
// This function will never produce subnormals, and never raise exceptions
static inline Vec8f exp2(Vec8i const n) {
#if INSTRSET >= 8 // 256 bit integer vectors are available, AVX2
Vec8i t1 = max(n, -0x7F); // limit to allowed range
Expand Down Expand Up @@ -1874,7 +1872,7 @@ static inline Vec4d sign_combine(Vec4d const a, Vec4d const b) {
#endif
}

// Function is_finite: gives true for elements that are normal, denormal or zero,
// Function is_finite: gives true for elements that are normal, subnormal or zero,
// false for INF and NAN
static inline Vec4db is_finite(Vec4d const a) {
#if INSTRSET >= 10 // compact boolean vectors
Expand Down Expand Up @@ -1935,7 +1933,7 @@ static inline Vec4db is_nan(Vec4d const a) {
#endif


// Function is_subnormal: gives true for elements that are denormal (subnormal)
// Function is_subnormal: gives true for elements that are subnormal (denormal)
// false for finite numbers, zero, NAN and INF
static inline Vec4db is_subnormal(Vec4d const a) {
#if INSTRSET >= 10 // compact boolean vectors
Expand Down Expand Up @@ -2244,7 +2242,7 @@ static inline Vec4d fraction(Vec4d const a) {
// n = 0 gives 1.0
// n >= 1024 gives +INF
// n <= -1023 gives 0.0
// This function will never produce denormals, and never raise exceptions
// This function will never produce subnormals, and never raise exceptions
static inline Vec4d exp2(Vec4q const n) {
#if INSTRSET >= 8 // 256 bit integer vectors are available
Vec4q t1 = max(n, -0x3FF); // limit to allowed range
Expand Down

0 comments on commit ff7450a

Please sign in to comment.