Skip to content

Commit

Permalink
volk: Added avx proto-kernel for fast exp.
Browse files Browse the repository at this point in the history
  • Loading branch information
abhowmick22 authored and trondeau committed Oct 15, 2014
1 parent 81dbe85 commit 63003ae
Showing 1 changed file with 78 additions and 0 deletions.
78 changes: 78 additions & 0 deletions kernels/volk/volk_32f_expfast_32f.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,45 @@
#ifndef INCLUDED_volk_32f_expfast_32f_a_H
#define INCLUDED_volk_32f_expfast_32f_a_H

#ifdef LV_HAVE_AVX
#include <immintrin.h>
/*!
  \brief Computes a fast approximate exp (max 7% error) of each input float and stores the results in the output vector (aligned AVX version)
  \param bVector The vector where results will be stored; written with _mm256_store_ps, so it must be 32-byte aligned
  \param aVector The input vector of floats; read with _mm256_load_ps, so it must be 32-byte aligned
  \param num_points Number of points for which exp is to be computed

  Full groups of 8 points go through the vector approximation; the remaining
  (num_points % 8) points are computed with expf().
*/
static inline void volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points){

  /* Number of leading points that fit into whole 8-float AVX registers. */
  const unsigned int vectorizedPoints = (num_points / 8) * 8;

  /* Per-lane affine map x -> x*(A/Mln2) + (B-C); A, B, C and Mln2 are
     constants defined outside this function. */
  const __m256 scale = _mm256_set1_ps(A/Mln2);
  const __m256 offset = _mm256_set1_ps(B-C);

  unsigned int idx;

  for(idx = 0; idx < vectorizedPoints; idx += 8){
    const __m256 input = _mm256_load_ps(aVector + idx);
    const __m256 mapped = _mm256_add_ps(_mm256_mul_ps(scale, input), offset);

    /* Convert to 32-bit integers, then reinterpret those integer bits as
       floats (a bit cast, not a numeric conversion). */
    const __m256i resultBits = _mm256_cvtps_epi32(mapped);

    _mm256_store_ps(bVector + idx, _mm256_castsi256_ps(resultBits));
  }

  /* Leftover points that do not fill a full register use the libm expf(). */
  for(idx = vectorizedPoints; idx < num_points; idx++){
    bVector[idx] = expf(aVector[idx]);
  }
}

#endif /* LV_HAVE_AVX for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
/*!
Expand Down Expand Up @@ -76,6 +115,45 @@ static inline void volk_32f_expfast_32f_a_generic(float* bVector, const float* a
#ifndef INCLUDED_volk_32f_expfast_32f_u_H
#define INCLUDED_volk_32f_expfast_32f_u_H

#ifdef LV_HAVE_AVX
#include <immintrin.h>
/*!
  \brief Computes a fast approximate exp (max 7% error) of each input float and stores the results in the output vector (unaligned AVX version)
  \param bVector The vector where results will be stored; written with _mm256_storeu_ps, so no particular alignment is required
  \param aVector The input vector of floats; read with _mm256_loadu_ps, so no particular alignment is required
  \param num_points Number of points for which exp is to be computed

  Full groups of 8 points go through the vector approximation; the remaining
  (num_points % 8) points are computed with expf().
*/
static inline void volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points){

  /* Number of leading points that fit into whole 8-float AVX registers. */
  const unsigned int vectorizedPoints = (num_points / 8) * 8;

  /* Per-lane affine map x -> x*(A/Mln2) + (B-C); A, B, C and Mln2 are
     constants defined outside this function. */
  const __m256 scale = _mm256_set1_ps(A/Mln2);
  const __m256 offset = _mm256_set1_ps(B-C);

  unsigned int idx;

  for(idx = 0; idx < vectorizedPoints; idx += 8){
    const __m256 input = _mm256_loadu_ps(aVector + idx);
    const __m256 mapped = _mm256_add_ps(_mm256_mul_ps(scale, input), offset);

    /* Convert to 32-bit integers, then reinterpret those integer bits as
       floats (a bit cast, not a numeric conversion). */
    const __m256i resultBits = _mm256_cvtps_epi32(mapped);

    _mm256_storeu_ps(bVector + idx, _mm256_castsi256_ps(resultBits));
  }

  /* Leftover points that do not fill a full register use the libm expf(). */
  for(idx = vectorizedPoints; idx < num_points; idx++){
    bVector[idx] = expf(aVector[idx]);
  }
}

#endif /* LV_HAVE_AVX for unaligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
/*!
Expand Down

0 comments on commit 63003ae

Please sign in to comment.