Skip to content

Commit

Permalink
VFP support
Browse files Browse the repository at this point in the history
  • Loading branch information
anthonix committed Apr 3, 2013
1 parent 17aaf9d commit 7f43121
Show file tree
Hide file tree
Showing 8 changed files with 92 additions and 39 deletions.
3 changes: 3 additions & 0 deletions config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H

/* Define to FFT with ARM VFP. */
#undef HAVE_VFP

/* Define to 1 if the system has the type `_Bool'. */
#undef HAVE__BOOL

Expand Down
32 changes: 32 additions & 0 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,8 @@ ac_subst_vars='am__EXEEXT_FALSE
am__EXEEXT_TRUE
LTLIBOBJS
LIBOBJS
HAVE_VFP_FALSE
HAVE_VFP_TRUE
HAVE_NEON_FALSE
HAVE_NEON_TRUE
HAVE_SSE_FALSE
Expand Down Expand Up @@ -776,6 +778,7 @@ enable_dynamic_code
enable_single
enable_sse
enable_neon
enable_vfp
'
ac_precious_vars='build_alias
host_alias
Expand Down Expand Up @@ -1423,6 +1426,7 @@ Optional Features:
--enable-single compile single-precision library
--enable-sse enable SSE extensions
--enable-neon enable NEON extensions
--enable-vfp enable VFP extensions

Optional Packages:
--with-PACKAGE[=ARG] use PACKAGE [ARG=yes]
Expand Down Expand Up @@ -15428,6 +15432,30 @@ else
fi


# Check whether --enable-vfp was given.
if test "${enable_vfp+set}" = set; then :
enableval=$enable_vfp; have_vfp=$enableval
else
have_vfp=no
fi

if test "$have_vfp" = "yes"; then
if test "$SIMD" != "sse"; then
as_fn_error $? "conflicting SIMD extensions specified" "$LINENO" 5
fi

$as_echo "#define HAVE_VFP 1" >>confdefs.h

fi
if test "$have_vfp" = "yes"; then
HAVE_VFP_TRUE=
HAVE_VFP_FALSE='#'
else
HAVE_VFP_TRUE='#'
HAVE_VFP_FALSE=
fi


#AC_CANONICAL_HOST
{ $as_echo "$as_me:${as_lineno-$LINENO}: host is \"${host}\"" >&5
$as_echo "$as_me: host is \"${host}\"" >&6;}
Expand Down Expand Up @@ -16195,6 +16223,10 @@ if test -z "${HAVE_NEON_TRUE}" && test -z "${HAVE_NEON_FALSE}"; then
as_fn_error $? "conditional \"HAVE_NEON\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${HAVE_VFP_TRUE}" && test -z "${HAVE_VFP_FALSE}"; then
as_fn_error $? "conditional \"HAVE_VFP\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi

: "${CONFIG_STATUS=./config.status}"
ac_write_fail=0
Expand Down
9 changes: 9 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,15 @@ if test "$have_neon" = "yes"; then
fi
AM_CONDITIONAL(HAVE_NEON, test "$have_neon" = "yes")

AC_ARG_ENABLE(vfp, [AC_HELP_STRING([--enable-vfp],[enable VFP extensions])], have_vfp=$enableval, have_vfp=no)
if test "$have_vfp" = "yes"; then
if test "$SIMD" != "sse"; then
AC_MSG_ERROR([conflicting SIMD extensions specified])
fi
AC_DEFINE(HAVE_VFP,1,[Define to FFT with ARM VFP.])
fi
AM_CONDITIONAL(HAVE_VFP, test "$have_vfp" = "yes")

#AC_CANONICAL_HOST
AC_MSG_NOTICE([host is "${host}"])
case "${host}" in
Expand Down
7 changes: 6 additions & 1 deletion src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,21 @@ endif
libffts_includedir=$(includedir)/ffts
libffts_include_HEADERS = ../include/ffts.h


if HAVE_VFP
libffts_la_SOURCES += vfp.s
else
if HAVE_NEON

if DYNAMIC_DISABLED
libffts_la_SOURCES += neon_static_f.s neon_static_i.s
else
libffts_la_SOURCES += neon.s vfp.s
libffts_la_SOURCES += neon.s
endif

else
if HAVE_SSE
libffts_la_SOURCES += sse.s
endif
endif
endif
27 changes: 15 additions & 12 deletions src/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,10 @@ build_triplet = @build@
host_triplet = @host@
@DYNAMIC_DISABLED_TRUE@am__append_1 = ffts_static.c
@DYNAMIC_DISABLED_FALSE@am__append_2 = codegen.c
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@am__append_3 = neon_static_f.s neon_static_i.s
@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@am__append_4 = neon.s vfp.s
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@am__append_5 = sse.s
@HAVE_VFP_TRUE@am__append_3 = vfp.s
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__append_4 = neon_static_f.s neon_static_i.s
@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__append_5 = neon.s
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@am__append_6 = sse.s
subdir = src
DIST_COMMON = $(libffts_include_HEADERS) $(srcdir)/Makefile.am \
$(srcdir)/Makefile.in $(top_srcdir)/depcomp
Expand Down Expand Up @@ -99,18 +100,20 @@ am__installdirs = "$(DESTDIR)$(libdir)" \
LTLIBRARIES = $(lib_LTLIBRARIES)
libffts_la_LIBADD =
am__libffts_la_SOURCES_DIST = ffts.c ffts_nd.c ffts_real.c \
ffts_real_nd.c patterns.c ffts_static.c codegen.c \
neon_static_f.s neon_static_i.s neon.s vfp.s sse.s
ffts_real_nd.c patterns.c ffts_static.c codegen.c vfp.s \
neon_static_f.s neon_static_i.s neon.s sse.s
@DYNAMIC_DISABLED_TRUE@am__objects_1 = ffts_static.lo
@DYNAMIC_DISABLED_FALSE@am__objects_2 = codegen.lo
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@am__objects_3 = \
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@ neon_static_f.lo \
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@ neon_static_i.lo
@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@am__objects_4 = neon.lo vfp.lo
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@am__objects_5 = sse.lo
@HAVE_VFP_TRUE@am__objects_3 = vfp.lo
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__objects_4 = neon_static_f.lo \
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@ neon_static_i.lo
@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__objects_5 = neon.lo
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@am__objects_6 = \
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@ sse.lo
am_libffts_la_OBJECTS = ffts.lo ffts_nd.lo ffts_real.lo \
ffts_real_nd.lo patterns.lo $(am__objects_1) $(am__objects_2) \
$(am__objects_3) $(am__objects_4) $(am__objects_5)
$(am__objects_3) $(am__objects_4) $(am__objects_5) \
$(am__objects_6)
libffts_la_OBJECTS = $(am_libffts_la_OBJECTS)
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
depcomp = $(SHELL) $(top_srcdir)/depcomp
Expand Down Expand Up @@ -263,7 +266,7 @@ top_srcdir = @top_srcdir@
lib_LTLIBRARIES = libffts.la
libffts_la_SOURCES = ffts.c ffts_nd.c ffts_real.c ffts_real_nd.c \
patterns.c $(am__append_1) $(am__append_2) $(am__append_3) \
$(am__append_4) $(am__append_5)
$(am__append_4) $(am__append_5) $(am__append_6)
libffts_includedir = $(includedir)/ffts
libffts_include_HEADERS = ../include/ffts.h
all: all-am
Expand Down
35 changes: 18 additions & 17 deletions src/codegen.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,11 @@
#include <sys/types.h>
#include <sys/mman.h>

#ifdef __ARM_NEON__
#include "codegen_neon.h"
// #include "neon_float.h"
#ifdef HAVE_NEON
#include "codegen_arm.h"
#include "neon.h"
#elif HAVE_VFP
#include "codegen_arm.h"
#include "vfp.h"
#else
#include "codegen_sse.h"
Expand Down Expand Up @@ -99,15 +100,15 @@ uint32_t LUT_offset(size_t N, size_t leafN) {
for(i=0;i<n_luts-1;i++) {
p_lut_size = lut_size;
if(!i || hardcoded) {
#ifdef __ARM_NEON__
#ifdef __arm__
if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t);
else lut_size += n/4 * sizeof(cdata_t);
#else
lut_size += n/4 * 2 * sizeof(cdata_t);
#endif
// n *= 2;
} else {
#ifdef __ARM_NEON__
#ifdef __arm__
lut_size += n/8 * 3 * sizeof(cdata_t);
#else
lut_size += n/8 * 3 * 2 * sizeof(cdata_t);
Expand All @@ -118,7 +119,7 @@ uint32_t LUT_offset(size_t N, size_t leafN) {
return lut_size;
}

#ifdef __ARM_NEON__
#ifdef __arm__
typedef uint32_t insns_t;
#else
typedef uint8_t insns_t;
Expand Down Expand Up @@ -147,7 +148,7 @@ void insert_nops(uint8_t **p, uint32_t count) {


void align_mem16(uint8_t **p, uint32_t offset) {
#ifdef __ARM_NEON__
#ifdef __x86_64__
int r = (16 - (offset & 0xf)) - ((uint32_t)(*p) & 0xf);
r = (16 + r) & 0xf;
insert_nops(p, r);
Expand All @@ -170,7 +171,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {

pps = ps;

#ifdef __ARM_NEON__
#ifdef __arm__
if(N < 8192) p->transform_size = 8192;
else p->transform_size = N;
#else
Expand Down Expand Up @@ -203,7 +204,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {

insns_t *x_8_addr = fp;
#ifdef __arm__
#ifdef __ARM_NEON__
#ifdef HAVE_NEON
memcpy(fp, neon_x8, neon_x8_t - neon_x8);
if(sign < 0) {
fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
Expand All @@ -229,7 +230,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
insns_t *x_4_addr = fp;
#ifdef __arm__

#ifdef __ARM_NEON__
#ifdef HAVE_NEON
memcpy(fp, neon_x4, neon_x8 - neon_x4);
if(sign < 0) {
fp[26] ^= 0x00200000; fp[28] ^= 0x00200000; fp[31] ^= 0x00200000; fp[32] ^= 0x00200000;
Expand All @@ -248,7 +249,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
#endif
insns_t *start = fp;

#ifdef __ARM_NEON__
#ifdef __arm__
*fp = PUSH_LR(); fp++;
*fp = 0xed2d8b10; fp++;

Expand All @@ -271,7 +272,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {

#ifdef __arm__
*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
#ifdef __ARM_NEON__
#ifdef HAVE_NEON
MOVI(&fp, 11, p->i0);
#else
MOVI(&fp, 11, p->i0);
Expand All @@ -291,7 +292,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
#endif
//fp++;
#ifdef __arm__
#ifdef __ARM_NEON__
#ifdef HAVE_NEON
memcpy(fp, neon_ee, neon_oo - neon_ee);
if(sign < 0) {
fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
Expand Down Expand Up @@ -425,14 +426,14 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {


if(pps[0] == 2*leafN) {
// CALL(&fp, x_4_addr);
CALL(&fp, x_4_addr);
// }else if(!pps[2]){
// //uint32_t *x_8_t_addr = fp;
// memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
// fp += (neon_ee - neon_x8_t) / 4;
// //*fp++ = BL(fp+2, x_8_t_addr);
}else{
// CALL(&fp, x_8_addr);
CALL(&fp, x_8_addr);
}

pAddr = pps[1] * 4;
Expand All @@ -445,7 +446,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
}
#endif
#ifdef __arm__
#ifdef __ARM_NEON__
#ifdef HAVE_NEON
if(__builtin_ctzl(N) & 1){
ADDI(&fp, 2, 7, 0);
ADDI(&fp, 7, 9, 0);
Expand Down Expand Up @@ -612,7 +613,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
*fp = BL(fp+2, x_4_addr); fp++;
}else if(!pps[2]){
//uint32_t *x_8_t_addr = fp;
#ifdef __ARM_NEON__
#ifdef HAVE_NEON
memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
if(sign < 0) {
fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
Expand Down
4 changes: 2 additions & 2 deletions src/codegen_neon.h → src/codegen_arm.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@
*/

#ifndef __CODEGEN_NEON_H__
#define __CODEGEN_NEON_H__
#ifndef __CODEGEN_ARM_H__
#define __CODEGEN_ARM_H__



Expand Down
14 changes: 7 additions & 7 deletions src/ffts.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
if(N >= 32) {
ffts_init_offsets(p, N, leafN);
#ifdef __arm__
#ifdef __ARM_NEON__
#ifdef HAVE_NEON
ffts_init_is(p, N, leafN, 1);
#else
ffts_init_is(p, N, leafN, 1);
Expand All @@ -120,7 +120,7 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
p->i2 = N/leafN/3;

#ifdef __arm__
#ifdef __ARM_NEON__
#ifdef HAVE_NEON
p->i0/=2;
p->i1/=2;
#endif
Expand Down Expand Up @@ -164,15 +164,15 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {

for(i=0;i<n_luts;i++) {
if(!i || hardcoded) {
#ifdef __ARM_NEON__
#ifdef HAVE_NEON
if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t);
else lut_size += n/4 * sizeof(cdata_t);
#else
lut_size += n/4 * 2 * sizeof(cdata_t);
#endif
n *= 2;
} else {
#ifdef __ARM_NEON__
#ifdef HAVE_NEON
lut_size += n/8 * 3 * sizeof(cdata_t);
#else
lut_size += n/8 * 3 * 2 * sizeof(cdata_t);
Expand All @@ -196,7 +196,7 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
n = leafN*2;
if(hardcoded) n = 8;

#ifdef __ARM_NEON__
#ifdef HAVE_NEON
V neg = (sign < 0) ? VLIT4(0.0f, 0.0f, 0.0f, 0.0f) : VLIT4(-0.0f, -0.0f, -0.0f, -0.0f);
#endif

Expand Down Expand Up @@ -234,7 +234,7 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
//w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
float *fw = (float *)w;
VS temp0, temp1, temp2;
#ifdef __ARM_NEON__
#ifdef HAVE_NEON
for(j=0;j<n/4;j+=4) {
temp0 = VLD2(fw0 + j*2);
temp0.val[1] = VXOR(temp0.val[1], neg);
Expand Down Expand Up @@ -289,7 +289,7 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
//w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32);
float *fw = (float *)w;
VS temp0, temp1, temp2;
#ifdef __ARM_NEON__
#ifdef HAVE_NEON
for(j=0;j<n/8;j+=4) {
temp0 = VLD2(fw0 + j*2);
temp0.val[1] = VXOR(temp0.val[1], neg);
Expand Down

0 comments on commit 7f43121

Please sign in to comment.