From ae3be6502bb38dd0f51477f237b38bd1ec72a8cf Mon Sep 17 00:00:00 2001 From: Mihai Preda Date: Mon, 20 Aug 2018 23:36:05 +1000 Subject: [PATCH] small cleanup, rm old --- .gitignore | 3 +- bug/Makefile | 2 - bug/bug.cl | 98 -------------------- bug/bug.cpp | 45 --------- bug/clwrap.h | 238 ------------------------------------------------ bug/common.h | 34 ------- bug/timeutil.h | 28 ------ bug/tinycl.h | 102 --------------------- cuda/timeutil.h | 29 ------ gpuowl.cl | 17 ---- 10 files changed, 2 insertions(+), 594 deletions(-) delete mode 100644 bug/Makefile delete mode 100644 bug/bug.cl delete mode 100644 bug/bug.cpp delete mode 100644 bug/clwrap.h delete mode 100644 bug/common.h delete mode 100644 bug/timeutil.h delete mode 100644 bug/tinycl.h delete mode 100644 cuda/timeutil.h diff --git a/.gitignore b/.gitignore index 42709bd2..809c6c87 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ -gpuowl +openowl +openowl-notf cudaowl fftbench gpuowl.log diff --git a/bug/Makefile b/bug/Makefile deleted file mode 100644 index 07871749..00000000 --- a/bug/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -bug: bug.cpp *.h - g++ -o bug bug.cpp -lOpenCL -L/opt/rocm/opencl/lib/x86_64 -std=c++14 diff --git a/bug/bug.cl b/bug/bug.cl deleted file mode 100644 index 018e14ea..00000000 --- a/bug/bug.cl +++ /dev/null @@ -1,98 +0,0 @@ -typedef uint T; -typedef uint2 T2; - -// Prime M(31) == 2^31 - 1 -#define M31 0x7fffffffu - -// make a pair of Ts. -T2 U2(T a, T b) { return (T2)(a, b); } - -ulong u64(uint a) { return a; } // cast to 64 bits. - -uint lo(ulong a) { return a & 0xffffffff; } -uint up(ulong a) { return a >> 32; } - -// input 32 bits except 2^32-1; output 31 bits. -uint mod(uint x) { return x; } - -// input 63 bits; output 31 bits. -uint bigmod(ulong x) { - x = u64(2 * up(x)) + lo(x); // 'up' is 31 bits. - return mod(2 * up(x) + lo(x)); -} - -// negative: -x; input 31 bits. -uint neg(uint x) { return M31 - x; } - -uint add1(uint a, uint b) { return mod(a + b); } -uint sub1(uint a, uint b) { return add1(a, neg(b)); } - -uint2 add(uint2 a, uint2 b) { return U2(add1(a.x, b.x), add1(a.y, b.y)); } -uint2 sub(uint2 a, uint2 b) { return U2(sub1(a.x, b.x), sub1(a.y, b.y)); } - -// Input can be full 32bits. k <= 30 (i.e. mod 31). -uint shl1(uint a, uint k) { return bigmod(u64(a) << k); } - -ulong wideMul(uint a, uint b) { return u64(a) * b; } - -// The main, complex multiplication; input and output 31 bits. -// (a + i*b) * (c + i*d) mod reduced. -uint2 mul(uint2 u, uint2 v) { - uint a = u.x, b = u.y, c = v.x, d = v.y; - ulong k1 = wideMul(c, add1(a, b)); - ulong k2 = wideMul(a, sub1(d, c)); - ulong k3 = wideMul(b, neg(add1(d, c))); - // k1..k3 have at most 62 bits, so sums are at most 63 bits. - return U2(bigmod(k1 + k3), bigmod(k1 + k2)); -} - -#define X2(a, b) { T2 t = a; a = add(t, b); b = sub(t, b); } - -void fft4Core(T2 *u) { - X2(u[0], u[2]); - X2(u[1], u[3]); - X2(u[0], u[1]); - X2(u[2], u[3]); -} - -void fft8Core(T2 *u) { - for (int i = 0; i < 4; ++i) { X2(u[i], u[i + 4]); } - fft4Core(u); - fft4Core(u + 4); -} - -void fft8(T2 *u) { - fft8Core(u); -} - -void tabMul(const T2 *trig, T2 *u, uint n) { - uint me = get_local_id(0); - for (int i = 1; i < n; ++i) { u[i] = mul(u[i], trig[me / 8 + i * 8]); } -} - -#define WG 64 - -kernel __attribute__((reqd_work_group_size(WG, 1, 1))) void bug(global T2 *in, global T2 *out, global T2 *trig) { - local uint lds[8 * 256]; - uint me = get_local_id(0); - T2 u[8]; - for (int i = 0; i < 8; ++i) { u[i] = in[WG * i + me]; } - fft8(u); - - uint n = 8; - uint f = 8; - uint m = me / f; - - for (uint i = 0; i < n; ++i) { lds[(m + i * WG / f) / n * f + m % n * WG + me % f] = u[i].x; } - barrier(CLK_LOCAL_MEM_FENCE); - for (uint i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; } - barrier(CLK_LOCAL_MEM_FENCE); - for (uint i = 0; i < n; ++i) { lds[(m + i * WG / f) / n * f + m % n * WG + me % f] = u[i].y; } - barrier(CLK_GLOBAL_MEM_FENCE); - for (uint i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } - - // mem_fence(CLK_LOCAL_MEM_FENCE); // Comment or un-comment this to observe different behavior. - - tabMul(trig, u, 8); - for (int i = 0; i < 8; ++i) { out[WG * i + me] = u[i]; } -} diff --git a/bug/bug.cpp b/bug/bug.cpp deleted file mode 100644 index c3c11b47..00000000 --- a/bug/bug.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include "clwrap.h" - -void log(const char *fmt, ...) { - va_list va; - va_start(va, fmt); - vprintf(fmt, va); - va_end(va); -} - - -int main() { - cl_device_id device = 0; - getDeviceIDs(true, 1, &device); - - cl_context context = createContext(device); - cl_queue queue = makeQueue(device, context); - cl_program program = compile(device, context, "bug.cl", "-save-temps=tmp"); - cl_kernel kernel = makeKernel(program, "bug"); - int size = sizeof(int) * 2 * 256 * 8; - cl_mem buf1 = makeBuf(context, CL_MEM_READ_WRITE, size); - cl_mem buf2 = makeBuf(context, CL_MEM_READ_WRITE, size); - cl_mem trig = makeBuf(context, CL_MEM_READ_WRITE, size); - - setArg(kernel, 0, buf1); - setArg(kernel, 1, buf2); - setArg(kernel, 2, trig); - - int *data = new int[256 * 2 * 8](); - data[0] = 1; - write(queue, true, buf1, size, data); - - for (int i = 0; i < 256 * 8; ++i) { - data[2*i] = 0x49fb5248; - data[2*i+1] = 0x46515668; - } - write(queue, true, trig, size, data); - - run(queue, kernel, 64, "bug", 64); - read(queue, true, buf2, size, data); - for (int thread = 0; thread < 33; ++thread) { - for (int i = 0; i < 8; ++i) { - printf("%d %d: %8x %8x\n", thread, i, data[(thread + i * 64) * 2], data[(thread + i * 64) * 2 + 1]); - } - } -} diff --git a/bug/clwrap.h b/bug/clwrap.h deleted file mode 100644 index b878fe26..00000000 --- a/bug/clwrap.h +++ /dev/null @@ -1,238 +0,0 @@ -// Copyright (C) 2017 Mihai Preda. - -#pragma once - -#include "tinycl.h" -#include "timeutil.h" - -#include -#include -#include - -#include -#include - -using std::string; -using std::vector; - -bool check(int err, const char *mes = nullptr) { - bool ok = (err == CL_SUCCESS); - if (!ok) { - if (mes) { - log("error %d (%s)\n", err, mes); - } else { - log("error %d\n", err); - } - } - return ok; -} - -#define CHECK(what) assert(check(what)); -#define CHECK2(what, mes) assert(check(what, mes)); - -void getInfo(cl_device_id id, int what, size_t bufSize, void *buf) { - size_t outSize = 0; - CHECK(clGetDeviceInfo(id, what, bufSize, buf, &outSize)); - assert(outSize <= bufSize); -} - -bool getInfoMaybe(cl_device_id id, int what, size_t bufSize, void *buf) { - return clGetDeviceInfo(id, what, bufSize, buf, NULL) == CL_SUCCESS; -} - -bool getTopology(cl_device_id id, size_t bufSize, char *buf) { - cl_device_topology_amd top; - if (!getInfoMaybe(id, CL_DEVICE_TOPOLOGY_AMD, sizeof(top), &top)) { return false; } - snprintf(buf, bufSize, "%x:%u.%u", - (unsigned) (unsigned char) top.pcie.bus, (unsigned) top.pcie.device, (unsigned) top.pcie.function); - return true; -} - -int getDeviceIDs(bool onlyGPU, size_t size, cl_device_id *out) { - cl_platform_id platforms[8]; - unsigned nPlatforms; - CHECK(clGetPlatformIDs(8, platforms, &nPlatforms)); - - unsigned n = 0; - for (int i = 0; i < (int) nPlatforms && size > n; ++i) { - unsigned delta = 0; - CHECK(clGetDeviceIDs(platforms[i], onlyGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_ALL, size - n, out + n, &delta)); - n += delta; - } - return n; -} - -int getNumberOfDevices() { - cl_platform_id platforms[8]; - unsigned nPlatforms; - CHECK(clGetPlatformIDs(8, platforms, &nPlatforms)); - - unsigned n = 0; - for (int i = 0; i < (int) nPlatforms; ++i) { - unsigned delta = 0; - CHECK(clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &delta)); - n += delta; - } - return n; -} - -string getDeviceName(cl_device_id id) { - char boardName[64]; - bool hasBoardName = getInfoMaybe(id, CL_DEVICE_BOARD_NAME_AMD, sizeof(boardName), boardName); - - char topology[64]; - bool hasTopology = getTopology(id, sizeof(topology), topology); - - unsigned computeUnits; - getInfo(id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(computeUnits), &computeUnits); - - return (hasBoardName && hasTopology) ? string(boardName) + " " + std::to_string(computeUnits) + " @" + topology : ""; -} - -string getDeviceInfo(cl_device_id device) { - char name[64], version[64]; - getInfo(device, CL_DEVICE_NAME, sizeof(name), name); - getInfo(device, CL_DEVICE_VERSION, sizeof(version), version); - unsigned computeUnits, frequency; - getInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(computeUnits), &computeUnits); - getInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(frequency), &frequency); - - string board = getDeviceName(device); - - char info[256]; - if (!board.empty()) { - snprintf(info, sizeof(info), "%s, %s %4uMHz", board.c_str(), name, frequency); - } else { - snprintf(info, sizeof(info), "%s, %2ux%4uMHz", name, computeUnits, frequency); - } - return info; -} - -cl_context createContext(cl_device_id device) { - int err; - cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); - CHECK2(err, "clCreateContext"); - return context; -} - -typedef cl_command_queue cl_queue; - -void release(cl_context context) { CHECK(clReleaseContext(context)); } -void release(cl_program program) { CHECK(clReleaseProgram(program)); } -void release(cl_mem buf) { CHECK(clReleaseMemObject(buf)); } -void release(cl_queue queue) { CHECK(clReleaseCommandQueue(queue)); } -void release(cl_kernel k) { CHECK(clReleaseKernel(k)); } - -bool dumpBinary(cl_program program, const char *fileName) { - if (auto fo = open(fileName, "w")) { - size_t size; - CHECK(clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size), &size, NULL)); - char *buf = new char[size + 1]; - CHECK(clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(&buf), &buf, NULL)); - fwrite(buf, 1, size, fo.get()); - delete[] buf; - return true; - } - return false; -} - -static cl_program createProgram(cl_device_id device, cl_context context, const string &fileName) { - string stub = string("#include \"") + fileName + "\"\n"; - - const char *ptr = stub.c_str(); - size_t size = stub.size(); - int err; - cl_program program = clCreateProgramWithSource(context, 1, &ptr, &size, &err); - CHECK2(err, "clCreateProgram"); - return program; -} - -static bool build(cl_program program, cl_device_id device, const string &extraArgs) { - Timer timer; - string args = string("-I. -cl-fast-relaxed-math ") + extraArgs; - int err = clBuildProgram(program, 1, &device, args.c_str(), NULL, NULL); - bool ok = (err == CL_SUCCESS); - if (!ok) { log("OpenCL compilation error %d (args %s)\n", err, args.c_str()); } - - size_t logSize; - clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, nullptr, &logSize); - if (logSize) { - std::unique_ptr buf(new char[logSize + 1]); - clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, buf.get(), &logSize); - buf.get()[logSize] = 0; - log("%s\n", buf.get()); - } - if (ok) { - log("OpenCL compilation in %d ms, with \"%s\"\n", timer.deltaMillis(), args.c_str()); - } - return ok; -} - -string join(const string &prefix, const vector &elems) { - string big = ""; - for (auto s : elems) { big += prefix + s; } - return big; -} - -cl_program compile(cl_device_id device, cl_context context, const string &fileName, const string &extraArgs, - const vector &defVect = {}) { - cl_program program = createProgram(device, context, fileName); - if (!program) { return program; } - - string args = join(" -D", defVect) + " " + extraArgs; - bool tryCL20 = true; - if ((tryCL20 && build(program, device, string("-cl-std=CL2.0 ") + args)) - || build(program, device, args)) { - return program; - } else { - release(program); - return 0; - } -} - // Other options: - // * to output GCN ISA: -save-temps or -save-temps=prefix or -save-temps=folder/ - // * to disable all OpenCL optimization (do not use): -cl-opt-disable - // * -cl-uniform-work-group-size - // * -fno-bin-llvmir - // * various: -fno-bin-source -fno-bin-amdil - - -cl_kernel makeKernel(cl_program program, const char *name) { - int err; - cl_kernel k = clCreateKernel(program, name, &err); - CHECK2(err, name); - return k; -} - -void setArg(cl_kernel k, int pos, const auto &value) { CHECK(clSetKernelArg(k, pos, sizeof(value), &value)); } - -cl_mem makeBuf(cl_context context, unsigned kind, size_t size, const void *ptr = 0) { - int err; - cl_mem buf = clCreateBuffer(context, kind, size, (void *) ptr, &err); - CHECK2(err, "clCreateBuffer"); - return buf; -} - -cl_queue makeQueue(cl_device_id d, cl_context c) { - int err; - cl_queue q = clCreateCommandQueue(c, d, 0, &err); - CHECK2(err, "clCreateCommandQueue"); - return q; -} - -void flush( cl_queue q) { CHECK(clFlush(q)); } -void finish(cl_queue q) { CHECK(clFinish(q)); } - -void run(cl_queue queue, cl_kernel kernel, size_t workSize, const string &name, size_t groupSize = 256) { - CHECK2(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &workSize, &groupSize, 0, NULL, NULL), name.c_str()); -} - -void read(cl_queue queue, bool blocking, cl_mem buf, size_t size, void *data, size_t start = 0) { - CHECK(clEnqueueReadBuffer(queue, buf, blocking, start, size, data, 0, NULL, NULL)); -} - -void write(cl_queue queue, bool blocking, cl_mem buf, size_t size, const void *data, size_t start = 0) { - CHECK(clEnqueueWriteBuffer(queue, buf, blocking, start, size, data, 0, NULL, NULL)); -} - -// void copyBuf(cl_queue q, cl_mem src, cl_mem dst, size_t size) { CHECK(clEnqueueCopyBuffer(q, src, dst, 0, 0, size, 0, nullptr, nullptr)); } diff --git a/bug/common.h b/bug/common.h deleted file mode 100644 index af81eecb..00000000 --- a/bug/common.h +++ /dev/null @@ -1,34 +0,0 @@ -#pragma once - -#include -#include - -typedef unsigned char byte; -typedef long long i64; -typedef unsigned long long u64; -typedef int i32; -typedef unsigned u32; - -static_assert(sizeof(u32) == 4, "size u32"); -static_assert(sizeof(u64) == 8, "size u64"); - -#ifdef __GNUC__ -void log(const char *fmt, ...) __attribute__ ((format(printf, 1, 2))); -#else -void log(const char *fmt, ...); -#endif - -namespace std { -template<> struct default_delete { - void operator()(FILE *f) { - // fprintf(stderr, "file closed\n"); - if (f != nullptr) { fclose(f); } - } - }; -} - -std::unique_ptr open(const char *name, const char *mode, bool doLog = true) { - std::unique_ptr f{fopen(name, mode)}; - if (!f && doLog) { log("Can't open '%s' (mode '%s')\n", name, mode); } - return f; -} diff --git a/bug/timeutil.h b/bug/timeutil.h deleted file mode 100644 index 62165381..00000000 --- a/bug/timeutil.h +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (C) 2017 Mihai Preda. - -#pragma once - -#include "common.h" -#include - -class Timer { - u64 prev; - - static u64 timeMicros() { - struct timeval tv; - gettimeofday(&tv, 0); - return tv.tv_sec * 1000000UL + tv.tv_usec; - } - - public: - Timer() : prev(timeMicros()) { } - - u64 deltaMicros() { - u64 now = timeMicros(); - u64 delta = now - prev; - prev = now; - return delta; - } - - int deltaMillis() { return (int) (deltaMicros() / 1000); } -}; diff --git a/bug/tinycl.h b/bug/tinycl.h deleted file mode 100644 index 337f3751..00000000 --- a/bug/tinycl.h +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (C) 2017 Mihai Preda. - -#include "common.h" - -#include -#include - -typedef struct _cl_platform_id * cl_platform_id; -typedef struct _cl_device_id * cl_device_id; -typedef struct _cl_context * cl_context; -typedef struct _cl_command_queue * cl_command_queue; -typedef struct _cl_mem * cl_mem; -typedef struct _cl_program * cl_program; -typedef struct _cl_kernel * cl_kernel; -typedef struct _cl_event * cl_event; -typedef struct _cl_sampler * cl_sampler; - -typedef unsigned cl_bool; -typedef unsigned cl_program_build_info; -typedef unsigned cl_program_info; -typedef unsigned cl_device_info; - -typedef u64 cl_mem_flags; -typedef u64 cl_device_type; -typedef u64 cl_queue_properties; - -extern "C" { - -unsigned clGetPlatformIDs(unsigned, cl_platform_id *, unsigned *); -int clGetDeviceIDs(cl_platform_id, cl_device_type, unsigned, cl_device_id *, unsigned *); -cl_context clCreateContext(const intptr_t *, unsigned, const cl_device_id *, void (*)(const char *, const void *, size_t, void *), void *, int *); -int clReleaseContext(cl_context); -int clReleaseProgram(cl_program); -int clReleaseCommandQueue(cl_command_queue); -int clEnqueueNDRangeKernel(cl_command_queue, cl_kernel, unsigned, const size_t *, const size_t *, const size_t *, unsigned, const cl_event *, cl_event *); -cl_program clCreateProgramWithSource(cl_context, unsigned, const char **, const size_t *, int *); -cl_program clCreateProgramWithBinary(cl_context, unsigned, const cl_device_id *, const size_t *, const unsigned char **, int *, int *); -int clBuildProgram(cl_program, unsigned, const cl_device_id *, const char *, void (*)(cl_program, void *), void *); -int clGetProgramBuildInfo(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *); -int clGetProgramInfo(cl_program, cl_program_info, size_t, void *, size_t *); -int clGetDeviceInfo(cl_device_id, cl_device_info, size_t, void *, size_t *); -int clGetPlatformInfo(cl_platform_id, cl_device_info, size_t, void *, size_t *); - -cl_kernel clCreateKernel(cl_program, const char *, int *); -int clReleaseKernel(cl_kernel); -cl_mem clCreateBuffer(cl_context, cl_mem_flags, size_t, void *, int *); -int clReleaseMemObject(cl_mem); -cl_command_queue clCreateCommandQueue(cl_context, cl_device_id, const cl_queue_properties *, int *); - -int clEnqueueReadBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, - unsigned numEvents, const cl_event *waitEvents, cl_event *outEvent); -int clEnqueueWriteBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, - unsigned numEvent, const cl_event *waitEvents, cl_event *outEvent); - -int clFlush(cl_command_queue); -int clFinish(cl_command_queue); -int clSetKernelArg(cl_kernel, unsigned, size_t, const void *); - -int clReleaseEvent(cl_event); -int clWaitForEvents(unsigned numEvents, const cl_event *); - -} - -#define CL_SUCCESS 0 -#define CL_DEVICE_TYPE_GPU (1 << 2) -#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF -#define CL_PLATFORM_VERSION 0x0901 -#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 -#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C -#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 -#define CL_DEVICE_NAME 0x102B -#define CL_DEVICE_VERSION 0x102F -#define CL_DRIVER_VERSION 0x102D -#define CL_DEVICE_BUILT_IN_KERNELS 0x103F -#define CL_PROGRAM_BINARY_SIZES 0x1165 -#define CL_PROGRAM_BINARIES 0x1166 -#define CL_PROGRAM_BUILD_LOG 0x1183 - -#define CL_MEM_READ_WRITE (1 << 0) -#define CL_MEM_WRITE_ONLY (1 << 1) -#define CL_MEM_READ_ONLY (1 << 2) - -#define CL_MEM_USE_HOST_PTR (1 << 3) -#define CL_MEM_ALLOC_HOST_PTR (1 << 4) -#define CL_MEM_COPY_HOST_PTR (1 << 5) - -#define CL_MEM_HOST_WRITE_ONLY (1 << 7) -#define CL_MEM_HOST_READ_ONLY (1 << 8) -#define CL_MEM_HOST_NO_ACCESS (1 << 9) - -#define CL_INVALID_COMPILER_OPTIONS -66 - - -// AMD -#define CL_DEVICE_TOPOLOGY_AMD 0x4037 -#define CL_DEVICE_BOARD_NAME_AMD 0x4038 - -typedef union -{ - struct { u32 type; u32 data[5]; } raw; - struct { u32 type; char unused[17]; char bus; char device; char function; } pcie; -} cl_device_topology_amd; diff --git a/cuda/timeutil.h b/cuda/timeutil.h deleted file mode 100644 index 1edbaffe..00000000 --- a/cuda/timeutil.h +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (C) 2017 Mihai Preda. - -#pragma once - -#include - -typedef unsigned long long u64; - -class Timer { - u64 prev; - - static u64 timeMicros() { - struct timeval tv; - gettimeofday(&tv, 0); - return u64(tv.tv_sec) * 1000000 + tv.tv_usec; - } - - public: - Timer() : prev(timeMicros()) { } - - u64 deltaMicros() { - u64 now = timeMicros(); - u64 delta = now - prev; - prev = now; - return delta; - } - - int deltaMillis() { return (int) (deltaMicros() / 1000); } -}; diff --git a/gpuowl.cl b/gpuowl.cl index 01a5abb6..58e322b5 100644 --- a/gpuowl.cl +++ b/gpuowl.cl @@ -188,23 +188,6 @@ void fft8(T2 *u) { SWAP(u[3], u[6]); } -/* -void fft8(T2 *u) { - for (int i = 0; i < 4; ++i) { X2(u[i], u[i + 4]); } - u[6] = mul_t4(u[6]); - - X2(u[0], u[2]); - X2(u[1], u[3]); - u[3] = mul_t4(u[3]); - - X2(u[5], u[7]); - u[5] = mul_t4(u[5]) * M_SQRT1_2; - u[7] = u[7] * M_SQRT1_2; - - X2(u[0], u[1]); -} -*/ - // Adapted from: Nussbaumer, "Fast Fourier Transform and Convolution Algorithms", 5.5.4 "5-Point DFT". void fft5(T2 *u) { const double SIN1 = 0x1.e6f0e134454ffp-1; // sin(tau/5), 0.95105651629515353118