Skip to content

Commit

Permalink
Implemented double-wide's trig options for single-wide.
Browse files Browse the repository at this point in the history
Implemented double-wide's single kernel feature for single-wide.
Simplified #defines from three down to two.
The best options will depend on the OpenCL optimizer and the resulting occupancy.
  • Loading branch information
gwoltman authored and preda committed Dec 14, 2024
1 parent 7b750df commit 4dc58d0
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 20 deletions.
15 changes: 9 additions & 6 deletions src/Gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -409,14 +409,17 @@ Gpu::Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u32 E, const vector<KeyVal>&
K(fftHin, "ffthin.cl", "fftHin", hN / nH),
K(tailSquareZero, "tailsquare.cl", "tailSquareZero", SMALL_H / nH * 2),

#if DOUBLE_WIDE
// Two double-wide kernels
#if !SINGLE_WIDE && !SINGLE_KERNEL
// Double-wide tailSquare with two kernels
K(tailSquare, "tailsquare.cl", "tailSquare", hN / nH - SMALL_H / nH * 2),
#elif DOUBLE_WIDE_ONEK
// One double-wide kernel
#elif !SINGLE_WIDE
// Double-wide tailSquare with one kernel
K(tailSquare, "tailsquare.cl", "tailSquare", hN / nH),
#elif !SINGLE_KERNEL
// Single-wide tailSquare with two kernels
K(tailSquare, "tailsquare.cl", "tailSquare", hN / nH / 2 - SMALL_H / nH),
#else
// Old-style single-wide kernel
// Single-wide tailSquare with one kernel
K(tailSquare, "tailsquare.cl", "tailSquare", hN / nH / 2),
#endif

Expand Down Expand Up @@ -827,7 +830,7 @@ static bool testBit(u64 x, int bit) { return x & (u64(1) << bit); }

void Gpu::bottomHalf(Buffer<double>& out, Buffer<double>& inTmp) {
fftMidIn(out, inTmp);
#if DOUBLE_WIDE
#if !SINGLE_KERNEL
tailSquareZero(inTmp, out);
#endif
tailSquare(inTmp, out);
Expand Down
5 changes: 2 additions & 3 deletions src/Gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,8 @@

// Klunky defines for single-wide vs. double-wide tailSquare
// Clean this up once we determine which options to make user visible
#define SINGLE_WIDE 0 // Old single-wide tailSquare
#define DOUBLE_WIDE_ONEK 0 // Double-wide tailSquare in a single kernel
#define DOUBLE_WIDE 1 // Double-wide tailSquare in two kernels
#define SINGLE_WIDE 0 // Old single-wide tailSquare vs. new double-wide tailSquare
#define SINGLE_KERNEL 0 // Implement tailSquare in a single kernel vs. two kernels

struct PRPResult;
struct Task;
Expand Down
11 changes: 8 additions & 3 deletions src/TrigBufCache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@
//#define PREFER_DP_TO_MEM 1 // Good DP GPU. Tuned for Radeon VII.
//#define PREFER_DP_TO_MEM 0 // Poor DP GPU. A typical consumer grade GPU.

// Klunky defines for single-wide vs. double-wide tailSquare
// Clean this up once we determine which options to make user visible
#define SINGLE_WIDE 0 // Old single-wide tailSquare vs. new double-wide tailSquare
#define SINGLE_KERNEL 0 // Implement tailSquare in a single kernel vs. two kernels

#define _USE_MATH_DEFINES
#include <cmath>

Expand Down Expand Up @@ -127,15 +132,15 @@ vector<double2> genSmallTrigCombo(u32 width, u32 middle, u32 size, u32 radix) {
for (u32 me = 0; me < height / radix; ++me) {
tab.push_back(root1(width * middle * height, width * middle * me));
}
// Output the two T2 multipliers to be read by one u,v pair of lines
// Output the one or two T2 multipliers to be read by one u,v pair of lines
for (u32 line = 0; line < width * middle / 2; ++line) {
tab.push_back(root1Fancy(width * middle * height, line));
tab.push_back(root1Fancy(width * middle * height, width * middle - line));
if (!SINGLE_WIDE) tab.push_back(root1Fancy(width * middle * height, width * middle - line));
}
#else
u32 height = size;
for (u32 u = 0; u < width * middle / 2; ++u) {
for (u32 v = 0; v < 2; ++v) {
for (u32 v = 0; v < (SINGLE_WIDE ? 1 : 2); ++v) {
u32 line = (v == 0) ? u : width * middle - u;
for (u32 me = 0; me < height / radix; ++me) {
tab.push_back(root1(width * middle * height, line + width * middle * me));
Expand Down
47 changes: 39 additions & 8 deletions src/cl/tailsquare.cl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#if !defined(SINGLE_WIDE)
#define SINGLE_WIDE 0 // Old single-wide tailSquare vs. new double-wide tailSquare
#endif
#define DOUBLE_WIDE_ONEK 0 // Double-wide tailSquare in a single kernel
#define SINGLE_KERNEL 0 // Implement tailSquare in a single kernel vs. two kernels

// Why does this alternate implementation work? Let t' be the conjugate of t and note that t*t' = 1.
// Now consider these lines from the original implementation (comments appear alongside):
Expand Down Expand Up @@ -104,8 +104,13 @@ KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) {

u32 H = ND / SMALL_HEIGHT;

#if SINGLE_KERNEL
u32 line1 = get_group_id(0);
u32 line2 = line1 ? H - line1 : (H / 2);
#else
u32 line1 = get_group_id(0) + 1;
u32 line2 = H - line1;
#endif
u32 memline1 = transPos(line1, MIDDLE, WIDTH);
u32 memline2 = transPos(line2, MIDDLE, WIDTH);

Expand All @@ -123,13 +128,31 @@ KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) {
bar();
fft_HEIGHT(lds, v, smallTrig, w);

// Compute trig value from scratch. Good on GPUs with high DP throughput.
#if PREFER_DP_TO_MEM >= 2
T2 trig = slowTrig_N(line1 + me * H, ND / NH);

if (line1) {
reverseLine(G_H, lds, v);
pairSq(NH, u, v, trig, false);
reverseLine(G_H, lds, v);
} else {
// Do a little bit of memory access and a little bit of DP math. Good on a Radeon VII.
#elif PREFER_DP_TO_MEM == 1
// Calculate number of trig values used by fft_HEIGHT.
// The trig values used here are pre-computed and stored after the fft_HEIGHT trig values.
u32 height_trigs = SMALL_HEIGHT/NH*(NH-1);
// Read a hopefully cached line of data and one non-cached T2 per line
T2 trig = smallTrig[height_trigs + me]; // Trig values for line zero, should be cached
T2 mult = smallTrig[height_trigs + G_H + line1]; // Line multiplier
trig = cmulFancy(trig, mult);

// On consumer-grade GPUs, it is likely beneficial to read all trig values.
#else
// Calculate number of trig values used by fft_HEIGHT.
// The trig values used here are pre-computed and stored after the fft_HEIGHT trig values.
u32 height_trigs = SMALL_HEIGHT/NH*(NH-1);
// Read pre-computed trig values
T2 trig = smallTrig[height_trigs + line1*G_H + me];
#endif

#if SINGLE_KERNEL
if (line1 == 0) {
// Line 0 is special: it pairs with itself, offset by 1.
reverse(G_H, lds, u + NH/2, true);
pairSq(NH/2, u, u + NH/2, trig, true);
Expand All @@ -141,6 +164,14 @@ KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) {
pairSq(NH/2, v, v + NH/2, trig2, false);
reverse(G_H, lds, v + NH/2, false);
}
else {
#else
if (1) {
#endif
reverseLine(G_H, lds, v);
pairSq(NH, u, v, trig, false);
reverseLine(G_H, lds, v);
}

bar();
fft_HEIGHT(lds, v, smallTrig, w);
Expand Down Expand Up @@ -181,7 +212,7 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) {

u32 H = ND / SMALL_HEIGHT;

#if DOUBLE_WIDE_ONEK
#if SINGLE_KERNEL
u32 line_u = get_group_id(0);
u32 line_v = line_u ? H - line_u : (H / 2);
#else
Expand Down Expand Up @@ -233,7 +264,7 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) {

bar(G_H);

#if DOUBLE_WIDE_ONEK
#if SINGLE_KERNEL
// Line 0 and H/2 are special: they pair with themselves; line 0 is offset by 1.
if (line_u == 0) {
reverse2(lds, u);
Expand Down

0 comments on commit 4dc58d0

Please sign in to comment.