diff --git a/src/cl/base.cl b/src/cl/base.cl index c046f67..92db740 100644 --- a/src/cl/base.cl +++ b/src/cl/base.cl @@ -148,7 +148,8 @@ typedef global const double2* BigTab; // Prototypes void read(u32 WG, u32 N, T2 *u, const global T2 *in, u32 base); void write(u32 WG, u32 N, T2 *u, global T2 *out, u32 base); -void bar(void); +OVERLOAD void bar(); +OVERLOAD void bar(u32); void read(u32 WG, u32 N, T2 *u, const global T2 *in, u32 base) { in += base + (u32) get_local_id(0); @@ -162,17 +163,6 @@ void write(u32 WG, u32 N, T2 *u, global T2 *out, u32 base) { T2 U2(T a, T b) { return (T2) (a, b); } -void bar() { - // barrier(CLK_LOCAL_MEM_FENCE) is correct, but it turns out that on some GPUs - // (in particular on Radeon VII and Radeon PRO VII) barrier(0) works as well and is faster. - // So allow selecting the faster path when it works with -use FAST_BARRIER -#if FAST_BARRIER - barrier(0); -#else - barrier(CLK_LOCAL_MEM_FENCE); -#endif -} - // On "classic" AMD GCN GPUs such as Radeon VII, the wavefront size was always 64. On RDNA GPUs the wavefront can // be configured to be either 64 or 32. We use the FAST_BARRIER define as an indicator for GCN GPUs. // On Nvidia GPUs the wavefront size is 32. @@ -183,3 +173,20 @@ void bar() { #define WAVEFRONT 32 #endif #endif + +OVERLOAD void bar() { + // barrier(CLK_LOCAL_MEM_FENCE) is correct, but it turns out that on some GPUs + // (in particular on Radeon VII and Radeon PRO VII) barrier(0) works as well and is faster. + // So allow selecting the faster path when it works with -use FAST_BARRIER +#if FAST_BARRIER + barrier(0); +#else + barrier(CLK_LOCAL_MEM_FENCE); +#endif +} + +OVERLOAD void bar(u32 WG) { if (WG > WAVEFRONT) { bar(); } } + +// A half-barrier is only needed when half-a-workgroup needs a barrier. +// This is used e.g. by the double-wide tailSquare, where LDS is split between the halves. +void halfBar() { if (get_enqueued_local_size(0) / 2 > WAVEFRONT) { bar(); } } diff --git a/src/cl/fftbase.cl b/src/cl/fftbase.cl index 4b0ef65..082edd3 100644 --- a/src/cl/fftbase.cl +++ b/src/cl/fftbase.cl @@ -70,16 +70,6 @@ void shufl(u32 WG, local T2 *lds2, T2 *u, u32 n, u32 f) { u32 me = get_local_id(0); local T* lds = (local T*) lds2; -#if 0 - // This also works for *f* that is not a power of two. - for (u32 i = 0; i < n; ++i) { lds[i * f + (me / f) * f * n + me % f] = u[i].x; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; } - bar(); - for (u32 i = 0; i < n; ++i) { lds[i * f + (me / f) * f * n + me % f] = u[i].y; } - bar(); - for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } -#else u32 mask = f - 1; assert((mask & (mask + 1)) == 0); for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].x; } @@ -89,7 +79,6 @@ void shufl(u32 WG, local T2 *lds2, T2 *u, u32 n, u32 f) { for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].y; } bar(); for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } -#endif } // Shufl two simultaneous FFT_HEIGHTs. Needed for tailSquared where u and v are computed simultaneously in different threads. @@ -101,32 +90,21 @@ void shufl2(u32 WG, local T2 *lds2, T2 *u, u32 n, u32 f) { // Partition lds memory into upper and lower halves assert(WG == G_H); - lds2 += (me / WG) * SMALL_HEIGHT; // Accessing lds memory as doubles is faster than T2 accesses - local T* lds = (local T*) lds2; + local T* lds = ((local T*) lds2) + (me / WG) * SMALL_HEIGHT; me = me % WG; -#if 0 - // This also works for *f* that is not a power of two. - for (u32 i = 0; i < n; ++i) { - u32 idx = i * f + (me / f) * f * n + me % f; - lds[idx] = u[i].x; - lds[SMALL_HEIGHT + idx] = u[i].y; - } - if (WG > WAVEFRONT) bar(); - for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; u[i].y = lds[SMALL_HEIGHT + i * WG + me]; } -#else u32 mask = f - 1; assert((mask & (mask + 1)) == 0); - for (u32 i = 0; i < n; ++i) { - u32 idx = i * f + (me & ~mask) * n + (me & mask); - lds[idx] = u[i].x; - lds[SMALL_HEIGHT + idx] = u[i].y; - } - if (WG > WAVEFRONT) bar(); - for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; u[i].y = lds[SMALL_HEIGHT + i * WG + me]; } -#endif + + for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].x; } + bar(WG); + for (u32 i = 0; i < n; ++i) { u[i].x = lds[i * WG + me]; } + bar(WG); + for (u32 i = 0; i < n; ++i) { lds[i * f + (me & ~mask) * n + (me & mask)] = u[i].y; } + bar(WG); + for (u32 i = 0; i < n; ++i) { u[i].y = lds[i * WG + me]; } } diff --git a/src/cl/fftheight.cl b/src/cl/fftheight.cl index c7fcd40..70e75ab 100644 --- a/src/cl/fftheight.cl +++ b/src/cl/fftheight.cl @@ -42,7 +42,7 @@ void fft_HEIGHT(local T2 *lds, T2 *u, Trig trig, T2 w) { void fft_HEIGHT2(local T2 *lds, T2 *u, Trig trig, T2 w) { u32 WG = SMALL_HEIGHT / NH; for (u32 s = 1; s < SMALL_HEIGHT / NH; s *= NH) { - if (/* WG > WAVEFRONT && */ s > 1) { bar(); } + if (s > 1) { bar(WG); } fft_NH(u); w = bcast(w, s); @@ -84,7 +84,7 @@ void fft_HEIGHT2(local T2 *lds, T2 *u, Trig trig, T2 w) { #endif for (u32 s = 1; s < WG; s *= NH) { - if (s > 1) { if (WG > WAVEFRONT) bar(); } + if (s > 1) { bar(WG); } fft_NH(u); tabMul(WG, trig, u, NH, s, me % WG); shufl2(WG, lds, u, NH, s); diff --git a/src/cl/tailsquare.cl b/src/cl/tailsquare.cl index 23f6374..725927c 100644 --- a/src/cl/tailsquare.cl +++ b/src/cl/tailsquare.cl @@ -251,7 +251,7 @@ void pairSq2_special(T2 *u, T2 base_squared) { } KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { - local T2 lds[SMALL_HEIGHT*2]; // change reverse line to halve this + local T2 lds[SMALL_HEIGHT]; T2 u[NH]; @@ -263,7 +263,10 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { u32 me = get_local_id(0); u32 lowMe = me % G_H; // lane-id in one of the two halves (half-workgroups). - u32 line = (me < G_H) ? line_u : line_v; + // It's not clear where up/down is, so we're going to call the halves "first-half" and "second-half". + bool isSecondHalf = me >= G_H; + + u32 line = !isSecondHalf ? line_u : line_v; // Read lines u and v readTailFusedLine(in, u, line, lowMe); @@ -278,12 +281,17 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) { T2 trig = slowTrig_N(line + H * lowMe, ND / NH * 2); - revSwapLine(G_H, lds, u + NH/2, NH/2); + bar(G_H); + + revCrossLine(G_H, lds, u + NH/2, NH/2, isSecondHalf); pairSq(NH/2, u, u + NH/2, trig, false); - revSwapLine(G_H, lds, u + NH/2, NH/2); - - // if (G_H > WAVEFRONT) bar(); - bar(); + + bar(G_H); + // We change the LDS halves we're using in order to enable half-bars + revCrossLine(G_H, lds, u + NH/2, NH/2, !isSecondHalf); + + bar(G_H); + fft_HEIGHT2(lds, u, smallTrig, w); // Write lines u and v diff --git a/src/cl/tailutil.cl b/src/cl/tailutil.cl index 467f15d..a33f280 100644 --- a/src/cl/tailutil.cl +++ b/src/cl/tailutil.cl @@ -37,21 +37,18 @@ void reverseLine(u32 WG, local T2 *lds2, T2 *u) { for (u32 i = 0; i < NH; ++i) { u[i] = lds[WG * i]; } } -void revSwapLine(u32 WG, local T2* lds2, T2 *u, u32 n) { +// This is used to reverse the second part of a line, and cross the reversed parts between the halves. +void revCrossLine(u32 WG, local T2* lds2, T2 *u, u32 n, bool writeSecondHalf) { u32 me = get_local_id(0); u32 lowMe = me % WG; - bool upHalf = me >= WG; + + u32 revLowMe = WG - 1 - lowMe; - // We are initially using LDS with the same half-discipline as the fft_HEIGHT() which precedes us, - // so only a half-bar is needed here. - // if (WG > WAVEFRONT) { bar(); } - bar(); + for (u32 i = 0; i < n; ++i) { lds2[WG * n * writeSecondHalf + WG * (n - 1 - i) + revLowMe] = u[i]; } - u32 revLowMe = WG - 1 - lowMe; + bar(); // we need a full bar because we're crossing halves - for (u32 i = 0; i < n; ++i) { lds2[WG * n * upHalf + WG * (n - 1 - i) + revLowMe] = u[i]; } - bar(); // we need a full bar because we're going to swap halves - for (u32 i = 0; i < n; ++i) { u[i] = lds2[WG * n * !upHalf + WG * i + lowMe]; } + for (u32 i = 0; i < n; ++i) { u[i] = lds2[WG * n * !writeSecondHalf + WG * i + lowMe]; } } // computes 2*(a.x*b.x+a.y*b.y) + i*2*(a.x*b.y+a.y*b.x)