More stride elimination work for AMD GPUs #306

Merged: 4 commits, Nov 22, 2024
12 changes: 7 additions & 5 deletions src/Gpu.cpp
@@ -252,11 +252,13 @@ string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector<
defines += toDefine("TRIG_SIN", coefs.sinCoefs);
defines += toDefine("TRIG_COS", coefs.cosCoefs);

// Calculate fractional bits-per-word = (E % N) / N * 2^64
u32 bpw_hi = (u64(E % N) << 32) / N;
u32 bpw_lo = (((u64(E % N) << 32) % N) << 32) / N;
defines += toDefine("FRAC_BPW_HI", bpw_hi);
defines += toDefine("FRAC_BPW_LO", bpw_lo);
u64 bpw = (u64(bpw_hi) << 32) + bpw_lo;
bpw--; // bpw must not be an exact value -- it must be less than the exact value to get the last biglit value right
defines += toDefine("FRAC_BPW_HI", (u32) (bpw >> 32));
defines += toDefine("FRAC_BPW_LO", (u32) bpw);
u32 bigstep = (bpw * (N / fft.shape.nW())) >> 32;
defines += toDefine("FRAC_BITS_BIGSTEP", bigstep);

@@ -458,9 +460,9 @@ Gpu::Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u32 E, const vector<KeyVal>&
BUF(bufROE, ROE_SIZE),
BUF(bufStatsCarry, CARRY_SIZE),

BUF(buf1, N),
BUF(buf2, N),
BUF(buf3, N),
BUF(buf1, N + N/4), // Lets us play with padding instead of rotating. Need to calculate the actual cost of padding
BUF(buf2, N + N/4),
BUF(buf3, N + N/4),
#undef BUF

statsBits{u32(args.value("STATS", 0))},
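The N + N/4 sizing above leaves room to pad rows rather than rotate them: with a small per-row pad, the row-to-row stride is no longer a large power of two, which is the access pattern this PR is trying to keep away from a single memory channel on AMD GPUs. A sketch of the idea only; the WIDTH and PAD values, and whether padding is applied per row, are assumptions rather than something this diff implements yet.

typedef unsigned u32;

// Illustrative padded layout: rows of WIDTH values, each followed by PAD
// unused slots. The row stride becomes WIDTH + PAD (e.g. 1040 instead of
// 1024), and the extra space fits in N + N/4 as long as PAD <= WIDTH / 4.
enum { WIDTH = 1024, PAD = 16 };

u32 paddedIndex(u32 row, u32 col) { return row * (WIDTH + PAD) + col; }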
106 changes: 2 additions & 104 deletions src/cl/base.cl
@@ -151,117 +151,15 @@ void write(u32 WG, u32 N, T2 *u, global T2 *out, u32 base);
void bar(void);

void read(u32 WG, u32 N, T2 *u, const global T2 *in, u32 base) {
for (u32 i = 0; i < N; ++i) { u[i] = in[base + i * WG + (u32) get_local_id(0)]; }
in += base + (u32) get_local_id(0);
for (u32 i = 0; i < N; ++i) { u[i] = in[i * WG]; }
}

void write(u32 WG, u32 N, T2 *u, global T2 *out, u32 base) {
out += base + (u32) get_local_id(0);
for (u32 i = 0; i < N; ++i) { out[i * WG] = u[i]; }
}

// Parameters we may want to let user tune. WIDTH other than 512 and 1K is untested. SMALL_HEIGHT other than 256 and 512 is untested.
#define ROTATION 1 // Turns on rotating width and small_height rows
#define WIDTH_ROTATE_CHUNK_SIZE 32 // Rotate blocks of 32 T2 values = 512 bytes
#define HEIGHT_ROTATE_CHUNK_SIZE 16 // Rotate blocks of 16 T2 values = 256 bytes
#define VARIABLE_WIDTH_ROTATE 0 // Each width u[i] gets a different rotation amount
#define MIDDLE_SHUFFLE_WRITE 1 // Radeon VII likes MiddleShuffleWrite, Titan V apparently not

// nVidia Titan V hates rotating and LDS-less middle writes
#if !AMDGPU
#undef ROTATION
#define ROTATION 0
#undef MIDDLE_SHUFFLE_WRITE
#define MIDDLE_SHUFFLE_WRITE 0
#endif

// Rotate width elements on output from fft_WIDTH and as input to fftMiddleIn.
// Not all lines are rotated the same amount so that fftMiddleIn reads a more varied distribution of addresses.
// This can be faster on AMD GPUs, not certain about nVidia GPUs.
u32 rotate_width_amount(u32 y) {
#if !VARIABLE_WIDTH_ROTATE
u32 num_sections = WIDTH / WIDTH_ROTATE_CHUNK_SIZE;
u32 num_rotates = y % num_sections; // if y increments by SMALL_HEIGHT, final rotate amount won't change after applying "mod WIDTH"
#else
// Create a formula where each u[i] gets a different rotation amount
u32 num_sections = WIDTH / WIDTH_ROTATE_CHUNK_SIZE;
u32 num_rotates = y % num_sections * MIDDLE; // each increment of y adds MIDDLE
num_rotates += y / SMALL_HEIGHT; // each increment of i in u[i] will add 1
num_rotates &= 255; // keep num_rotates small (probably not necessary)
#endif
return num_rotates * WIDTH_ROTATE_CHUNK_SIZE;
}
u32 rotate_width_x(u32 x, u32 rot_amt) { // rotate x coordinate using a cached rotate_amount
return (x + rot_amt) % WIDTH;
}
u32 rotate_width(u32 y, u32 x) { // rotate x coordinate (no cached rotate amount)
return rotate_width_x(x, rotate_width_amount(y));
}
void readRotatedWidth(T2 *u, CP(T2) in, u32 y, u32 x) {
#if !ROTATION // No rotation, might be better on nVidia cards
in += y * WIDTH + x;
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[i * SMALL_HEIGHT * WIDTH]; }
#elif !VARIABLE_WIDTH_ROTATE // True if adding SMALL_HEIGHT to y results in same rotation amount
in += y * WIDTH + rotate_width(y, x);
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[i * SMALL_HEIGHT * WIDTH]; }
#else // Adding SMALL_HEIGHT to y results in different rotation
in += y * WIDTH;
u32 rot_amt = rotate_width_amount(y);
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[rotate_width_x(x, rot_amt)]; in += SMALL_HEIGHT * WIDTH; rot_amt += SMALL_HEIGHT * WIDTH_ROTATE_CHUNK_SIZE; }
#endif
}
void writeRotatedWidth(u32 WG, u32 N, T2 *u, P(T2) out, u32 line) {
#if !ROTATION // No rotation, might be better on nVidia cards
out += line * WIDTH + (u32) get_local_id(0);
for (u32 i = 0; i < N; ++i) { out[i * WG] = u[i]; }
#else
u32 me = (u32) get_local_id(0);
u32 rot_amt = rotate_width_amount(line);
out += line * WIDTH;
for (u32 i = 0; i < N; ++i) { out[rotate_width_x (i * WG + me, rot_amt)] = u[i]; }
#endif
}

// Rotate height elements on output from fft_HEIGHT and as input to fftMiddleOut.
// Not all lines are rotated the same amount so that fftMiddleOut reads a more varied distribution of addresses.
// This can be faster on AMD GPUs, not certain about nVidia GPUs.
u32 rotate_height_amount (u32 y) {
u32 num_sections = SMALL_HEIGHT / HEIGHT_ROTATE_CHUNK_SIZE;
return (y % num_sections) * (HEIGHT_ROTATE_CHUNK_SIZE);
}
u32 rotate_height_x(u32 x, u32 rot_amt) { // rotate x coordinate using a cached rotate_amount
return (x + rot_amt) % SMALL_HEIGHT;
}
u32 rotate_height(u32 y, u32 x) { // rotate x coordinate (no cached rotate amount)
return rotate_height_x(x, rotate_height_amount(y));
}
void readRotatedHeight(T2 *u, CP(T2) in, u32 y, u32 x) {
#if !ROTATION // No rotation, might be better on nVidia cards
in += y * MIDDLE * SMALL_HEIGHT + x;
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[i * SMALL_HEIGHT]; }
#elif 0 // Set if adding 1 to y results in same rotation
y *= MIDDLE;
in += y * SMALL_HEIGHT + rotate_height(y, x);
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[i * SMALL_HEIGHT]; }
#else // Adding SMALL_HEIGHT to line results in different rotation
y *= MIDDLE;
in += y * SMALL_HEIGHT;
u32 rot_amt = rotate_height_amount(y);
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[rotate_height_x(x, rot_amt)]; in += SMALL_HEIGHT; rot_amt += HEIGHT_ROTATE_CHUNK_SIZE; }
#endif
}
void writeRotatedHeight(u32 WG, u32 N, T2 *u, P(T2) out, u32 line) {
#if !ROTATION // No rotation, might be better on nVidia cards
out += line * SMALL_HEIGHT + (u32) get_local_id(0);
for (u32 i = 0; i < N; ++i) { out[i * WG] = u[i]; }
#else
u32 me = (u32) get_local_id(0);
u32 rot_amt = rotate_height_amount(line);
out += line * SMALL_HEIGHT;
for (u32 i = 0; i < N; ++i) { out[rotate_height_x (i * WG + me, rot_amt)] = u[i]; }
#endif
}


T2 U2(T a, T b) { return (T2) (a, b); }

void bar() {
7 changes: 5 additions & 2 deletions src/cl/carryfused.cl
@@ -62,7 +62,10 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(
// On Titan V it is faster to derive the big vs. little flags from the fractional number of bits in each FFT word rather than read the flags from memory.
// On Radeon VII this code is about the same speed. Not sure which is better on other GPUs.
#if BIGLIT
u32 frac_bits = (u32) (((me * H + line) * 2 * ((((u64) FRAC_BPW_HI) << 32) + FRAC_BPW_LO)) >> 32);
// Calculate the most significant 32-bits of FRAC_BPW * the index of the FFT word
u32 fft_word_index = (me * H + line) * 2;
u32 frac_bits = fft_word_index * FRAC_BPW_HI + (u32) ((fft_word_index * (u64) FRAC_BPW_LO) >> 32);
// Perform addition to test first biglit flag
u32 tmp = frac_bits + FRAC_BPW_HI;
#endif
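The split computation above is an exact refactoring of the old single 64-bit multiply: the high 32 bits of index * FRAC_BPW taken mod 2^64 equal index * FRAC_BPW_HI (mod 2^32) plus the high half of index * FRAC_BPW_LO. A quick host-side check of that identity, with arbitrary constants:

#include <cassert>
#include <cstdint>

typedef uint32_t u32;
typedef uint64_t u64;

int main() {
  u32 FRAC_BPW_HI = 0x9d89d89d, FRAC_BPW_LO = 0x89d89d89;  // arbitrary
  for (u32 idx = 0; idx < 1000000; idx += 17) {
    // Old form: one 64-bit multiply, wrapping mod 2^64.
    u32 a = (u32)((idx * ((((u64)FRAC_BPW_HI) << 32) + FRAC_BPW_LO)) >> 32);
    // New form: a 32x32 low multiply plus the high half of a 32x64 multiply.
    u32 b = idx * FRAC_BPW_HI + (u32)((idx * (u64)FRAC_BPW_LO) >> 32);
    assert(a == b);
  }
  return 0;
}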

@@ -170,7 +173,7 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(
bar();

fft_WIDTH(lds, u, smallTrig);
writeRotatedWidth(G_W, NW, u, out, line);
writeCarryFusedLine(u, out, line);

// Clear carry ready flag for next iteration
#if OLD_FENCE
12 changes: 7 additions & 5 deletions src/cl/fftmiddlein.cl
@@ -26,21 +26,23 @@ KERNEL(IN_WG) fftMiddleIn(P(T2) out, CP(T2) in, Trig trig) {
u32 x = startx + mx;
u32 y = starty + my;

readRotatedWidth(u, in, y, x);
readMiddleInLine(u, in, y, x);

middleMul2(u, x, y, 1, trig);

fft_MIDDLE(u);

middleMul(u, y, trig);

#if !MIDDLE_SHUFFLE_WRITE
#if MIDDLE_IN_LDS_TRANSPOSE
// Transpose the x and y values
local T lds[IN_WG / 2 * (MIDDLE <= 8 ? 2 * MIDDLE : MIDDLE)];
middleShuffle(lds, u, IN_WG, IN_SIZEX);
write(IN_WG, MIDDLE, u, out, gx * (BIG_HEIGHT * IN_SIZEX) + gy * (MIDDLE * IN_WG));
out += me; // Threads write sequentially to memory since x and y values are already transposed
#else
out += gx * (BIG_HEIGHT * IN_SIZEX) + gy * (MIDDLE * IN_WG);
middleShuffleWrite(out, u, IN_WG, IN_SIZEX);
// Adjust out pointer to effect a transpose of x and y values
out += mx * SIZEY + my;
#endif

writeMiddleInLine(out, u, gy, gx);
}
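The non-LDS path now reduces to a per-thread pointer offset: moving out by mx * SIZEY + my makes the workgroup's stores land in x/y-transposed order, so writeMiddleInLine() (defined in middle.cl, not shown in this diff) can issue plain strided writes without a separate transpose pass. A rough CPU-side picture of that index mapping, with sizeX and sizeY standing in for IN_SIZEX and IN_WG / IN_SIZEX (an assumption); fftMiddleOut below uses the same trick.

#include <vector>

typedef unsigned u32;

// The two nested loops stand in for the workgroup's (mx, my) thread ids.
// A value that logically lives at (my, mx) is stored at (mx, my), i.e. the
// tile is written already transposed in memory.
void writeTileTransposed(std::vector<double>& out, const std::vector<double>& vals,
                         u32 sizeX, u32 sizeY) {
  for (u32 my = 0; my < sizeY; ++my)
    for (u32 mx = 0; mx < sizeX; ++mx)
      out[mx * sizeY + my] = vals[my * sizeX + mx];
}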
17 changes: 9 additions & 8 deletions src/cl/fftmiddleout.cl
@@ -30,9 +30,9 @@ KERNEL(OUT_WG) fftMiddleOut(P(T2) out, P(T2) in, Trig trig) {
u32 x = startx + mx;
u32 y = starty + my;

readRotatedHeight(u, in, y, x);
readMiddleOutLine(u, in, y, x);

middleMul(u, startx + mx, trig);
middleMul(u, x, trig);

fft_MIDDLE(u);

@@ -42,16 +42,17 @@ KERNEL(OUT_WG) fftMiddleOut(P(T2) out, P(T2) in, Trig trig) {
// number. This may be due to roundoff errors introduced by applying inexact TWO_TO_N_8TH weights.
double factor = 1.0 / (4 * 4 * NWORDS);

middleMul2(u, starty + my, startx + mx, factor, trig);
middleMul2(u, y, x, factor, trig);

#if !MIDDLE_SHUFFLE_WRITE
#if MIDDLE_OUT_LDS_TRANSPOSE
// Transpose the x and y values
local T lds[OUT_WG / 2 * (MIDDLE <= 8 ? 2 * MIDDLE : MIDDLE)];
middleShuffle(lds, u, OUT_WG, OUT_SIZEX);
out += MIDDLE * WIDTH * OUT_SIZEX * gx + MIDDLE * OUT_WG * gy + me;
for (i32 i = 0; i < MIDDLE; ++i) { out[OUT_WG * i] = u[i]; }
out += me; // Threads write sequentially to memory since x and y values are already transposed
#else
out += MIDDLE * WIDTH * OUT_SIZEX * gx + MIDDLE * OUT_WG * gy;
middleShuffleWrite(out, u, OUT_WG, OUT_SIZEX);
// Adjust out pointer to effect a transpose of x and y values
out += mx * SIZEY + my;
#endif

writeMiddleOutLine(out, u, gy, gx);
}
3 changes: 2 additions & 1 deletion src/cl/fftp.cl
@@ -4,6 +4,7 @@
#include "math.cl"
#include "weight.cl"
#include "fftwidth.cl"
#include "middle.cl"

// fftPremul: weight words with IBDWT weights followed by FFT-width.
KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTab THREAD_WEIGHTS) {
@@ -28,5 +29,5 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTab THREAD_WEIGHTS)

fft_WIDTH(lds, u, smallTrig);

writeRotatedWidth(G_W, NW, u, out, g);
writeCarryFusedLine(u, out, g);
}
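For background on the comment at the top of this kernel: in the standard IBDWT formulation the weight for word k is 2^(ceil(k*E/N) - k*E/N), and fftP applies those weights before running the width FFT. The sketch below shows only that textbook formula, not this repo's precomputed weight tables.

#include <cmath>
#include <vector>

typedef unsigned u32;

// Textbook IBDWT weights (illustrative; production code precomputes these
// more carefully to avoid rounding trouble when k*E/N is close to an integer).
std::vector<double> ibdwtWeights(u32 E, u32 N) {
  std::vector<double> w(N);
  for (u32 k = 0; k < N; ++k) {
    double x = double(k) * E / N;          // k*E/N
    w[k] = std::exp2(std::ceil(x) - x);    // 2^(ceil(k*E/N) - k*E/N)
  }
  return w;
}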