Implemented double-wide's trig options for single-wide.

Implemented double-wide's single-kernel feature for single-wide. Simplified the #defines from three down to two. The best options will depend on the OpenCL optimizer and the resulting occupancy.
olympichek · Dec 14, 2024 · 4dc58d0 · 4dc58d0
1 parent 7b750df
commit 4dc58d0
Show file tree

Hide file tree

Showing 4 changed files with 58 additions and 20 deletions.
diff --git a/src/Gpu.cpp b/src/Gpu.cpp
@@ -409,14 +409,17 @@ Gpu::Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u32 E, const vector<KeyVal>&
   K(fftHin,  "ffthin.cl",  "fftHin",  hN / nH),
   K(tailSquareZero, "tailsquare.cl", "tailSquareZero", SMALL_H / nH * 2),
 
-#if DOUBLE_WIDE
-  // Two double-wide kernels
+#if !SINGLE_WIDE && !SINGLE_KERNEL
+  // Double-wide tailSquare with two kernels
   K(tailSquare,    "tailsquare.cl", "tailSquare", hN / nH - SMALL_H / nH * 2),
-#elif DOUBLE_WIDE_ONEK   
-  // One double-wide kernel
+#elif !SINGLE_WIDE   
+  // Double-wide tailSquare with one kernel
   K(tailSquare,    "tailsquare.cl", "tailSquare", hN / nH),
+#elif !SINGLE_KERNEL
+  // Single-wide tailSquare with two kernels
+  K(tailSquare,    "tailsquare.cl", "tailSquare", hN / nH / 2 - SMALL_H / nH),
 #else
-  // Old-style single-wide kernel
+  // Single-wide tailSquare with one kernel
   K(tailSquare,    "tailsquare.cl", "tailSquare", hN / nH / 2),
 #endif
 
@@ -827,7 +830,7 @@ static bool testBit(u64 x, int bit) { return x & (u64(1) << bit); }
 
 void Gpu::bottomHalf(Buffer<double>& out, Buffer<double>& inTmp) {
   fftMidIn(out, inTmp);
-#if DOUBLE_WIDE
+#if !SINGLE_KERNEL
   tailSquareZero(inTmp, out);
 #endif
   tailSquare(inTmp, out);

diff --git a/src/Gpu.h b/src/Gpu.h
@@ -22,9 +22,8 @@
 
 // Klunky defines for single-wide vs. double-wide tailSquare
 // Clean this up once we determine which options to make user visible
-#define SINGLE_WIDE             0       // Old single-wide tailSquare
-#define DOUBLE_WIDE_ONEK        0       // Double-wide tailSquare in a single kernel
-#define DOUBLE_WIDE             1       // Double-wide tailSquare in two kernels
+#define SINGLE_WIDE             0       // Old single-wide tailSquare vs. new double-wide tailSquare
+#define SINGLE_KERNEL           0       // Implement tailSquare in a single kernel vs. two kernels
 
 struct PRPResult;
 struct Task;

diff --git a/src/TrigBufCache.cpp b/src/TrigBufCache.cpp
@@ -9,6 +9,11 @@
 //#define PREFER_DP_TO_MEM      1               // Good DP GPU.  Tuned for Radeon VII.
 //#define PREFER_DP_TO_MEM      0               // Poor DP GPU.  A typical consumer grade GPU.
 
+// Klunky defines for single-wide vs. double-wide tailSquare
+// Clean this up once we determine which options to make user visible
+#define SINGLE_WIDE             0       // Old single-wide tailSquare vs. new double-wide tailSquare
+#define SINGLE_KERNEL           0       // Implement tailSquare in a single kernel vs. two kernels
+
 #define _USE_MATH_DEFINES
 #include <cmath>
 
@@ -127,15 +132,15 @@ vector<double2> genSmallTrigCombo(u32 width, u32 middle, u32 size, u32 radix) {
   for (u32 me = 0; me < height / radix; ++me) {
     tab.push_back(root1(width * middle * height, width * middle * me));
   }
-  // Output the two T2 multipliers to be read by one u,v pair of lines
+  // Output the one or two T2 multipliers to be read by one u,v pair of lines
   for (u32 line = 0; line < width * middle / 2; ++line) {
     tab.push_back(root1Fancy(width * middle * height, line));
-    tab.push_back(root1Fancy(width * middle * height, width * middle - line));
+    if (!SINGLE_WIDE) tab.push_back(root1Fancy(width * middle * height, width * middle - line));
   }
 #else
   u32 height = size;
   for (u32 u = 0; u < width * middle / 2; ++u) {
-    for (u32 v = 0; v < 2; ++v) {
+    for (u32 v = 0; v < (SINGLE_WIDE ? 1 : 2); ++v) {
       u32 line = (v == 0) ? u : width * middle - u;
       for (u32 me = 0; me < height / radix; ++me) {
         tab.push_back(root1(width * middle * height, line + width * middle * me));

diff --git a/src/cl/tailsquare.cl b/src/cl/tailsquare.cl
@@ -11,7 +11,7 @@
 #if !defined(SINGLE_WIDE)
 #define SINGLE_WIDE             0       // Old single-wide tailSquare vs. new double-wide tailSquare
 #endif
-#define DOUBLE_WIDE_ONEK        0       // Double-wide tailSquare in a single kernel
+#define SINGLE_KERNEL           0       // Implement tailSquare in a single kernel vs. two kernels
 
 // Why does this alternate implementation work?  Let t' be the conjugate of t and note that t*t' = 1.
 // Now consider these lines from the original implementation (comments appear alongside):
@@ -104,8 +104,13 @@ KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) {
 
   u32 H = ND / SMALL_HEIGHT;
 
+#if SINGLE_KERNEL
   u32 line1 = get_group_id(0);
   u32 line2 = line1 ? H - line1 : (H / 2);
+#else
+  u32 line1 = get_group_id(0) + 1;
+  u32 line2 = H - line1;
+#endif
   u32 memline1 = transPos(line1, MIDDLE, WIDTH);
   u32 memline2 = transPos(line2, MIDDLE, WIDTH);
 
@@ -123,13 +128,31 @@ KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) {
   bar();
   fft_HEIGHT(lds, v, smallTrig, w);
 
+  // Compute trig value from scratch.  Good on GPUs with high DP throughput.
+#if PREFER_DP_TO_MEM >= 2
   T2 trig = slowTrig_N(line1 + me * H, ND / NH);
 
-  if (line1) {
-    reverseLine(G_H, lds, v);
-    pairSq(NH, u, v, trig, false);
-    reverseLine(G_H, lds, v);
-  } else {
+  // Do a little bit of memory access and a little bit of DP math.  Good on a Radeon VII.
+#elif PREFER_DP_TO_MEM == 1
+  // Calculate number of trig values used by fft_HEIGHT.
+  // The trig values used here are pre-computed and stored after the fft_HEIGHT trig values.
+  u32 height_trigs = SMALL_HEIGHT/NH*(NH-1);
+  // Read a hopefully cached line of data and one non-cached T2 per line
+  T2 trig = smallTrig[height_trigs + me];                    // Trig values for line zero, should be cached
+  T2 mult = smallTrig[height_trigs + G_H + line1];           // Line multiplier
+  trig = cmulFancy(trig, mult);
+
+  // On consumer-grade GPUs, it is likely beneficial to read all trig values.
+#else
+  // Calculate number of trig values used by fft_HEIGHT.
+  // The trig values used here are pre-computed and stored after the fft_HEIGHT trig values.
+  u32 height_trigs = SMALL_HEIGHT/NH*(NH-1);
+  // Read pre-computed trig values
+  T2 trig = smallTrig[height_trigs + line1*G_H + me];
+#endif
+
+#if SINGLE_KERNEL
+  if (line1 == 0) {
     // Line 0 is special: it pairs with itself, offseted by 1.
     reverse(G_H, lds, u + NH/2, true);
     pairSq(NH/2, u,   u + NH/2, trig, true);
@@ -141,6 +164,14 @@ KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) {
     pairSq(NH/2, v,   v + NH/2, trig2, false);
     reverse(G_H, lds, v + NH/2, false);
   }
+  else {
+#else
+  if (1) {
+#endif
+    reverseLine(G_H, lds, v);
+    pairSq(NH, u, v, trig, false);
+    reverseLine(G_H, lds, v);
+  }
 
   bar();
   fft_HEIGHT(lds, v, smallTrig, w);
@@ -181,7 +212,7 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) {
 
   u32 H = ND / SMALL_HEIGHT;
 
-#if DOUBLE_WIDE_ONEK
+#if SINGLE_KERNEL
   u32 line_u = get_group_id(0);
   u32 line_v = line_u ? H - line_u : (H / 2);
 #else
@@ -233,7 +264,7 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) {
 
   bar(G_H);
 
-#if DOUBLE_WIDE_ONEK
+#if SINGLE_KERNEL
   // Line 0 and H/2 are special: they pair with themselves, line 0 is offseted by 1.
   if (line_u == 0) {
     reverse2(lds, u);