More stride elimination work for AMD GPUs #306

Merged: 4 commits, Nov 22, 2024
12 changes: 7 additions & 5 deletions src/Gpu.cpp
@@ -252,11 +252,13 @@ string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector<
defines += toDefine("TRIG_SIN", coefs.sinCoefs);
defines += toDefine("TRIG_COS", coefs.cosCoefs);

// Calculate fractional bits-per-word = (E % N) / N * 2^64
u32 bpw_hi = (u64(E % N) << 32) / N;
u32 bpw_lo = (((u64(E % N) << 32) % N) << 32) / N;
defines += toDefine("FRAC_BPW_HI", bpw_hi);
defines += toDefine("FRAC_BPW_LO", bpw_lo);
u64 bpw = (u64(bpw_hi) << 32) + bpw_lo;
bpw--; // bpw must not be an exact value -- it must be less than the exact value to get the last biglit value right
defines += toDefine("FRAC_BPW_HI", (u32) (bpw >> 32));
defines += toDefine("FRAC_BPW_LO", (u32) bpw);
u32 bigstep = (bpw * (N / fft.shape.nW())) >> 32;
defines += toDefine("FRAC_BITS_BIGSTEP", bigstep);

@@ -458,9 +460,9 @@ Gpu::Gpu(Queue* q, GpuCommon shared, FFTConfig fft, u32 E, const vector<KeyVal>&
BUF(bufROE, ROE_SIZE),
BUF(bufStatsCarry, CARRY_SIZE),

BUF(buf1, N),
BUF(buf2, N),
BUF(buf3, N),
BUF(buf1, N + N/4), // Lets us play with padding instead of rotating. Need to calculate the actual cost of padding
BUF(buf2, N + N/4),
BUF(buf3, N + N/4),
#undef BUF

statsBits{u32(args.value("STATS", 0))},
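The N + N/4 sizing above leaves room to pad rows rather than rotate them: with a small per-row pad, the row-to-row stride is no longer a large power of two, which is the access pattern this PR is trying to keep away from a single memory channel on AMD GPUs. A sketch of the idea only; the WIDTH and PAD values, and whether padding is applied per row, are assumptions rather than something this diff implements yet.

typedef unsigned u32;

// Illustrative padded layout: rows of WIDTH values, each followed by PAD
// unused slots. The row stride becomes WIDTH + PAD (e.g. 1040 instead of
// 1024), and the extra space fits in N + N/4 as long as PAD <= WIDTH / 4.
enum { WIDTH = 1024, PAD = 16 };

u32 paddedIndex(u32 row, u32 col) { return row * (WIDTH + PAD) + col; }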
106 changes: 2 additions & 104 deletions src/cl/base.cl
@@ -151,117 +151,15 @@ void write(u32 WG, u32 N, T2 *u, global T2 *out, u32 base);
void bar(void);

void read(u32 WG, u32 N, T2 *u, const global T2 *in, u32 base) {
for (u32 i = 0; i < N; ++i) { u[i] = in[base + i * WG + (u32) get_local_id(0)]; }
in += base + (u32) get_local_id(0);
for (u32 i = 0; i < N; ++i) { u[i] = in[i * WG]; }
}

void write(u32 WG, u32 N, T2 *u, global T2 *out, u32 base) {
out += base + (u32) get_local_id(0);
for (u32 i = 0; i < N; ++i) { out[i * WG] = u[i]; }
}

// Parameters we may want to let user tune. WIDTH other than 512 and 1K is untested. SMALL_HEIGHT other than 256 and 512 is untested.
#define ROTATION 1 // Turns on rotating width and small_height rows
#define WIDTH_ROTATE_CHUNK_SIZE 32 // Rotate blocks of 32 T2 values = 512 bytes
#define HEIGHT_ROTATE_CHUNK_SIZE 16 // Rotate blocks of 16 T2 values = 256 bytes
#define VARIABLE_WIDTH_ROTATE 0 // Each width u[i] gets a different rotation amount
#define MIDDLE_SHUFFLE_WRITE 1 // Radeon VII likes MiddleShuffleWrite, Titan V apparently not

// nVidia Titan V hates rotating and LDS-less middle writes
#if !AMDGPU
#undef ROTATION
#define ROTATION 0
#undef MIDDLE_SHUFFLE_WRITE
#define MIDDLE_SHUFFLE_WRITE 0
#endif

// Rotate width elements on output from fft_WIDTH and as input to fftMiddleIn.
// Not all lines are rotated the same amount so that fftMiddleIn reads a more varied distribution of addresses.
// This can be faster on AMD GPUs, not certain about nVidia GPUs.
u32 rotate_width_amount(u32 y) {
#if !VARIABLE_WIDTH_ROTATE
u32 num_sections = WIDTH / WIDTH_ROTATE_CHUNK_SIZE;
u32 num_rotates = y % num_sections; // if y increments by SMALL_HEIGHT, final rotate amount won't change after applying "mod WIDTH"
#else
// Create a formula where each u[i] gets a different rotation amount
u32 num_sections = WIDTH / WIDTH_ROTATE_CHUNK_SIZE;
u32 num_rotates = y % num_sections * MIDDLE; // each increment of y adds MIDDLE
num_rotates += y / SMALL_HEIGHT; // each increment of i in u[i] will add 1
num_rotates &= 255; // keep num_rotates small (probably not necessary)
#endif
return num_rotates * WIDTH_ROTATE_CHUNK_SIZE;
}
u32 rotate_width_x(u32 x, u32 rot_amt) { // rotate x coordinate using a cached rotate_amount
return (x + rot_amt) % WIDTH;
}
u32 rotate_width(u32 y, u32 x) { // rotate x coordinate (no cached rotate amount)
return rotate_width_x(x, rotate_width_amount(y));
}
void readRotatedWidth(T2 *u, CP(T2) in, u32 y, u32 x) {
#if !ROTATION // No rotation, might be better on nVidia cards
in += y * WIDTH + x;
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[i * SMALL_HEIGHT * WIDTH]; }
#elif !VARIABLE_WIDTH_ROTATE // True if adding SMALL_HEIGHT to y results in same rotation amount
in += y * WIDTH + rotate_width(y, x);
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[i * SMALL_HEIGHT * WIDTH]; }
#else // Adding SMALL_HEIGHT to y results in different rotation
in += y * WIDTH;
u32 rot_amt = rotate_width_amount(y);
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[rotate_width_x(x, rot_amt)]; in += SMALL_HEIGHT * WIDTH; rot_amt += SMALL_HEIGHT * WIDTH_ROTATE_CHUNK_SIZE; }
#endif
}
void writeRotatedWidth(u32 WG, u32 N, T2 *u, P(T2) out, u32 line) {
#if !ROTATION // No rotation, might be better on nVidia cards
out += line * WIDTH + (u32) get_local_id(0);
for (u32 i = 0; i < N; ++i) { out[i * WG] = u[i]; }
#else
u32 me = (u32) get_local_id(0);
u32 rot_amt = rotate_width_amount(line);
out += line * WIDTH;
for (u32 i = 0; i < N; ++i) { out[rotate_width_x (i * WG + me, rot_amt)] = u[i]; }
#endif
}

// Rotate height elements on output from fft_HEIGHT and as input to fftMiddleOut.
// Not all lines are rotated the same amount so that fftMiddleOut reads a more varied distribution of addresses.
// This can be faster on AMD GPUs, not certain about nVidia GPUs.
u32 rotate_height_amount (u32 y) {
u32 num_sections = SMALL_HEIGHT / HEIGHT_ROTATE_CHUNK_SIZE;
return (y % num_sections) * (HEIGHT_ROTATE_CHUNK_SIZE);
}
u32 rotate_height_x(u32 x, u32 rot_amt) { // rotate x coordinate using a cached rotate_amount
return (x + rot_amt) % SMALL_HEIGHT;
}
u32 rotate_height(u32 y, u32 x) { // rotate x coordinate (no cached rotate amount)
return rotate_height_x(x, rotate_height_amount(y));
}
void readRotatedHeight(T2 *u, CP(T2) in, u32 y, u32 x) {
#if !ROTATION // No rotation, might be better on nVidia cards
in += y * MIDDLE * SMALL_HEIGHT + x;
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[i * SMALL_HEIGHT]; }
#elif 0 // Set if adding 1 to y results in same rotation
y *= MIDDLE;
in += y * SMALL_HEIGHT + rotate_height(y, x);
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[i * SMALL_HEIGHT]; }
#else // Adding SMALL_HEIGHT to line results in different rotation
y *= MIDDLE;
in += y * SMALL_HEIGHT;
u32 rot_amt = rotate_height_amount(y);
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[rotate_height_x(x, rot_amt)]; in += SMALL_HEIGHT; rot_amt += HEIGHT_ROTATE_CHUNK_SIZE; }
#endif
}
void writeRotatedHeight(u32 WG, u32 N, T2 *u, P(T2) out, u32 line) {
#if !ROTATION // No rotation, might be better on nVidia cards
out += line * SMALL_HEIGHT + (u32) get_local_id(0);
for (u32 i = 0; i < N; ++i) { out[i * WG] = u[i]; }
#else
u32 me = (u32) get_local_id(0);
u32 rot_amt = rotate_height_amount(line);
out += line * SMALL_HEIGHT;
for (u32 i = 0; i < N; ++i) { out[rotate_height_x (i * WG + me, rot_amt)] = u[i]; }
#endif
}


T2 U2(T a, T b) { return (T2) (a, b); }

void bar() {
7 changes: 5 additions & 2 deletions src/cl/carryfused.cl
@@ -62,7 +62,10 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(
// On Titan V it is faster to derive the big vs. little flags from the fractional number of bits in each FFT word rather than read the flags from memory.
// On Radeon VII this code is about the same speed. Not sure which is better on other GPUs.
#if BIGLIT
u32 frac_bits = (u32) (((me * H + line) * 2 * ((((u64) FRAC_BPW_HI) << 32) + FRAC_BPW_LO)) >> 32);
// Calculate the most significant 32-bits of FRAC_BPW * the index of the FFT word
u32 fft_word_index = (me * H + line) * 2;
u32 frac_bits = fft_word_index * FRAC_BPW_HI + (u32) ((fft_word_index * (u64) FRAC_BPW_LO) >> 32);
// Perform addition to test first biglit flag
u32 tmp = frac_bits + FRAC_BPW_HI;
#endif
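The split computation above is an exact refactoring of the old single 64-bit multiply: the high 32 bits of index * FRAC_BPW taken mod 2^64 equal index * FRAC_BPW_HI (mod 2^32) plus the high half of index * FRAC_BPW_LO. A quick host-side check of that identity, with arbitrary constants:

#include <cassert>
#include <cstdint>

typedef uint32_t u32;
typedef uint64_t u64;

int main() {
  u32 FRAC_BPW_HI = 0x9d89d89d, FRAC_BPW_LO = 0x89d89d89;  // arbitrary
  for (u32 idx = 0; idx < 1000000; idx += 17) {
    // Old form: one 64-bit multiply, wrapping mod 2^64.
    u32 a = (u32)((idx * ((((u64)FRAC_BPW_HI) << 32) + FRAC_BPW_LO)) >> 32);
    // New form: a 32x32 low multiply plus the high half of a 32x64 multiply.
    u32 b = idx * FRAC_BPW_HI + (u32)((idx * (u64)FRAC_BPW_LO) >> 32);
    assert(a == b);
  }
  return 0;
}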

@@ -170,7 +173,7 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(
bar();

fft_WIDTH(lds, u, smallTrig);
writeRotatedWidth(G_W, NW, u, out, line);
writeCarryFusedLine(u, out, line);

// Clear carry ready flag for next iteration
#if OLD_FENCE
12 changes: 7 additions & 5 deletions src/cl/fftmiddlein.cl
@@ -26,21 +26,23 @@ KERNEL(IN_WG) fftMiddleIn(P(T2) out, CP(T2) in, Trig trig) {
u32 x = startx + mx;
u32 y = starty + my;

readRotatedWidth(u, in, y, x);
readMiddleInLine(u, in, y, x);

middleMul2(u, x, y, 1, trig);

fft_MIDDLE(u);

middleMul(u, y, trig);

#if !MIDDLE_SHUFFLE_WRITE
#if MIDDLE_IN_LDS_TRANSPOSE
// Transpose the x and y values
local T lds[IN_WG / 2 * (MIDDLE <= 8 ? 2 * MIDDLE : MIDDLE)];
middleShuffle(lds, u, IN_WG, IN_SIZEX);
write(IN_WG, MIDDLE, u, out, gx * (BIG_HEIGHT * IN_SIZEX) + gy * (MIDDLE * IN_WG));
out += me; // Threads write sequentially to memory since x and y values are already transposed
#else
out += gx * (BIG_HEIGHT * IN_SIZEX) + gy * (MIDDLE * IN_WG);
middleShuffleWrite(out, u, IN_WG, IN_SIZEX);
// Adjust out pointer to effect a transpose of x and y values
out += mx * SIZEY + my;
#endif

writeMiddleInLine(out, u, gy, gx);
}
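The non-LDS path now reduces to a per-thread pointer offset: moving out by mx * SIZEY + my makes the workgroup's stores land in x/y-transposed order, so writeMiddleInLine() (defined in middle.cl, not shown in this diff) can issue plain strided writes without a separate transpose pass. A rough CPU-side picture of that index mapping, with sizeX and sizeY standing in for IN_SIZEX and IN_WG / IN_SIZEX (an assumption); fftMiddleOut below uses the same trick.

#include <vector>

typedef unsigned u32;

// The two nested loops stand in for the workgroup's (mx, my) thread ids.
// A value that logically lives at (my, mx) is stored at (mx, my), i.e. the
// tile is written already transposed in memory.
void writeTileTransposed(std::vector<double>& out, const std::vector<double>& vals,
                         u32 sizeX, u32 sizeY) {
  for (u32 my = 0; my < sizeY; ++my)
    for (u32 mx = 0; mx < sizeX; ++mx)
      out[mx * sizeY + my] = vals[my * sizeX + mx];
}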
17 changes: 9 additions & 8 deletions src/cl/fftmiddleout.cl
@@ -30,9 +30,9 @@ KERNEL(OUT_WG) fftMiddleOut(P(T2) out, P(T2) in, Trig trig) {
u32 x = startx + mx;
u32 y = starty + my;

readRotatedHeight(u, in, y, x);
readMiddleOutLine(u, in, y, x);

middleMul(u, startx + mx, trig);
middleMul(u, x, trig);

fft_MIDDLE(u);

@@ -42,16 +42,17 @@ KERNEL(OUT_WG) fftMiddleOut(P(T2) out, P(T2) in, Trig trig) {
// number. This may be due to roundoff errors introduced by applying inexact TWO_TO_N_8TH weights.
double factor = 1.0 / (4 * 4 * NWORDS);

middleMul2(u, starty + my, startx + mx, factor, trig);
middleMul2(u, y, x, factor, trig);

#if !MIDDLE_SHUFFLE_WRITE
#if MIDDLE_OUT_LDS_TRANSPOSE
// Transpose the x and y values
local T lds[OUT_WG / 2 * (MIDDLE <= 8 ? 2 * MIDDLE : MIDDLE)];
middleShuffle(lds, u, OUT_WG, OUT_SIZEX);
out += MIDDLE * WIDTH * OUT_SIZEX * gx + MIDDLE * OUT_WG * gy + me;
for (i32 i = 0; i < MIDDLE; ++i) { out[OUT_WG * i] = u[i]; }
out += me; // Threads write sequentially to memory since x and y values are already transposed
#else
out += MIDDLE * WIDTH * OUT_SIZEX * gx + MIDDLE * OUT_WG * gy;
middleShuffleWrite(out, u, OUT_WG, OUT_SIZEX);
// Adjust out pointer to effect a transpose of x and y values
out += mx * SIZEY + my;
#endif

writeMiddleOutLine(out, u, gy, gx);
}
3 changes: 2 additions & 1 deletion src/cl/fftp.cl
@@ -4,6 +4,7 @@
#include "math.cl"
#include "weight.cl"
#include "fftwidth.cl"
#include "middle.cl"

// fftPremul: weight words with IBDWT weights followed by FFT-width.
KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTab THREAD_WEIGHTS) {
@@ -28,5 +29,5 @@ KERNEL(G_W) fftP(P(T2) out, CP(Word2) in, Trig smallTrig, BigTab THREAD_WEIGHTS)

fft_WIDTH(lds, u, smallTrig);

writeRotatedWidth(G_W, NW, u, out, g);
writeCarryFusedLine(u, out, g);
}
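For background on the comment at the top of this kernel: in the standard IBDWT formulation the weight for word k is 2^(ceil(k*E/N) - k*E/N), and fftP applies those weights before running the width FFT. The sketch below shows only that textbook formula, not this repo's precomputed weight tables.

#include <cmath>
#include <vector>

typedef unsigned u32;

// Textbook IBDWT weights (illustrative; production code precomputes these
// more carefully to avoid rounding trouble when k*E/N is close to an integer).
std::vector<double> ibdwtWeights(u32 E, u32 N) {
  std::vector<double> w(N);
  for (u32 k = 0; k < N; ++k) {
    double x = double(k) * E / N;          // k*E/N
    w[k] = std::exp2(std::ceil(x) - x);    // 2^(ceil(k*E/N) - k*E/N)
  }
  return w;
}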