Skip to content

Commit

Permalink
Consistent naming tailSquare() tailMul()
Browse files Browse the repository at this point in the history
  • Loading branch information
preda committed Apr 25, 2024
1 parent 37e0c48 commit 408dc31
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 25 deletions.
21 changes: 8 additions & 13 deletions src/Gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -349,11 +349,11 @@ Gpu::Gpu(const Args& args, u32 E, u32 W, u32 BIG_H, u32 SMALL_H, u32 nW, u32 nH,
K(transposeIn, "transpose.cl", "transposeIn", "", 64, hN / 64),
K(transposeOut, "transpose.cl", "transposeOut", "", 64, hN / 64),

K(tailSquare, "tailsquare.cl", "tailFusedSquare", "-DTAIL_FUSED_LOW=0", SMALL_H / nH, hN / nH / 2),
K(tailSquareLow, "tailsquare.cl", "tailFusedSquare", "-DTAIL_FUSED_LOW=1", SMALL_H / nH, hN / nH / 2),
K(tailSquare, "tailsquare.cl", "tailSquare", "-DMUL_LOW=0", SMALL_H / nH, hN / nH / 2),
K(tailSquareLow, "tailsquare.cl", "tailSquare", "-DMUL_LOW=1", SMALL_H / nH, hN / nH / 2),

K(tailFusedMul, "tailfusedmul.cl", "tailFusedMul", "-DMUL_LOW=0", SMALL_H / nH, hN / nH / 2),
K(tailFusedMulLow, "tailfusedmul.cl", "tailFusedMul", "-DMUL_LOW=1", SMALL_H / nH, hN / nH / 2),
K(tailMul, "tailmul.cl", "tailMul", "-DMUL_LOW=0", SMALL_H / nH, hN / nH / 2),
K(tailMulLow, "tailmul.cl", "tailMul", "-DMUL_LOW=1", SMALL_H / nH, hN / nH / 2),

K(readResidue, "etc.cl", "readResidue", "", 64, 64),
K(isNotZero, "etc.cl", "isNotZero", "", 256, 256 * 256),
Expand Down Expand Up @@ -396,7 +396,7 @@ Gpu::Gpu(const Args& args, u32 E, u32 W, u32 BIG_H, u32 SMALL_H, u32 nW, u32 nH,
&fftP, &fftW, &fftHin, &fftHout,
&fftMiddleIn, &fftMiddleOut, &kernCarryA, &kernCarryM, &carryB,
&transposeIn, &transposeOut,
&tailFusedMulLow, &tailFusedMul, &tailSquare, &tailSquareLow,
&tailMulLow, &tailMul, &tailSquare, &tailSquareLow,
&readResidue, &isNotZero, &isEqual, &sum64}) {
k->load(compiler, device);
}
Expand All @@ -416,8 +416,8 @@ Gpu::Gpu(const Args& args, u32 E, u32 W, u32 BIG_H, u32 SMALL_H, u32 nW, u32 nH,
kernCarryA.setFixedArgs(3, bufCarry, bufBitsC, bufROE, bufThreadWeights, bufCarryWeights);
kernCarryM.setFixedArgs(3, bufCarry, bufBitsC, bufROE, bufThreadWeights, bufCarryWeights);
carryB.setFixedArgs(1, bufCarry, bufBitsC);
tailFusedMulLow.setFixedArgs(3, bufTrigH, bufTrig2SH, bufTrigBHW);
tailFusedMul.setFixedArgs(3, bufTrigH, bufTrig2SH, bufTrigBHW);
tailMulLow.setFixedArgs(3, bufTrigH, bufTrig2SH, bufTrigBHW);
tailMul.setFixedArgs(3, bufTrigH, bufTrig2SH, bufTrigBHW);
tailSquare.setFixedArgs(2, bufTrigH, bufTrig2SH, bufTrigBHW);
tailSquareLow.setFixedArgs(2, bufTrigH, bufTrig2SH, bufTrigBHW);

Expand Down Expand Up @@ -555,10 +555,6 @@ vector<u32> Gpu::readAndCompress(ConstBuffer<int>& buf) {
// Returns the result of passing bufCheck through readAndCompress().
vector<u32> Gpu::readCheck() { return readAndCompress(bufCheck); }
// Returns the result of passing bufData through readAndCompress().
vector<u32> Gpu::readData() { return readAndCompress(bufData); }

// Thin forwarding wrapper: invokes tailFusedMul with the same three buffers
// (out, in, inTmp), in the same order, and does nothing else.
void Gpu::tailMul(Buffer<double>& out, Buffer<double>& in, Buffer<double>& inTmp) {
tailFusedMul(out, in, inTmp);
}

// out := inA * inB;
void Gpu::mul(Buffer<int>& out, Buffer<int>& inA, Buffer<double>& inB, Buffer<double>& tmp1, Buffer<double>& tmp2, bool mul3) {
fftP(tmp1, inA);
Expand All @@ -585,7 +581,6 @@ void Gpu::mul(Buffer<int>& io, Buffer<int>& inB) { mul(io, io, inB); }

// In-place multiply: io := io * buf1.
// Forwards to the six-argument mul() with io as both the output and the first input,
// buf1 as the second input, buf2 and buf3 as the two temporaries, and mul3 = false.
void Gpu::mul(Buffer<int>& io, Buffer<double>& buf1) {
// We know that coreStep() stores double output in buf1; so we're going to use buf2 & buf3 for temps.
// tW(buf2, buf1);
mul(io, io, buf1, buf2, buf3, false);
}

Expand Down Expand Up @@ -772,7 +767,7 @@ void Gpu::exponentiateCore(Buffer<double>& out, const Buffer<double>& base, u64
if (testBit(exp, p)) {
doCarry(tmp, out);
fftMiddleIn(out, tmp);
tailFusedMulLow(tmp, out, base);
tailMulLow(tmp, out, base);
fftMiddleOut(out, tmp);
}

Expand Down
6 changes: 2 additions & 4 deletions src/Gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ class Gpu {

Kernel tailSquare;
Kernel tailSquareLow;
Kernel tailFusedMul;
Kernel tailFusedMulLow;
Kernel tailMul;
Kernel tailMulLow;

Kernel readResidue;
Kernel isNotZero;
Expand Down Expand Up @@ -162,9 +162,7 @@ class Gpu {

void topHalf(Buffer<double>& out, Buffer<double>& inTmp);
void writeState(const vector<u32> &check, u32 blockSize, Buffer<double>&, Buffer<double>&, Buffer<double>&);
void tailMul(Buffer<double>& out, Buffer<double>& in, Buffer<double>& inTmp);


Gpu(const Args& args, u32 E, u32 W, u32 BIG_H, u32 SMALL_H, u32 nW, u32 nH,
cl_device_id device, bool timeKernels, bool useLongCarry, struct Weights&& weights);

Expand Down
10 changes: 5 additions & 5 deletions src/bundle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2485,7 +2485,7 @@ void middleShuffle(local T *lds, T2 *u, u32 workgroupSize, u32 blockSize) {
}
)cltag",

// src/cl/tailfusedmul.cl
// src/cl/tailmul.cl
R"cltag(
// Copyright (C) Mihai Preda and George Woltman
Expand Down Expand Up @@ -2547,7 +2547,7 @@ void pairMul(u32 N, T2 *u, T2 *v, T2 *p, T2 *q, T2 base_squared, bool special) {
}
}
KERNEL(G_H) tailFusedMul(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig,
KERNEL(G_H) tailMul(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig,
BigTab TRIG_2SH, BigTab TRIG_BHW) {
// The arguments smallTrig1, smallTrig2 point to the same data; they are passed in as two buffers instead of one
// in order to work-around the ROCm optimizer which would otherwise "cache" the data once read into VGPRs, leading
Expand Down Expand Up @@ -2678,7 +2678,7 @@ void pairSq(u32 N, T2 *u, T2 *v, T2 base_squared, bool special) {
}
}
KERNEL(G_H) tailFusedSquare(P(T2) out, CP(T2) in, Trig smallTrig, BigTab TRIG_2SH, BigTab TRIG_BHW) {
KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig, BigTab TRIG_2SH, BigTab TRIG_BHW) {
local T2 lds[SMALL_HEIGHT / 2];
T2 u[NH], v[NH];
Expand All @@ -2691,7 +2691,7 @@ KERNEL(G_H) tailFusedSquare(P(T2) out, CP(T2) in, Trig smallTrig, BigTab TRIG_2S
u32 memline1 = transPos(line1, MIDDLE, WIDTH);
u32 memline2 = transPos(line2, MIDDLE, WIDTH);
#if TAIL_FUSED_LOW
#if MUL_LOW
read(G_H, NH, u, in, memline1 * SMALL_HEIGHT);
read(G_H, NH, v, in, memline2 * SMALL_HEIGHT);
#else
Expand Down Expand Up @@ -3006,6 +3006,6 @@ double2 slowTrig_N(u32 k, u32 kBound, BigTab TRIG_BHW) {
)cltag",

};
static const std::vector<const char*> CL_FILE_NAMES{"carry.cl","carryb.cl","carryfused.cl","carryinc.cl","carryutil.cl","etc.cl","fft10.cl","fft11.cl","fft12.cl","fft13.cl","fft14.cl","fft15.cl","fft5.cl","fft6.cl","fft7.cl","fft9.cl","fftheight.cl","ffthin.cl","ffthout.cl","fftmiddlein.cl","fftmiddleout.cl","fftp.cl","fftw.cl","fftwidth.cl","gpuowl.cl","middle.cl","tailfusedmul.cl","tailsquare.cl","tailutil.cl","transpose.cl","trig.cl",};
static const std::vector<const char*> CL_FILE_NAMES{"carry.cl","carryb.cl","carryfused.cl","carryinc.cl","carryutil.cl","etc.cl","fft10.cl","fft11.cl","fft12.cl","fft13.cl","fft14.cl","fft15.cl","fft5.cl","fft6.cl","fft7.cl","fft9.cl","fftheight.cl","ffthin.cl","ffthout.cl","fftmiddlein.cl","fftmiddleout.cl","fftp.cl","fftw.cl","fftwidth.cl","gpuowl.cl","middle.cl","tailmul.cl","tailsquare.cl","tailutil.cl","transpose.cl","trig.cl",};
// Returns a const reference to CL_FILE_NAMES, the file-static list of bundled .cl file names.
const std::vector<const char*>& getClFileNames() { return CL_FILE_NAMES; }
// Returns a const reference to CL_FILES.
// NOTE(review): CL_FILES is presumably the table of raw-string .cl sources, index-aligned with CL_FILE_NAMES — confirm.
const std::vector<const char*>& getClFiles() { return CL_FILES; }
2 changes: 1 addition & 1 deletion src/cl/tailfusedmul.cl → src/cl/tailmul.cl
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ void pairMul(u32 N, T2 *u, T2 *v, T2 *p, T2 *q, T2 base_squared, bool special) {
}
}

KERNEL(G_H) tailFusedMul(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig,
KERNEL(G_H) tailMul(P(T2) out, CP(T2) in, CP(T2) a, Trig smallTrig,
BigTab TRIG_2SH, BigTab TRIG_BHW) {
// The arguments smallTrig1, smallTrig2 point to the same data; they are passed in as two buffers instead of one
// in order to work-around the ROCm optimizer which would otherwise "cache" the data once read into VGPRs, leading
Expand Down
4 changes: 2 additions & 2 deletions src/cl/tailsquare.cl
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ void pairSq(u32 N, T2 *u, T2 *v, T2 base_squared, bool special) {
}
}

KERNEL(G_H) tailFusedSquare(P(T2) out, CP(T2) in, Trig smallTrig, BigTab TRIG_2SH, BigTab TRIG_BHW) {
KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig, BigTab TRIG_2SH, BigTab TRIG_BHW) {
local T2 lds[SMALL_HEIGHT / 2];

T2 u[NH], v[NH];
Expand All @@ -68,7 +68,7 @@ KERNEL(G_H) tailFusedSquare(P(T2) out, CP(T2) in, Trig smallTrig, BigTab TRIG_2S
u32 memline1 = transPos(line1, MIDDLE, WIDTH);
u32 memline2 = transPos(line2, MIDDLE, WIDTH);

#if TAIL_FUSED_LOW
#if MUL_LOW
read(G_H, NH, u, in, memline1 * SMALL_HEIGHT);
read(G_H, NH, v, in, memline2 * SMALL_HEIGHT);
#else
Expand Down

0 comments on commit 408dc31

Please sign in to comment.