Skip to content

Commit

Permalink
sw: Use an m4 length multiplier in the FFT kernel
Browse files Browse the repository at this point in the history
  • Loading branch information
suehtamacv committed Jun 3, 2023
1 parent b9bd1c4 commit 8e76621
Show file tree
Hide file tree
Showing 6 changed files with 63 additions and 97 deletions.
6 changes: 3 additions & 3 deletions hw/system/spatz_cluster/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ sw: clean.sw
## Build SW into sw/build with the LLVM toolchain (including tests) for Questasim simulator
sw.vsim: clean.sw bin/spatz_cluster.vsim
mkdir -p sw/build
cd sw/build && ${CMAKE} -DLLVM_PATH=${LLVM_INSTALL_DIR} -DGCC_PATH=${GCC_INSTALL_DIR} -DPYTHON=${PYTHON} -DSNITCH_SIMULATOR=../../../../../hw/system/spatz_cluster/bin/spatz_cluster.vsim -DBUILD_TESTS=ON .. && make
cd sw/build && ${CMAKE} -DLLVM_PATH=${LLVM_INSTALL_DIR} -DGCC_PATH=${GCC_INSTALL_DIR} -DPYTHON=${PYTHON} -DSNITCH_SIMULATOR=../../../../../hw/system/spatz_cluster/bin/spatz_cluster.vsim -DBUILD_TESTS=ON .. && make -j8

## Build SW and run all tests with Questasim simulator
sw.test.vsim: sw.vsim
Expand All @@ -143,7 +143,7 @@ sw.test.vsim: sw.vsim
## Build SW into sw/build with the LLVM toolchain (including tests) for VCS simulator
sw.vcs: clean.sw bin/spatz_cluster.vcs
mkdir -p sw/build
cd sw/build && ${CMAKE} -DLLVM_PATH=${LLVM_INSTALL_DIR} -DGCC_PATH=${GCC_INSTALL_DIR} -DPYTHON=${PYTHON} -DSNITCH_SIMULATOR=../../../../../hw/system/spatz_cluster/bin/spatz_cluster.vcs -DBUILD_TESTS=ON .. && make
cd sw/build && ${CMAKE} -DLLVM_PATH=${LLVM_INSTALL_DIR} -DGCC_PATH=${GCC_INSTALL_DIR} -DPYTHON=${PYTHON} -DSNITCH_SIMULATOR=../../../../../hw/system/spatz_cluster/bin/spatz_cluster.vcs -DBUILD_TESTS=ON .. && make -j8

## Build SW and run all tests with VCS simulator
sw.test.vcs: sw.vcs
Expand All @@ -153,7 +153,7 @@ sw.test.vcs: sw.vcs
## Build SW into sw/build with the LLVM toolchain (including tests) for Verilator simulator
sw.vlt: clean.sw verilate
mkdir -p sw/build
cd sw/build && ${CMAKE} -DLLVM_PATH=${LLVM_INSTALL_DIR} -DGCC_PATH=${GCC_INSTALL_DIR} -DPYTHON=${PYTHON} -DSNITCH_SIMULATOR=../../../../../hw/system/spatz_cluster/bin/spatz_cluster.vlt -DBUILD_TESTS=ON .. && make
cd sw/build && ${CMAKE} -DLLVM_PATH=${LLVM_INSTALL_DIR} -DGCC_PATH=${GCC_INSTALL_DIR} -DPYTHON=${PYTHON} -DSNITCH_SIMULATOR=../../../../../hw/system/spatz_cluster/bin/spatz_cluster.vlt -DBUILD_TESTS=ON .. && make -j8

## Build SW and run all tests with Verilator simulator
sw.test.vlt: sw.vlt
Expand Down
32 changes: 0 additions & 32 deletions sw/spatzBenchmarks/dp-fft/data/data_128_2.h
Original file line number Diff line number Diff line change
Expand Up @@ -367,38 +367,6 @@ static double twiddle_dram[512]
-0.9891765099647811,
-0.9951847266721969,
-0.9987954562051724,
-1.0,
-0.9987954562051724,
-0.9951847266721969,
-0.989176509964781,
-0.9807852804032303,
-0.970031253194544,
-0.9569403357322088,
-0.9415440651830208,
-0.9238795325112865,
-0.9039892931234431,
-0.8819212643483549,
-0.857728610000272,
-0.8314696123025452,
-0.803207531480645,
-0.7730104533627372,
-0.7409511253549589,
-0.7071067811865475,
-0.6715589548470184,
-0.6343932841636453,
-0.5956993044924332,
-0.555570233019602,
-0.5141027441932216,
-0.47139673682599764,
-0.4275550934302818,
-0.38268343236508967,
-0.3368898533922201,
-0.2902846772544621,
-0.24298017990326382,
-0.19509032201612836,
-0.14673047445536158,
-0.09801714032956059,
-0.049067674327417724,
-2.4492935982947064e-16,
-0.0980171403295605,
-0.19509032201612872,
Expand Down
107 changes: 52 additions & 55 deletions sw/spatzBenchmarks/dp-fft/kernel/fft.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ void fft_sc(double *s, double *buf, const double *twi, const uint16_t *seq_idx,
for (unsigned int bf = 0; bf < log2_nfft; ++bf) {
// Keep half of the samples in a vector register
avl = nfft >> 1;
asm volatile("vsetvli %0, %1, e64, m2, ta, ma" : "=r"(vl) : "r"(avl));
asm volatile("vsetvli %0, %1, e64, m4, ta, ma" : "=r"(vl) : "r"(avl));

// Swap between the two buffers
const double *i_buf = (bf & 1) ? buf : s;
Expand All @@ -68,46 +68,44 @@ void fft_sc(double *s, double *buf, const double *twi, const uint16_t *seq_idx,

// Stripmine the whole vector for this butterfly stage
for (; avl > 0; avl -= vl) {
// Stripmine
asm volatile("vsetvli %0, %1, e64, m4, ta, ma" : "=r"(vl) : "r"(avl));

// Load a portion of the vector (real part)
asm volatile("vle64.v v0, (%0);" ::"r"(re_u_i)); // v0: Re upper wing
re_u_i += vl;
asm volatile("vle64.v v2, (%0);" ::"r"(re_l_i)); // v2: Re lower wing
asm volatile("vle64.v v4, (%0);" ::"r"(re_l_i)); // v4: Re lower wing
re_l_i += vl;

asm volatile("vfadd.vv v8, v0, v2"); // v8: Re butterfly output upper wing
asm volatile(
"vfsub.vv v10, v0, v2"); // v10: Re butterfly output upper wing
"vfadd.vv v16, v0, v4"); // v16: Re butterfly output upper wing
asm volatile(
"vfsub.vv v20, v0, v4"); // v20: Re butterfly output lower wing

// Load a portion of the vector (imag part)
asm volatile("vle64.v v4, (%0);" ::"r"(im_u_i)); // v4: Im upper wing
asm volatile("vle64.v v8, (%0);" ::"r"(im_u_i)); // v8: Im upper wing
im_u_i += vl;
asm volatile("vle64.v v6, (%0);" ::"r"(im_l_i)); // v6: Im lower wing
asm volatile("vle64.v v12, (%0);" ::"r"(im_l_i)); // v12: Im lower wing
im_l_i += vl;

// Store the results of the last iteration
if (avl != (nfft >> 1)) {
asm volatile("vsuxei16.v v20, (%0), v24" ::"r"(re_l_o));
asm volatile("vsuxei16.v v22, (%0), v24" ::"r"(im_l_o));
}

asm volatile(
"vfadd.vv v12, v4, v6"); // v12: Im butterfly output upper wing
"vfadd.vv v0, v8, v12"); // v0: Im butterfly output upper wing
asm volatile(
"vfsub.vv v14, v4, v6"); // v14: Im butterfly output upper wing
"vfsub.vv v4, v8, v12"); // v4: Im butterfly output lower wing

// Load the twiddle vector
asm volatile("vle64.v v16, (%0);" ::"r"(re_t)); // v16: Re twi
asm volatile("vle64.v v24, (%0);" ::"r"(re_t)); // v24: Re twi
re_t += vl;
asm volatile("vle64.v v18, (%0);" ::"r"(im_t)); // v18: Im twi
asm volatile("vle64.v v28, (%0);" ::"r"(im_t)); // v28: Im twi
im_t += vl;

// Twiddle the lower wing
asm volatile("vfmul.vv v20, v10, v16");
asm volatile("vfmul.vv v22, v14, v16");
asm volatile("vfnmsac.vv v20, v14, v18"); // v20: Re butterfly output
// twiddled lower wing
asm volatile("vfmacc.vv v22, v10, v18"); // v22: Im butterfly output
// twiddled lower wing
asm volatile("vfmul.vv v8, v20, v24");
asm volatile("vfmul.vv v12, v4, v24");
asm volatile("vfnmsac.vv v8, v4, v28"); // v8: Re butterfly output
// twiddled lower wing
asm volatile("vfmacc.vv v12, v20, v28"); // v12: Im butterfly output
// twiddled lower wing

// Load the index vector. If last step, it's the bitrev index vector.
// Otherwise, it's the helper index for the permutations (this is a mask
Expand All @@ -130,22 +128,20 @@ void fft_sc(double *s, double *buf, const double *twi, const uint16_t *seq_idx,
im_l_o = im_u_o + (nfft >> 2);
}

asm volatile("vsuxei16.v v8, (%0), v24" ::"r"(re_u_o));
asm volatile("vsuxei16.v v12, (%0), v24" ::"r"(im_u_o));
asm volatile("vsuxei16.v v16, (%0), v24" ::"r"(re_u_o));
asm volatile("vsuxei16.v v0, (%0), v24" ::"r"(im_u_o));
asm volatile("vsuxei16.v v8, (%0), v24" ::"r"(re_l_o));
asm volatile("vsuxei16.v v12, (%0), v24" ::"r"(im_l_o));
}

// Store the results of the last iteration
asm volatile("vsuxei16.v v20, (%0), v24" ::"r"(re_l_o));
asm volatile("vsuxei16.v v22, (%0), v24" ::"r"(im_l_o));
}
}

// The first log2(n_cores) butterflies are special; after them, we fall back
// into the single-core case. This is a hardcoded two-core implementation of
// the FFT. The fall-back is now done directly in the main function, so this
// function implements just the first butterfly stage of a 2-core
// implementation.
void fft_2c(double *s, const double *twi, const unsigned int nfft,
const unsigned int cid) {
void fft_2c(const double *s, double *buf, const double *twi,
const unsigned int nfft, const unsigned int cid) {
// Log2(nfft). We can also pass it directly as a function argument
unsigned int log2_nfft = 31 - __builtin_clz(nfft >> 1);

Expand All @@ -160,64 +156,65 @@ void fft_2c(double *s, const double *twi, const unsigned int nfft,
const double *im_t = cid ? twi + (nfft >> 2) * (log2_nfft + 3)
: twi + (nfft >> 2) * (log2_nfft + 2);

asm volatile("vsetvli %0, %1, e64, m2, ta, ma" : "=r"(vl) : "r"(avl));
asm volatile("vsetvli %0, %1, e64, m4, ta, ma" : "=r"(vl) : "r"(avl));

// Overwrite the buffers
const double *i_buf = s + cid * (nfft >> 2);
double *o_buf = s + cid * (nfft >> 2);
double *o_buf = buf + cid * (nfft >> 2);

// Update pointers
const double *re_u_i = i_buf;
const double *im_u_i = i_buf + nfft;
const double *re_l_i = re_u_i + (nfft >> 1);
const double *im_l_i = im_u_i + (nfft >> 1);
double *re_u_o = (double *)re_u_i;
double *im_u_o = (double *)im_u_i;
double *re_l_o = (double *)re_l_i;
double *im_l_o = (double *)im_l_i;
double *re_u_o = o_buf;
double *im_u_o = o_buf + nfft;
double *re_l_o = re_u_o + (nfft >> 1);
double *im_l_o = im_u_o + (nfft >> 1);

// Stripmine the whole vector for this butterfly stage
for (; avl > 0; avl -= vl) {
asm volatile("vsetvli %0, %1, e64, m2, ta, ma" : "=r"(vl) : "r"(avl));
asm volatile("vsetvli %0, %1, e64, m4, ta, ma" : "=r"(vl) : "r"(avl));
// Load a portion of the vector
asm volatile("vle64.v v0, (%0);" ::"r"(re_u_i)); // v0: Re upper wing
re_u_i += vl;
asm volatile("vle64.v v2, (%0);" ::"r"(re_l_i)); // v2: Re lower wing
asm volatile("vle64.v v4, (%0);" ::"r"(re_l_i)); // v4: Re lower wing
re_l_i += vl;
asm volatile("vle64.v v4, (%0);" ::"r"(im_u_i)); // v4: Im upper wing
asm volatile("vle64.v v8, (%0);" ::"r"(im_u_i)); // v8: Im upper wing
im_u_i += vl;
asm volatile("vle64.v v6, (%0);" ::"r"(im_l_i)); // v6: Im lower wing
asm volatile("vle64.v v12, (%0);" ::"r"(im_l_i)); // v12: Im lower wing
im_l_i += vl;

// Butterfly upper wing
asm volatile("vfadd.vv v8, v0, v2"); // v8: Re butterfly output upper wing
asm volatile("vfadd.vv v12, v4, v6"); // v12: Im butterfly output upper wing
asm volatile("vfadd.vv v16, v0, v4"); // v16: Re butterfly output upper wing
asm volatile(
"vfadd.vv v20, v8, v12"); // v20: Im butterfly output upper wing
// Butterfly lower wing
asm volatile("vfsub.vv v10, v0, v2"); // v10: Re butterfly output upper wing
asm volatile("vfsub.vv v14, v4, v6"); // v14: Im butterfly output upper wing
asm volatile("vfsub.vv v0, v0, v4"); // v0: Re butterfly output lower wing
asm volatile("vfsub.vv v4, v8, v12"); // v4: Im butterfly output lower wing

// Load the twiddle vector
asm volatile("vle64.v v16, (%0);" ::"r"(re_t)); // v16: Re twi
asm volatile("vle64.v v8, (%0);" ::"r"(re_t)); // v8: Re twi
re_t += vl;
asm volatile("vle64.v v18, (%0);" ::"r"(im_t)); // v18: Im twi
asm volatile("vle64.v v12, (%0);" ::"r"(im_t)); // v12: Im twi
im_t += vl;

// Twiddle the lower wing
asm volatile("vfmul.vv v20, v10, v16");
asm volatile("vfnmsac.vv v20, v14, v18"); // v20: Re butterfly output
// twiddled lower wing
asm volatile("vfmul.vv v22, v10, v18");
asm volatile("vfmacc.vv v22, v14, v16"); // v22: Im butterfly output
asm volatile("vfmul.vv v24, v0, v8");
asm volatile("vfnmsac.vv v24, v4, v12"); // v24: Re butterfly output
// twiddled lower wing
asm volatile("vfmul.vv v28, v0, v12");
asm volatile("vfmacc.vv v28, v4, v8"); // v28: Im butterfly output
// twiddled lower wing

// Store 1:1 the output result
asm volatile("vse64.v v8, (%0)" ::"r"(re_u_o));
asm volatile("vse64.v v16, (%0)" ::"r"(re_u_o));
re_u_o += vl;
asm volatile("vse64.v v12, (%0)" ::"r"(im_u_o));
asm volatile("vse64.v v20, (%0)" ::"r"(im_u_o));
im_u_o += vl;
asm volatile("vse64.v v20, (%0)" ::"r"(re_l_o));
asm volatile("vse64.v v24, (%0)" ::"r"(re_l_o));
re_l_o += vl;
asm volatile("vse64.v v22, (%0)" ::"r"(im_l_o));
asm volatile("vse64.v v28, (%0)" ::"r"(im_l_o));
im_l_o += vl;
}
}
5 changes: 3 additions & 2 deletions sw/spatzBenchmarks/dp-fft/kernel/fft.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ inline void fft_sc(double *s, double *buf, const double *twi,
const unsigned int nfft) __attribute__((always_inline));

// Dual-core
inline void fft_2c(double *s, const double *twi, const unsigned int nfft,
const unsigned int cid) __attribute__((always_inline));
inline void fft_2c(const double *s, double *buf, const double *twi,
const unsigned int nfft, const unsigned int cid)
__attribute__((always_inline));

#endif
8 changes: 4 additions & 4 deletions sw/spatzBenchmarks/dp-fft/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,9 @@ int main() {
snrt_cluster_hw_barrier();

// Calculate pointers for the second butterfly onwards
double *s_ = samples + cid * (NFFT >> 1);
double *buf_ = buffer + cid * (NFFT >> 1);
double *twi_ = twiddle + (NFFT >> 2);
double *s_ = samples + cid * (NFFT >> 2);
double *buf_ = buffer + cid * (NFFT >> 2);
double *twi_ = twiddle + (NFFT >> 1);

// Wait for all cores to finish
snrt_cluster_hw_barrier();
Expand All @@ -95,7 +95,7 @@ int main() {
start_kernel();

// First stage
fft_2c(samples, twiddle, NFFT, cid);
fft_2c(samples, buffer, twiddle, NFFT, cid);

// Wait for all cores to finish the first stage
snrt_cluster_hw_barrier();
Expand Down
2 changes: 1 addition & 1 deletion sw/spatzBenchmarks/dp-fft/script/gen_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def main():
tbuf[N_T_BUF:2 * N_T_BUF] = twiddle_v_s[1::2]
# Attach 1bf img part
twiddle_vec_reim = np.concatenate(
(twiddle_vec_reim[:N_TWID_V], tbuf[N_T_BUF:N_T_BUF + NFFT // 2], twiddle_vec_reim[N_TWID_V:]))
(twiddle_vec_reim[:N_TWID_V], tbuf[N_T_BUF:N_T_BUF + NFFTh // 2], twiddle_vec_reim[N_TWID_V:]))
# Attach 1bf real part
twiddle_vec_reim = np.concatenate((tbuf[:NFFTh // 2], twiddle_vec_reim))

Expand Down

0 comments on commit 8e76621

Please sign in to comment.