sw: Use an m4 length multiplier in the FFT kernel

xiaorui-yin · Jun 3, 2023 · 8e76621 · 8e76621
1 parent b9bd1c4
commit 8e76621
Show file tree

Hide file tree

Showing 6 changed files with 63 additions and 97 deletions.
diff --git a/hw/system/spatz_cluster/Makefile b/hw/system/spatz_cluster/Makefile
@@ -133,7 +133,7 @@ sw: clean.sw
 ## Build SW into sw/build with the LLVM toolchain (including tests) for Questasim simulator
 sw.vsim: clean.sw bin/spatz_cluster.vsim
 	mkdir -p sw/build
-	cd sw/build && ${CMAKE} -DLLVM_PATH=${LLVM_INSTALL_DIR} -DGCC_PATH=${GCC_INSTALL_DIR} -DPYTHON=${PYTHON} -DSNITCH_SIMULATOR=../../../../../hw/system/spatz_cluster/bin/spatz_cluster.vsim -DBUILD_TESTS=ON .. && make
+	cd sw/build && ${CMAKE} -DLLVM_PATH=${LLVM_INSTALL_DIR} -DGCC_PATH=${GCC_INSTALL_DIR} -DPYTHON=${PYTHON} -DSNITCH_SIMULATOR=../../../../../hw/system/spatz_cluster/bin/spatz_cluster.vsim -DBUILD_TESTS=ON .. && make -j8
 
 ## Build SW and run all tests with Questasim simulator
 sw.test.vsim: sw.vsim
@@ -143,7 +143,7 @@ sw.test.vsim: sw.vsim
 ## Build SW into sw/build with the LLVM toolchain (including tests) for VCS simulator
 sw.vcs: clean.sw bin/spatz_cluster.vcs
 	mkdir -p sw/build
-	cd sw/build && ${CMAKE} -DLLVM_PATH=${LLVM_INSTALL_DIR} -DGCC_PATH=${GCC_INSTALL_DIR} -DPYTHON=${PYTHON} -DSNITCH_SIMULATOR=../../../../../hw/system/spatz_cluster/bin/spatz_cluster.vcs -DBUILD_TESTS=ON .. && make
+	cd sw/build && ${CMAKE} -DLLVM_PATH=${LLVM_INSTALL_DIR} -DGCC_PATH=${GCC_INSTALL_DIR} -DPYTHON=${PYTHON} -DSNITCH_SIMULATOR=../../../../../hw/system/spatz_cluster/bin/spatz_cluster.vcs -DBUILD_TESTS=ON .. && make -j8
 
 ## Build SW and run all tests with VCS simulator
 sw.test.vcs: sw.vcs
@@ -153,7 +153,7 @@ sw.test.vcs: sw.vcs
 ## Build SW into sw/build with the LLVM toolchain (including tests) for Verilator simulator
 sw.vlt: clean.sw verilate
 	mkdir -p sw/build
-	cd sw/build && ${CMAKE} -DLLVM_PATH=${LLVM_INSTALL_DIR} -DGCC_PATH=${GCC_INSTALL_DIR} -DPYTHON=${PYTHON} -DSNITCH_SIMULATOR=../../../../../hw/system/spatz_cluster/bin/spatz_cluster.vlt -DBUILD_TESTS=ON .. && make
+	cd sw/build && ${CMAKE} -DLLVM_PATH=${LLVM_INSTALL_DIR} -DGCC_PATH=${GCC_INSTALL_DIR} -DPYTHON=${PYTHON} -DSNITCH_SIMULATOR=../../../../../hw/system/spatz_cluster/bin/spatz_cluster.vlt -DBUILD_TESTS=ON .. && make -j8
 
 ## Build SW and run all tests with Verilator simulator
 sw.test.vlt: sw.vlt

diff --git a/sw/spatzBenchmarks/dp-fft/data/data_128_2.h b/sw/spatzBenchmarks/dp-fft/data/data_128_2.h
@@ -367,38 +367,6 @@ static double twiddle_dram[512]
                                          -0.9891765099647811,
                                          -0.9951847266721969,
                                          -0.9987954562051724,
-                                         -1.0,
-                                         -0.9987954562051724,
-                                         -0.9951847266721969,
-                                         -0.989176509964781,
-                                         -0.9807852804032303,
-                                         -0.970031253194544,
-                                         -0.9569403357322088,
-                                         -0.9415440651830208,
-                                         -0.9238795325112865,
-                                         -0.9039892931234431,
-                                         -0.8819212643483549,
-                                         -0.857728610000272,
-                                         -0.8314696123025452,
-                                         -0.803207531480645,
-                                         -0.7730104533627372,
-                                         -0.7409511253549589,
-                                         -0.7071067811865475,
-                                         -0.6715589548470184,
-                                         -0.6343932841636453,
-                                         -0.5956993044924332,
-                                         -0.555570233019602,
-                                         -0.5141027441932216,
-                                         -0.47139673682599764,
-                                         -0.4275550934302818,
-                                         -0.38268343236508967,
-                                         -0.3368898533922201,
-                                         -0.2902846772544621,
-                                         -0.24298017990326382,
-                                         -0.19509032201612836,
-                                         -0.14673047445536158,
-                                         -0.09801714032956059,
-                                         -0.049067674327417724,
                                          -2.4492935982947064e-16,
                                          -0.0980171403295605,
                                          -0.19509032201612872,

diff --git a/sw/spatzBenchmarks/dp-fft/kernel/fft.c b/sw/spatzBenchmarks/dp-fft/kernel/fft.c
@@ -46,7 +46,7 @@ void fft_sc(double *s, double *buf, const double *twi, const uint16_t *seq_idx,
   for (unsigned int bf = 0; bf < log2_nfft; ++bf) {
     // Keep half of the samples in a vector register
     avl = nfft >> 1;
-    asm volatile("vsetvli %0, %1, e64, m2, ta, ma" : "=r"(vl) : "r"(avl));
+    asm volatile("vsetvli %0, %1, e64, m4, ta, ma" : "=r"(vl) : "r"(avl));
 
     // Swap between the two buffers
     const double *i_buf = (bf & 1) ? buf : s;
@@ -68,46 +68,44 @@ void fft_sc(double *s, double *buf, const double *twi, const uint16_t *seq_idx,
 
     // Stripmine the whole vector for this butterfly stage
     for (; avl > 0; avl -= vl) {
+      // Stripmine
+      asm volatile("vsetvli %0, %1, e64, m4, ta, ma" : "=r"(vl) : "r"(avl));
+
       // Load a portion of the vector (real part)
       asm volatile("vle64.v v0, (%0);" ::"r"(re_u_i)); // v0: Re upper wing
       re_u_i += vl;
-      asm volatile("vle64.v v2, (%0);" ::"r"(re_l_i)); // v2: Re lower wing
+      asm volatile("vle64.v v4, (%0);" ::"r"(re_l_i)); // v4: Re lower wing
       re_l_i += vl;
 
-      asm volatile("vfadd.vv v8, v0, v2"); // v8: Re butterfly output upper wing
       asm volatile(
-          "vfsub.vv v10, v0, v2"); // v10: Re butterfly output upper wing
+          "vfadd.vv v16, v0, v4"); // v16: Re butterfly output upper wing
+      asm volatile(
+          "vfsub.vv v20, v0, v4"); // v20: Re butterfly output lower wing
 
       // Load a portion of the vector (imag part)
-      asm volatile("vle64.v v4, (%0);" ::"r"(im_u_i)); // v4: Im upper wing
+      asm volatile("vle64.v v8, (%0);" ::"r"(im_u_i)); // v8: Im upper wing
       im_u_i += vl;
-      asm volatile("vle64.v v6, (%0);" ::"r"(im_l_i)); // v6: Im lower wing
+      asm volatile("vle64.v v12, (%0);" ::"r"(im_l_i)); // v12: Im lower wing
       im_l_i += vl;
 
-      // Store the results of the last iteration
-      if (avl != (nfft >> 1)) {
-        asm volatile("vsuxei16.v v20, (%0), v24" ::"r"(re_l_o));
-        asm volatile("vsuxei16.v v22, (%0), v24" ::"r"(im_l_o));
-      }
-
       asm volatile(
-          "vfadd.vv v12, v4, v6"); // v12: Im butterfly output upper wing
+          "vfadd.vv v0, v8, v12"); // v0: Im butterfly output upper wing
       asm volatile(
-          "vfsub.vv v14, v4, v6"); // v14: Im butterfly output upper wing
+          "vfsub.vv v4, v8, v12"); // v4: Im butterfly output lower wing
 
       // Load the twiddle vector
-      asm volatile("vle64.v v16, (%0);" ::"r"(re_t)); // v16: Re twi
+      asm volatile("vle64.v v24, (%0);" ::"r"(re_t)); // v24: Re twi
       re_t += vl;
-      asm volatile("vle64.v v18, (%0);" ::"r"(im_t)); // v18: Im twi
+      asm volatile("vle64.v v28, (%0);" ::"r"(im_t)); // v28: Im twi
       im_t += vl;
 
       // Twiddle the lower wing
-      asm volatile("vfmul.vv v20, v10, v16");
-      asm volatile("vfmul.vv v22, v14, v16");
-      asm volatile("vfnmsac.vv v20, v14, v18"); // v20: Re butterfly output
-                                                // twiddled lower wing
-      asm volatile("vfmacc.vv v22, v10, v18");  // v22: Im butterfly output
-                                                // twiddled lower wing
+      asm volatile("vfmul.vv v8, v20, v24");
+      asm volatile("vfmul.vv v12, v4, v24");
+      asm volatile("vfnmsac.vv v8, v4, v28");  // v8: Re butterfly output
+                                               // twiddled lower wing
+      asm volatile("vfmacc.vv v12, v20, v28"); // v12: Im butterfly output
+                                               // twiddled lower wing
 
       // Load the index vector. If last step, it's the bitrev index vector.
       // Otherwise, it's the helper index for the permutations (this is a mask
@@ -130,22 +128,20 @@ void fft_sc(double *s, double *buf, const double *twi, const uint16_t *seq_idx,
         im_l_o = im_u_o + (nfft >> 2);
       }
 
-      asm volatile("vsuxei16.v v8, (%0), v24" ::"r"(re_u_o));
-      asm volatile("vsuxei16.v v12, (%0), v24" ::"r"(im_u_o));
+      asm volatile("vsuxei16.v v16, (%0), v24" ::"r"(re_u_o));
+      asm volatile("vsuxei16.v v0, (%0), v24" ::"r"(im_u_o));
+      asm volatile("vsuxei16.v v8, (%0), v24" ::"r"(re_l_o));
+      asm volatile("vsuxei16.v v12, (%0), v24" ::"r"(im_l_o));
     }
-
-    // Store the results of the last iteration
-    asm volatile("vsuxei16.v v20, (%0), v24" ::"r"(re_l_o));
-    asm volatile("vsuxei16.v v22, (%0), v24" ::"r"(im_l_o));
   }
 }
 
 // The first log2(n_cores) butterflies are special; then, we fall back into
 // the single-core case. Hardcoded two-core implementation of FFT. Now, the
 // fall-back is done directly in the main function. Therefore, this function
 // implements just the first butterfly stage of a 2-core implementation.
-void fft_2c(double *s, const double *twi, const unsigned int nfft,
-            const unsigned int cid) {
+void fft_2c(const double *s, double *buf, const double *twi,
+            const unsigned int nfft, const unsigned int cid) {
   // Log2(nfft). We can also pass it directly as a function argument
   unsigned int log2_nfft = 31 - __builtin_clz(nfft >> 1);
 
@@ -160,64 +156,65 @@ void fft_2c(double *s, const double *twi, const unsigned int nfft,
   const double *im_t = cid ? twi + (nfft >> 2) * (log2_nfft + 3)
                            : twi + (nfft >> 2) * (log2_nfft + 2);
 
-  asm volatile("vsetvli %0, %1, e64, m2, ta, ma" : "=r"(vl) : "r"(avl));
+  asm volatile("vsetvli %0, %1, e64, m4, ta, ma" : "=r"(vl) : "r"(avl));
 
   // Overwrite the buffers
   const double *i_buf = s + cid * (nfft >> 2);
-  double *o_buf = s + cid * (nfft >> 2);
+  double *o_buf = buf + cid * (nfft >> 2);
 
   // Update pointers
   const double *re_u_i = i_buf;
   const double *im_u_i = i_buf + nfft;
   const double *re_l_i = re_u_i + (nfft >> 1);
   const double *im_l_i = im_u_i + (nfft >> 1);
-  double *re_u_o = (double *)re_u_i;
-  double *im_u_o = (double *)im_u_i;
-  double *re_l_o = (double *)re_l_i;
-  double *im_l_o = (double *)im_l_i;
+  double *re_u_o = o_buf;
+  double *im_u_o = o_buf + nfft;
+  double *re_l_o = re_u_o + (nfft >> 1);
+  double *im_l_o = im_u_o + (nfft >> 1);
 
   // Stripmine the whole vector for this butterfly stage
   for (; avl > 0; avl -= vl) {
-    asm volatile("vsetvli %0, %1, e64, m2, ta, ma" : "=r"(vl) : "r"(avl));
+    asm volatile("vsetvli %0, %1, e64, m4, ta, ma" : "=r"(vl) : "r"(avl));
     // Load a portion of the vector
     asm volatile("vle64.v v0, (%0);" ::"r"(re_u_i)); // v0: Re upper wing
     re_u_i += vl;
-    asm volatile("vle64.v v2, (%0);" ::"r"(re_l_i)); // v2: Re lower wing
+    asm volatile("vle64.v v4, (%0);" ::"r"(re_l_i)); // v4: Re lower wing
     re_l_i += vl;
-    asm volatile("vle64.v v4, (%0);" ::"r"(im_u_i)); // v4: Im upper wing
+    asm volatile("vle64.v v8, (%0);" ::"r"(im_u_i)); // v8: Im upper wing
     im_u_i += vl;
-    asm volatile("vle64.v v6, (%0);" ::"r"(im_l_i)); // v6: Im lower wing
+    asm volatile("vle64.v v12, (%0);" ::"r"(im_l_i)); // v12: Im lower wing
     im_l_i += vl;
 
     // Butterfly upper wing
-    asm volatile("vfadd.vv v8, v0, v2");  // v8: Re butterfly output upper wing
-    asm volatile("vfadd.vv v12, v4, v6"); // v12: Im butterfly output upper wing
+    asm volatile("vfadd.vv v16, v0, v4"); // v16: Re butterfly output upper wing
+    asm volatile(
+        "vfadd.vv v20, v8, v12"); // v20: Im butterfly output upper wing
     // Butterfly lower wing
-    asm volatile("vfsub.vv v10, v0, v2"); // v10: Re butterfly output upper wing
-    asm volatile("vfsub.vv v14, v4, v6"); // v14: Im butterfly output upper wing
+    asm volatile("vfsub.vv v0, v0, v4");  // v0: Re butterfly output lower wing
+    asm volatile("vfsub.vv v4, v8, v12"); // v4: Im butterfly output lower wing
 
     // Load the twiddle vector
-    asm volatile("vle64.v v16, (%0);" ::"r"(re_t)); // v16: Re twi
+    asm volatile("vle64.v v8, (%0);" ::"r"(re_t)); // v8: Re twi
     re_t += vl;
-    asm volatile("vle64.v v18, (%0);" ::"r"(im_t)); // v18: Im twi
+    asm volatile("vle64.v v12, (%0);" ::"r"(im_t)); // v12: Im twi
     im_t += vl;
 
     // Twiddle the lower wing
-    asm volatile("vfmul.vv v20, v10, v16");
-    asm volatile("vfnmsac.vv v20, v14, v18"); // v20: Re butterfly output
-                                              // twiddled lower wing
-    asm volatile("vfmul.vv v22, v10, v18");
-    asm volatile("vfmacc.vv v22, v14, v16"); // v22: Im butterfly output
+    asm volatile("vfmul.vv v24, v0, v8");
+    asm volatile("vfnmsac.vv v24, v4, v12"); // v24: Re butterfly output
                                              // twiddled lower wing
+    asm volatile("vfmul.vv v28, v0, v12");
+    asm volatile("vfmacc.vv v28, v4, v8"); // v28: Im butterfly output
+                                           // twiddled lower wing
 
     // Store 1:1 the output result
-    asm volatile("vse64.v v8, (%0)" ::"r"(re_u_o));
+    asm volatile("vse64.v v16, (%0)" ::"r"(re_u_o));
     re_u_o += vl;
-    asm volatile("vse64.v v12, (%0)" ::"r"(im_u_o));
+    asm volatile("vse64.v v20, (%0)" ::"r"(im_u_o));
     im_u_o += vl;
-    asm volatile("vse64.v v20, (%0)" ::"r"(re_l_o));
+    asm volatile("vse64.v v24, (%0)" ::"r"(re_l_o));
     re_l_o += vl;
-    asm volatile("vse64.v v22, (%0)" ::"r"(im_l_o));
+    asm volatile("vse64.v v28, (%0)" ::"r"(im_l_o));
     im_l_o += vl;
   }
 }
diff --git a/sw/spatzBenchmarks/dp-fft/kernel/fft.h b/sw/spatzBenchmarks/dp-fft/kernel/fft.h
@@ -28,7 +28,8 @@ inline void fft_sc(double *s, double *buf, const double *twi,
                    const unsigned int nfft) __attribute__((always_inline));
 
 // Dual-core
-inline void fft_2c(double *s, const double *twi, const unsigned int nfft,
-                   const unsigned int cid) __attribute__((always_inline));
+inline void fft_2c(const double *s, double *buf, const double *twi,
+                   const unsigned int nfft, const unsigned int cid)
+    __attribute__((always_inline));
 
 #endif
diff --git a/sw/spatzBenchmarks/dp-fft/main.c b/sw/spatzBenchmarks/dp-fft/main.c
@@ -80,9 +80,9 @@ int main() {
   snrt_cluster_hw_barrier();
 
   // Calculate pointers for the second butterfly onwards
-  double *s_ = samples + cid * (NFFT >> 1);
-  double *buf_ = buffer + cid * (NFFT >> 1);
-  double *twi_ = twiddle + (NFFT >> 2);
+  double *s_ = samples + cid * (NFFT >> 2);
+  double *buf_ = buffer + cid * (NFFT >> 2);
+  double *twi_ = twiddle + (NFFT >> 1);
 
   // Wait for all cores to finish
   snrt_cluster_hw_barrier();
@@ -95,7 +95,7 @@ int main() {
     start_kernel();
 
   // First stage
-  fft_2c(samples, twiddle, NFFT, cid);
+  fft_2c(samples, buffer, twiddle, NFFT, cid);
 
   // Wait for all cores to finish the first stage
   snrt_cluster_hw_barrier();

diff --git a/sw/spatzBenchmarks/dp-fft/script/gen_data.py b/sw/spatzBenchmarks/dp-fft/script/gen_data.py
@@ -235,7 +235,7 @@ def main():
         tbuf[N_T_BUF:2 * N_T_BUF] = twiddle_v_s[1::2]
         # Attach 1bf img part
         twiddle_vec_reim = np.concatenate(
-            (twiddle_vec_reim[:N_TWID_V], tbuf[N_T_BUF:N_T_BUF + NFFT // 2], twiddle_vec_reim[N_TWID_V:]))
+            (twiddle_vec_reim[:N_TWID_V], tbuf[N_T_BUF:N_T_BUF + NFFTh // 2], twiddle_vec_reim[N_TWID_V:]))
         # Attach 1bf real part
         twiddle_vec_reim = np.concatenate((tbuf[:NFFTh // 2], twiddle_vec_reim))