clear the L2 cache between consecutive invocations of our microbenchm…

…arks to get reliable results
Zhongeating · Apr 30, 2024 · d7813d2 · d7813d2
1 parent 2490f78
commit d7813d2
Showing 1 changed file with 25 additions and 7 deletions.
diff --git a/dev/cuda/common.h b/dev/cuda/common.h
@@ -148,7 +148,7 @@ void validate_result(D* device_result, const T* cpu_reference, const char* name,
             printf("%f %f\n", cpu_reference[i], (T)out_gpu[i]);
         }
         // ensure correctness for all elements. We can set an "ignore" mask by writing NaN
-        if (fabs(cpu_reference[i] - (T)out_gpu[i]) > tolerance && !isnan(cpu_reference[i])) {
+        if (fabs(cpu_reference[i] - (T)out_gpu[i]) > tolerance && isfinite(cpu_reference[i])) {
             printf("Mismatch of %s at %d: CPU_ref: %f vs GPU: %f\n", name, i, cpu_reference[i], (T)out_gpu[i]);
             nfaults ++;
             if (nfaults >= 10) {
@@ -169,17 +169,35 @@ void validate_result(D* device_result, const T* cpu_reference, const char* name,
 template<class Kernel, class... KernelArgs>
 float benchmark_kernel(int repeats, Kernel kernel, KernelArgs&&... kernel_args) {
     cudaEvent_t start, stop;
+    // prepare buffer to scrub L2 cache between benchmarks
+    // just memset a large dummy array, recommended by
+    // https://stackoverflow.com/questions/31429377/how-can-i-clear-flush-the-l2-cache-and-the-tlb-of-a-gpu
+    // and apparently used in nvbench.
+    int deviceIdx = 0;
+    cudaCheck(cudaSetDevice(deviceIdx));
+    cudaDeviceProp deviceProp;
+    cudaCheck(cudaGetDeviceProperties(&deviceProp, deviceIdx));
+    void* flush_buffer;
+    cudaCheck(cudaMalloc(&flush_buffer, deviceProp.l2CacheSize));
+
     cudaCheck(cudaEventCreate(&start));
     cudaCheck(cudaEventCreate(&stop));
-    cudaCheck(cudaEventRecord(start, nullptr));
+    float elapsed_time = 0.f;
     for (int i = 0; i < repeats; i++) {
+        // clear L2
+        cudaCheck(cudaMemset(flush_buffer, 0, deviceProp.l2CacheSize));
+        // now we can start recording the timing of the kernel
+        cudaCheck(cudaEventRecord(start, nullptr));
         kernel(std::forward<KernelArgs>(kernel_args)...);
+        cudaCheck(cudaEventRecord(stop, nullptr));
+        cudaCheck(cudaEventSynchronize(start));
+        cudaCheck(cudaEventSynchronize(stop));
+        float single_call;
+        cudaCheck(cudaEventElapsedTime(&single_call, start, stop));
+        elapsed_time += single_call;
     }
-    cudaCheck(cudaEventRecord(stop, nullptr));
-    cudaCheck(cudaEventSynchronize(start));
-    cudaCheck(cudaEventSynchronize(stop));
-    float elapsed_time;
-    cudaCheck(cudaEventElapsedTime(&elapsed_time, start, stop));
+
+    cudaCheck(cudaFree(flush_buffer));
 
     return elapsed_time / repeats;
 }