Skip to content

Commit

Permalink
clear the L2 cache between consecutive invocations of our microbenchm…
Browse files Browse the repository at this point in the history
…arks to get reliable results
  • Loading branch information
ngc92 committed Apr 30, 2024
1 parent 2490f78 commit d7813d2
Showing 1 changed file with 25 additions and 7 deletions.
32 changes: 25 additions & 7 deletions dev/cuda/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ void validate_result(D* device_result, const T* cpu_reference, const char* name,
printf("%f %f\n", cpu_reference[i], (T)out_gpu[i]);
}
// ensure correctness for all elements. We can set an "ignore" mask by writing NaN
if (fabs(cpu_reference[i] - (T)out_gpu[i]) > tolerance && !isnan(cpu_reference[i])) {
if (fabs(cpu_reference[i] - (T)out_gpu[i]) > tolerance && isfinite(cpu_reference[i])) {
printf("Mismatch of %s at %d: CPU_ref: %f vs GPU: %f\n", name, i, cpu_reference[i], (T)out_gpu[i]);
nfaults ++;
if (nfaults >= 10) {
Expand All @@ -169,17 +169,35 @@ void validate_result(D* device_result, const T* cpu_reference, const char* name,
// Measures the average duration of one call to `kernel(kernel_args...)`.
//
// Each of the `repeats` iterations first overwrites a scratch buffer of
// `l2CacheSize` bytes so the kernel does not benefit from data left in L2 by
// the previous iteration, and then times just the kernel call with a pair of
// CUDA events recorded on the default stream (stream `nullptr`). The memset is
// issued before the `start` event and is therefore excluded from the timing.
//
// Parameters:
//   repeats      number of timed invocations; should be > 0, because the sum
//                of timings is divided by it.
//   kernel       callable that is invoked once per iteration.
//                NOTE(review): it is presumably expected to enqueue its work
//                on the default stream, otherwise the events will not bracket
//                it -- confirm against callers.
//   kernel_args  arguments forwarded to `kernel` on every iteration.
//
// Returns the mean of the per-call values reported by cudaEventElapsedTime
// (milliseconds).
//
// Side effects: selects device 0 as the current device via cudaSetDevice.
template<class Kernel, class... KernelArgs>
float benchmark_kernel(int repeats, Kernel kernel, KernelArgs&&... kernel_args) {
    cudaEvent_t start, stop;
    // prepare buffer to scrub L2 cache between benchmarks
    // just memset a large dummy array, recommended by
    // https://stackoverflow.com/questions/31429377/how-can-i-clear-flush-the-l2-cache-and-the-tlb-of-a-gpu
    // and apparently used in nvbench.
    int deviceIdx = 0;
    cudaCheck(cudaSetDevice(deviceIdx));
    cudaDeviceProp deviceProp;
    cudaCheck(cudaGetDeviceProperties(&deviceProp, deviceIdx));
    void* flush_buffer;
    cudaCheck(cudaMalloc(&flush_buffer, deviceProp.l2CacheSize));

    cudaCheck(cudaEventCreate(&start));
    cudaCheck(cudaEventCreate(&stop));

    // Sum of the individual call timings; only the kernel calls contribute,
    // the cache-scrubbing memsets in between do not.
    float elapsed_time = 0.f;
    for (int i = 0; i < repeats; i++) {
        // clear L2
        cudaCheck(cudaMemset(flush_buffer, 0, deviceProp.l2CacheSize));
        // now we can start recording the timing of the kernel
        cudaCheck(cudaEventRecord(start, nullptr));
        kernel(std::forward<KernelArgs>(kernel_args)...);
        cudaCheck(cudaEventRecord(stop, nullptr));
        // both events must have completed before their timestamps can be read
        cudaCheck(cudaEventSynchronize(start));
        cudaCheck(cudaEventSynchronize(stop));
        float single_call;
        cudaCheck(cudaEventElapsedTime(&single_call, start, stop));
        elapsed_time += single_call;
    }

    // Release everything acquired above so repeated benchmarks do not
    // accumulate events or scratch buffers.
    cudaCheck(cudaEventDestroy(start));
    cudaCheck(cudaEventDestroy(stop));
    cudaCheck(cudaFree(flush_buffer));

    return elapsed_time / repeats;
}

0 comments on commit d7813d2

Please sign in to comment.