quda-0.2: added cache initialization for sm_20 (merged from quda r937:938)

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/branches/quda-0.2@941 be54200a-260c-0410-bdd7-ce6af2a381ab
alexstrel · Jun 16, 2010 · 05b4ddd · 05b4ddd
1 parent 9e1f58b
commit 05b4ddd
Show file tree

Hide file tree

Showing 4 changed files with 25 additions and 2 deletions.
diff --git a/include/dslash_quda.h b/include/dslash_quda.h
@@ -10,6 +10,7 @@ extern "C" {
   extern unsigned long long dslash_quda_flops;
   extern unsigned long long dslash_quda_bytes;
 
+  void initCache(void);
   int dslashCudaSharedBytes(Precision spinor_prec, int blockDim);
 
   // Double precision routines

diff --git a/lib/dslash_quda.cu b/lib/dslash_quda.cu
@@ -1,3 +1,4 @@
+
 #include <stdlib.h>
 #include <stdio.h>
 
@@ -36,7 +37,27 @@ unsigned long long dslash_quda_bytes;
 
 #include <clover_def.h> // kernels for applying the clover term alone
 
-int dslashCudaSharedBytes(Precision precision) {
+// Empty kernel: takes no arguments and performs no work.
+// initCache() uses it as the target of cudaFuncSetCacheConfig and launches it once.
+__global__ void dummyKernel()
+{
+}
+
+// One-time cache setup for devices of compute capability 2.0 or higher:
+// selects cudaFuncCachePreferL1 for dummyKernel and launches that kernel once.
+// On devices below 2.0 the call does nothing. Failures are reported on stderr
+// and are not fatal. Takes no arguments and returns nothing.
+void initCache() {
+
+  // Only the first call does any work.
+  static int firsttime = 1;
+  if (!firsttime) return;
+  firsttime = 0;
+
+  // __CUDA_ARCH__ is defined only while nvcc compiles device code, so a
+  // "#if (__CUDA_ARCH__ >= 200)" guard in this host function is always false.
+  // Ask the runtime for the compute capability of the current device instead.
+  int dev = 0;
+  cudaDeviceProp prop;
+  cudaError_t err = cudaGetDevice(&dev);
+  if (err == cudaSuccess) err = cudaGetDeviceProperties(&prop, dev);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "QUDA: initCache could not query the device: %s\n", cudaGetErrorString(err));
+    return;
+  }
+  if (prop.major < 2) return;
+
+  err = cudaFuncSetCacheConfig(dummyKernel, cudaFuncCachePreferL1);
+  if (err == cudaSuccess) {
+    dummyKernel<<<1,1>>>();
+    // A kernel launch has no return value; launch errors surface here.
+    err = cudaGetLastError();
+  }
+  if (err != cudaSuccess) {
+    fprintf(stderr, "QUDA: initCache failed: %s\n", cudaGetErrorString(err));
+  }
+
+}
+
+// Returns the product BLOCK_DIM * SHARED_FLOATS_PER_THREAD * precision.
+// Presumably the numeric value of `precision` is the size in bytes of one
+// real number, making the result a shared-memory byte count -- TODO confirm.
+// NOTE(review): the prototype in include/dslash_quda.h is declared as
+// (Precision spinor_prec, int blockDim); confirm it matches this definition.
+int dslashCudaSharedBytes(QudaPrecision precision) {
   return BLOCK_DIM*SHARED_FLOATS_PER_THREAD*precision;
 }
 

diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
@@ -70,6 +70,7 @@ void initQuda(int dev)
 
   fprintf(stderr, "QUDA: Using device %d: %s\n", dev, deviceProp.name);
   cudaSetDevice(dev);
+  initCache();
 
   cudaGaugePrecise.even = NULL;
   cudaGaugePrecise.odd = NULL;

diff --git a/tests/blas_test.cu b/tests/blas_test.cu
@@ -310,7 +310,7 @@ int main(int argc, char** argv)
 
       if (threads_max == 0) errorQuda("Autotuning failed for %s kernel", names[i]);
 
-      printf("%-36s Performance maximum at %d threads per block, %d blocks per grid, Gflops/s = %f, GiB/s = %f\n", 
+      printf("%-32s: %d threads per block, %d blocks per grid, Gflops/s = %f, GiB/s = %f\n", 
 	     names[i], threads_max, blocks_max, gflops_max, gbytes_max);
 
       threads[i][prec] = threads_max;