Skip to content

Commit

Permalink
quda-0.2: added cache initialization for sm_20 (merged from quda r937…
Browse files Browse the repository at this point in the history
…:938)

git-svn-id: http://lattice.bu.edu/qcdalg/cuda/branches/quda-0.2@941 be54200a-260c-0410-bdd7-ce6af2a381ab
  • Loading branch information
rbabich committed Jun 16, 2010
1 parent 9e1f58b commit 05b4ddd
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 2 deletions.
1 change: 1 addition & 0 deletions include/dslash_quda.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ extern "C" {
// Global counters shared with the library; judging by the names they tally
// floating-point operations and bytes moved -- TODO confirm against the code
// that updates them. dslash_quda_bytes is defined in lib/dslash_quda.cu.
extern unsigned long long dslash_quda_flops;
extern unsigned long long dslash_quda_bytes;

// One-time cache setup. Defined in lib/dslash_quda.cu and called from
// initQuda() right after cudaSetDevice().
void initCache(void);
// NOTE(review): the definition visible in lib/dslash_quda.cu takes a single
// precision argument, while this prototype takes (spinor_prec, blockDim) --
// confirm which signature is intended for this branch.
int dslashCudaSharedBytes(Precision spinor_prec, int blockDim);

// Double precision routines
Expand Down
23 changes: 22 additions & 1 deletion lib/dslash_quda.cu
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

#include <stdlib.h>
#include <stdio.h>

Expand Down Expand Up @@ -36,7 +37,27 @@ unsigned long long dslash_quda_bytes;

#include <clover_def.h> // kernels for applying the clover term alone

int dslashCudaSharedBytes(Precision precision) {
// Empty kernel: it takes no arguments and performs no work on the device.
// initCache() attaches a cache preference to it and launches it once with a
// single thread.
__global__ void dummyKernel() { }

/*
 * One-time cache setup for devices of compute capability 2.0 or higher.
 *
 * On the first call only, this queries the current device and, when its major
 * compute capability is at least 2, requests cudaFuncCachePreferL1 for
 * dummyKernel and launches that kernel once. Later calls return immediately.
 *
 * The capability test is done at run time: __CUDA_ARCH__ is defined only
 * while nvcc compiles device code, so a "#if (__CUDA_ARCH__ >= 200)" guard
 * in this host function is always false and would compile the body away.
 *
 * Failures are reported on stderr and otherwise ignored; the cache
 * configuration is only a performance preference, so the library can proceed
 * without it.
 *
 * NOTE(review): the preference is attached to dummyKernel only; presumably
 * the launch is meant to leave the device in the L1-preferred configuration
 * for later kernels that state no preference -- confirm.
 */
void initCache() {

// cudaFuncSetCacheConfig is not available in toolkits older than CUDA 3.0.
#if (CUDART_VERSION >= 3000)

static int firsttime = 1;
if (!firsttime) return;
firsttime = 0;

int dev = 0;
cudaDeviceProp deviceProp;

cudaError_t err = cudaGetDevice(&dev);
if (err == cudaSuccess) err = cudaGetDeviceProperties(&deviceProp, dev);
if (err != cudaSuccess) {
fprintf(stderr, "QUDA: initCache: unable to query device: %s\n", cudaGetErrorString(err));
return;
}

// Same threshold as the original guard: compute capability 2.0 and up.
if (deviceProp.major < 2) return;

err = cudaFuncSetCacheConfig(dummyKernel, cudaFuncCachePreferL1);
if (err == cudaSuccess) {
dummyKernel<<<1,1>>>();
err = cudaGetLastError(); // launch errors are not returned by the launch itself
}
if (err != cudaSuccess) {
fprintf(stderr, "QUDA: initCache: unable to set cache configuration: %s\n", cudaGetErrorString(err));
}

#endif

}

// Returns the product BLOCK_DIM * SHARED_FLOATS_PER_THREAD * precision as an
// int. Presumably the numeric value of `precision` is the element size in
// bytes, which would make the result a per-block shared-memory size in bytes
// -- TODO confirm against the enum definition.
// NOTE(review): include/dslash_quda.h declares this function as
// (Precision spinor_prec, int blockDim); confirm which signature is intended.
int dslashCudaSharedBytes(QudaPrecision precision) {
const int shared_bytes = BLOCK_DIM*SHARED_FLOATS_PER_THREAD*precision;
return shared_bytes;
}

Expand Down
1 change: 1 addition & 0 deletions lib/interface_quda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ void initQuda(int dev)

fprintf(stderr, "QUDA: Using device %d: %s\n", dev, deviceProp.name);
cudaSetDevice(dev);
initCache();

cudaGaugePrecise.even = NULL;
cudaGaugePrecise.odd = NULL;
Expand Down
2 changes: 1 addition & 1 deletion tests/blas_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ int main(int argc, char** argv)

if (threads_max == 0) errorQuda("Autotuning failed for %s kernel", names[i]);

printf("%-36s Performance maximum at %d threads per block, %d blocks per grid, Gflops/s = %f, GiB/s = %f\n",
printf("%-32s: %d threads per block, %d blocks per grid, Gflops/s = %f, GiB/s = %f\n",
names[i], threads_max, blocks_max, gflops_max, gbytes_max);

threads[i][prec] = threads_max;
Expand Down

0 comments on commit 05b4ddd

Please sign in to comment.