Skip to content

Commit

Permalink
Added support for Pascal in Makefile. Changes to bench_memcpy
Browse files Browse the repository at this point in the history
  • Loading branch information
ap-hynninen committed Oct 12, 2016
1 parent cbbd73a commit 604887c
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 22 deletions.
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ CUDAC = nvcc
GENCODE_SM35 := -gencode arch=compute_35,code=sm_35
GENCODE_SM50 := -gencode arch=compute_50,code=sm_50
GENCODE_SM52 := -gencode arch=compute_52,code=sm_52
GENCODE_FLAGS := $(GENCODE_SM35) $(GENCODE_SM52)
GENCODE_SM60 := -gencode arch=compute_60,code=sm_60
GENCODE_FLAGS := $(GENCODE_SM35) $(GENCODE_SM52) $(GENCODE_SM60)

#######################################################

Expand Down
6 changes: 4 additions & 2 deletions src/CudaMemcpy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ void memcpyFloat(const int n, float* data_in, float* data_out, cudaStream_t stre
// -----------------------------------------------------------------------------------

// Explicit instances
template void scalarCopy<double>(const int n, const double* data_in, double* data_out, cudaStream_t stream);
template void vectorCopy<double>(const int n, double* data_in, double* data_out, cudaStream_t stream);
template void scalarCopy<int>(const int n, const int* data_in, int* data_out, cudaStream_t stream);
template void scalarCopy<long long int>(const int n, const long long int* data_in, long long int* data_out, cudaStream_t stream);
template void vectorCopy<int>(const int n, int* data_in, int* data_out, cudaStream_t stream);
template void vectorCopy<long long int>(const int n, long long int* data_in, long long int* data_out, cudaStream_t stream);
void memcpyFloat(const int n, float* data_in, float* data_out, cudaStream_t stream);
46 changes: 27 additions & 19 deletions src/cutt_bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ template <typename T> bool bench5(int numElem, int ratio);
bool bench6();
template <typename T> bool bench7();
template <typename T> bool bench_input(std::vector<int>& dim, std::vector<int>& permutation);
// bool bench_memcpy(int numElem);
template <typename T> bool bench_memcpy(int numElem);

bool isTrivial(std::vector<int>& permutation);
void getRandomDim(double vol, std::vector<int>& dim);
Expand Down Expand Up @@ -217,6 +217,7 @@ int main(int argc, char *argv[]) {
printf("permutation\n");
printVec(permutation);
}
goto benchOK;
} else {
goto fail;
}
Expand Down Expand Up @@ -252,6 +253,7 @@ int main(int argc, char *argv[]) {
printf("permutation\n");
printVec(permutation);
}
goto benchOK;
} else {
goto fail;
}
Expand Down Expand Up @@ -289,6 +291,7 @@ int main(int argc, char *argv[]) {
printf("permutation\n");
printVec(permutation);
}
goto benchOK;
} else {
goto fail;
}
Expand Down Expand Up @@ -324,12 +327,18 @@ int main(int argc, char *argv[]) {
printf("permutation\n");
printVec(permutation);
}
goto benchOK;
} else {
goto fail;
}
}

// if (!bench_memcpy(200*MILLION)) goto fail;
// Otherwise, do memcopy benchmark
{
bool ok = (elemsize == 4) ? bench_memcpy<int>(benchID) : bench_memcpy<long long int>(benchID);
if (ok) goto benchOK;
goto fail;
}

benchOK:
printf("bench OK\n");
Expand Down Expand Up @@ -809,60 +818,59 @@ void printVec(std::vector<int>& vec) {
printf("\n");
}

/*
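//
// Benchmarks memory copy using scalarCopy, vectorCopy, and memcpyFloat.
// Prints the bandwidth of each in GB/s. Returns true if every copy passes
// the checkTranspose verification, false otherwise.
//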
template <typename T>
bool bench_memcpy(int numElem) {

std::vector<int> dim(1, numElem);
std::vector<int> permutation(1, 0);

{
cuttTimer timer(8);
cuttTimer timer(sizeof(T));
for (int i=0;i < 4;i++) {
set_device_array<double>((double *)dataOut, -1, numElem);
set_device_array<T>((T *)dataOut, -1, numElem);
cudaCheck(cudaDeviceSynchronize());
timer.start(dim, permutation);
scalarCopy<double>(numElem, (double *)dataIn, (double *)dataOut, 0);
scalarCopy<T>(numElem, (T *)dataIn, (T *)dataOut, 0);
timer.stop();
// printf("%4.2lf GB/s\n", timer.GBs());
printf("%4.2lf GB/s\n", timer.GBs());
}
if (!tester->checkTranspose<long long int>(1, dim.data(), permutation.data(), dataOut)) return false;
if (!tester->checkTranspose<T>(1, dim.data(), permutation.data(), (T *)dataOut)) return false;
printf("scalarCopy %lf GB/s\n", timer.getAverage(1));
}

{
cuttTimer timer(8);
cuttTimer timer(sizeof(T));
for (int i=0;i < 4;i++) {
set_device_array<double>((double *)dataOut, -1, numElem);
set_device_array<T>((T *)dataOut, -1, numElem);
cudaCheck(cudaDeviceSynchronize());
timer.start(dim, permutation);
vectorCopy<double>(numElem, (double *)dataIn, (double *)dataOut, 0);
vectorCopy<T>(numElem, (T *)dataIn, (T *)dataOut, 0);
timer.stop();
// printf("%4.2lf GB/s\n", timer.GBs());
printf("%4.2lf GB/s\n", timer.GBs());
}
if (!tester->checkTranspose<long long int>(1, dim.data(), permutation.data(), dataOut)) return false;
if (!tester->checkTranspose<T>(1, dim.data(), permutation.data(), (T *)dataOut)) return false;
printf("vectorCopy %lf GB/s\n", timer.getAverage(1));
}

{
cuttTimer timer(8);
cuttTimer timer(sizeof(T));
for (int i=0;i < 4;i++) {
set_device_array<double>((double *)dataOut, -1, numElem);
set_device_array<T>((T *)dataOut, -1, numElem);
cudaCheck(cudaDeviceSynchronize());
timer.start(dim, permutation);
memcpyFloat(numElem*2, (float *)dataIn, (float *)dataOut, 0);
memcpyFloat(numElem*sizeof(T)/sizeof(float), (float *)dataIn, (float *)dataOut, 0);
timer.stop();
// printf("%4.2lf GB/s\n", timer.GBs());
printf("%4.2lf GB/s\n", timer.GBs());
}
if (!tester->checkTranspose<long long int>(1, dim.data(), permutation.data(), dataOut)) return false;
if (!tester->checkTranspose<T>(1, dim.data(), permutation.data(), (T *)dataOut)) return false;
printf("memcpyFloat %lf GB/s\n", timer.getAverage(1));
}

return true;
}
*/

void printDeviceInfo() {
int deviceID;
Expand Down

0 comments on commit 604887c

Please sign in to comment.