Skip to content

Commit

Permalink
CUDA_KERNEL_ERROR in gpu_conv
Browse files Browse the repository at this point in the history
  • Loading branch information
mblum94 authored and uecker committed Dec 9, 2024
1 parent 14596e7 commit 306ef4b
Showing 1 changed file with 16 additions and 14 deletions.
30 changes: 16 additions & 14 deletions src/num/gpu_conv.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
#define BLOCKSIZE 1024


template <int DIMS, typename T>
template <int DIMS, typename T>
struct im2col_descriptor {

T NC; // number channels
Expand All @@ -50,7 +50,7 @@ struct im2col_descriptor {
bool triv_strides_dilation; // trivial dilation and strides
};

template <int DIMS, typename T>
template <int DIMS, typename T>
static struct im2col_descriptor<DIMS, T>get_im2col_descriptor(const long odims[5], const long idims[5], const long kdims[5], const long dilation[5], const long strides[5])
{
struct im2col_descriptor<DIMS, T>config;
Expand Down Expand Up @@ -85,7 +85,7 @@ static struct im2col_descriptor<DIMS, T>get_im2col_descriptor(const long odims[5

if (!((1 < odims[i]) || (1 < kdims[i])))
continue;

assert(j < DIMS);

config.odims[j] = odims[i];
Expand Down Expand Up @@ -120,7 +120,7 @@ static struct im2col_descriptor<DIMS, T>get_im2col_descriptor(const long odims[5
}

// loop over out-dims and krn-dims and copy elements from input (copies one element per thread)
template <int DIMS, typename T, bool transp>
template <int DIMS, typename T, bool transp>
__global__ static void kern_im2col_valid(struct im2col_descriptor<DIMS, T> config, cuFloatComplex* dst, const cuFloatComplex* src)
{
int start = threadIdx.x + blockDim.x * blockIdx.x;
Expand Down Expand Up @@ -167,7 +167,7 @@ __global__ static void kern_im2col_valid(struct im2col_descriptor<DIMS, T> confi
}

// loop over in-dims and copy elements from input to all corresponding output position
template <int DIMS, typename T, bool transp>
template <int DIMS, typename T, bool transp>
__global__ static void kern_im2col_valid_no_dil_str(struct im2col_descriptor<DIMS, T> config, cuFloatComplex* dst, const cuFloatComplex* src)
{
int start = threadIdx.x + blockDim.x * blockIdx.x;
Expand Down Expand Up @@ -212,13 +212,13 @@ __global__ static void kern_im2col_valid_no_dil_str(struct im2col_descriptor<DIM
copy = copy && (idx_k[j] <= idx_i[j]) && (idx_i[j] < idx_k[j] + config.odims[j]);

index += (idx_i[j] - idx_k[j]) * o_stride + idx_k[j] * k_stride;

o_stride *= config.odims[j];
k_stride *= config.kdims[j];
}

if (copy) {

if (transp)
tmp = cuCaddf(tmp, src[index]);
else
Expand All @@ -231,7 +231,7 @@ __global__ static void kern_im2col_valid_no_dil_str(struct im2col_descriptor<DIM
}
}

template <int DIMS, typename T, bool transp>
template <int DIMS, typename T, bool transp>
static void cuda_im2col_int(_Complex float* dst, const _Complex float* src, const long odims[5], const long idims[5], const long kdims[5], const long dilation[5], const long strides[5])
{
struct im2col_descriptor<DIMS, T> config = get_im2col_descriptor<DIMS, T>(odims, idims, kdims, dilation, strides);
Expand All @@ -248,23 +248,25 @@ static void cuda_im2col_int(_Complex float* dst, const _Complex float* src, cons


if (func1) {

const void* func = (const void*)kern_im2col_valid_no_dil_str<DIMS, T, transp>;
kern_im2col_valid_no_dil_str<DIMS, T, transp><<<getGridSize(config.N_in_elements, func), getBlockSize(config.N_in_elements, func), 0, cuda_get_stream() >>>(config, (cuFloatComplex*) dst, (cuFloatComplex*) src);
CUDA_KERNEL_ERROR;
return;
}

if (func2) {

const void* func = (const void*)kern_im2col_valid<DIMS, T, transp>;
kern_im2col_valid<DIMS, T, transp><<<getGridSize(config.N_in_elements, func), getBlockSize(config.N_in_elements, func), 0, cuda_get_stream() >>>(config, (cuFloatComplex*) dst, (cuFloatComplex*) src);
CUDA_KERNEL_ERROR;
return;
}

assert(0);
}

template <bool transp>
template <bool transp>
static void cuda_im2col_int2(_Complex float* dst, const _Complex float* src, const long odims[5], const long idims[5], const long kdims[5], const long dilation[5], const long strides[5])
{
long Nout = idims[1] * md_calc_size(3, kdims + 2) * md_calc_size(3, odims + 2);
Expand All @@ -273,7 +275,7 @@ static void cuda_im2col_int2(_Complex float* dst, const _Complex float* src, con
for (int i = 0 ; i < 3; i++)
if (1 == odims[DIMS + 1] * kdims[DIMS + 1] * idims[DIMS + 1] * (NULL != dilation ? dilation[DIMS + 1] : 1) * (NULL != strides ? strides[DIMS + 1] : 1))
DIMS --;

DIMS = 3;

switch (DIMS) {
Expand All @@ -284,14 +286,14 @@ static void cuda_im2col_int2(_Complex float* dst, const _Complex float* src, con
else
cuda_im2col_int<1, uint64_t, transp>(dst, src, odims, idims, kdims, dilation, strides);
break;

case 2:
if (INT32_MAX / 2 > Nout)
cuda_im2col_int<2, uint32_t, transp>(dst, src, odims, idims, kdims, dilation, strides);
else
cuda_im2col_int<2, uint64_t, transp>(dst, src, odims, idims, kdims, dilation, strides);
break;

case 3:
if (INT32_MAX / 2 > Nout)
cuda_im2col_int<3, uint32_t, transp>(dst, src, odims, idims, kdims, dilation, strides);
Expand Down

0 comments on commit 306ef4b

Please sign in to comment.