[hotfix] Skip CUDA kernel launch when number of blocks/threads is zero. (dmlc#2144)

* upd

* upd

* upd

* upd

* lint

* upd

* upd

* fmt

Co-authored-by: Quan (Andy) Gan <[email protected]>
yzh119 and BarclayII authored Sep 10, 2020
1 parent 5f44a4e commit 2c04ecb
Showing 14 changed files with 97 additions and 53 deletions.
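
Every change below follows the same recipe: a raw kernel<<<nb, nt, shmem, stream>>>(...) launch becomes CUDA_KERNEL_CALL(kernel, nb, nt, shmem, stream, args...). The macro's definition is not part of this diff; the sketch below is only a plausible reading of what it does, inferred from the commit title. The IsZeroDim helper and the streaming CHECK assertion are assumptions, not code taken from the repository.

/*
 * Minimal sketch of the wrapper, under the assumptions stated above.
 * The SpMM/SDDMM call sites pass dim3 launch shapes while the rest
 * pass plain ints, so the zero test is overloaded for both.
 */
inline bool IsZeroDim(int d) { return d == 0; }
inline bool IsZeroDim(dim3 d) { return d.x == 0 || d.y == 0 || d.z == 0; }

#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...)      \
  {                                                                     \
    /* CUDA rejects a launch whose grid or block has a zero        */   \
    /* dimension (cudaErrorInvalidConfiguration), so empty inputs  */   \
    /* now skip the launch instead of erroring out.                */   \
    if (!IsZeroDim(nblks) && !IsZeroDim(nthrs)) {                       \
      (kernel)<<<(nblks), (nthrs), (shmem), (stream)>>>(__VA_ARGS__);   \
      cudaError_t e = cudaGetLastError();                               \
      CHECK(e == cudaSuccess)                                           \
          << "CUDA kernel launch error: " << cudaGetErrorString(e);     \
    }                                                                   \
  }

One consequence is visible throughout the diff: template kernels are wrapped in an extra pair of parentheses, e.g. (_BinaryElewiseKernel<IdType, Op>). The preprocessor splits macro arguments on top-level commas and does not treat <...> as grouping, so the comma inside the template argument list has to be shielded.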
3 changes: 2 additions & 1 deletion src/array/cuda/array_index_select.cu
@@ -36,7 +36,8 @@ NDArray IndexSelect(NDArray array, IdArray index) {
DType* ret_data = static_cast<DType*>(ret->data);
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
- _IndexSelectKernel<<<nb, nt, 0, thr_entry->stream>>>(array_data, idx_data, len, ret_data);
+ CUDA_KERNEL_CALL(_IndexSelectKernel, nb, nt, 0, thr_entry->stream,
+     array_data, idx_data, len, ret_data);
return ret;
}

25 changes: 17 additions & 8 deletions src/array/cuda/array_op_impl.cu
@@ -36,7 +36,8 @@ IdArray BinaryElewise(IdArray lhs, IdArray rhs) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
- _BinaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>),
+     nb, nt, 0, thr_entry->stream,
lhs_data, rhs_data, ret_data, len);
return ret;
}
@@ -85,7 +86,8 @@ IdArray BinaryElewise(IdArray lhs, IdType rhs) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
- _BinaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>),
+     nb, nt, 0, thr_entry->stream,
lhs_data, rhs, ret_data, len);
return ret;
}
@@ -135,7 +137,8 @@ IdArray BinaryElewise(IdType lhs, IdArray rhs) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
- _BinaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>),
+     nb, nt, 0, thr_entry->stream,
lhs, rhs_data, ret_data, len);
return ret;
}
@@ -183,7 +186,8 @@ IdArray UnaryElewise(IdArray lhs) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
- _UnaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((_UnaryElewiseKernel<IdType, Op>),
+     nb, nt, 0, thr_entry->stream,
lhs_data, ret_data, len);
return ret;
}
@@ -211,7 +215,8 @@ IdArray Full(IdType val, int64_t length, DLContext ctx) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
- _FullKernel<IdType><<<nb, nt, 0, thr_entry->stream>>>(ret_data, length, val);
+ CUDA_KERNEL_CALL((_FullKernel<IdType>), nb, nt, 0, thr_entry->stream,
+     ret_data, length, val);
return ret;
}

@@ -242,7 +247,9 @@ IdArray Range(IdType low, IdType high, DLContext ctx) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
- _RangeKernel<IdType><<<nb, nt, 0, thr_entry->stream>>>(ret_data, low, length);
+ CUDA_KERNEL_CALL((_RangeKernel<IdType>),
+     nb, nt, 0, thr_entry->stream,
+     ret_data, low, length);
return ret;
}

@@ -270,10 +277,12 @@ IdArray AsNumBits(IdArray arr, uint8_t bits) {
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
if (bits == 32) {
- _CastKernel<IdType, int32_t><<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((_CastKernel<IdType, int32_t>),
+     nb, nt, 0, thr_entry->stream,
static_cast<IdType*>(arr->data), static_cast<int32_t*>(ret->data), length);
} else {
- _CastKernel<IdType, int64_t><<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((_CastKernel<IdType, int64_t>),
+     nb, nt, 0, thr_entry->stream,
static_cast<IdType*>(arr->data), static_cast<int64_t*>(ret->data), length);
}
return ret;
3 changes: 2 additions & 1 deletion src/array/cuda/array_scatter.cu
@@ -33,7 +33,8 @@ void Scatter_(IdArray index, NDArray value, NDArray out) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
- _ScatterKernel<<<nb, nt, 0, thr_entry->stream>>>(idx, val, len, outd);
+ CUDA_KERNEL_CALL(_ScatterKernel, nb, nt, 0, thr_entry->stream,
+     idx, val, len, outd);
}

template void Scatter_<kDLGPU, int32_t, int32_t>(IdArray, NDArray, NDArray);
3 changes: 2 additions & 1 deletion src/array/cuda/coo2csr.cu
@@ -131,7 +131,8 @@ CSRMatrix COOToCSR<kDLGPU, int64_t>(COOMatrix coo) {
const int nt = cuda::FindNumThreads(coo.num_rows);
const int nb = (coo.num_rows + nt - 1) / nt;
IdArray indptr = Full(0, coo.num_rows + 1, nbits, ctx);
- _SortedSearchKernelUpperBound<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_SortedSearchKernelUpperBound,
+     nb, nt, 0, thr_entry->stream,
coo.row.Ptr<int64_t>(), nnz,
rowids.Ptr<int64_t>(), coo.num_rows,
indptr.Ptr<int64_t>() + 1);
2 changes: 1 addition & 1 deletion src/array/cuda/coo_sort.cu
@@ -155,7 +155,7 @@ std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
int8_t* col_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
const int nt = cuda::FindNumThreads(nnz);
const int nb = (nnz + nt - 1) / nt;
- _COOIsSortedKernel<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_COOIsSortedKernel, nb, nt, 0, thr_entry->stream,
coo.row.Ptr<IdType>(), coo.col.Ptr<IdType>(),
nnz, row_flags, col_flags);

3 changes: 2 additions & 1 deletion src/array/cuda/csr2coo.cu
@@ -91,7 +91,8 @@ COOMatrix CSRToCOO<kDLGPU, int64_t>(CSRMatrix csr) {

const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
- _RepeatKernel<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_RepeatKernel,
+     nb, nt, 0, thr_entry->stream,
rowids.Ptr<int64_t>(), row_nnz.Ptr<int64_t>(),
csr.indptr.Ptr<int64_t>(), ret_row.Ptr<int64_t>(),
csr.num_rows);
3 changes: 2 additions & 1 deletion src/array/cuda/csr_sort.cu
@@ -44,7 +44,8 @@ bool CSRIsSorted(CSRMatrix csr) {
int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
- _SegmentIsSorted<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_SegmentIsSorted,
+     nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
csr.num_rows, flags);
bool ret = cuda::AllTrue(flags, csr.num_rows, ctx);
14 changes: 6 additions & 8 deletions src/array/cuda/sddmm.cuh
@@ -182,14 +182,13 @@ void SDDMMCoo(
const bool use_idx = !IsNullArray(coo.data);

BCAST_IDX_CTX_SWITCH(bcast, use_idx, out->ctx, lhs_off, rhs_off, {
- SDDMMCooKernel<Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>
-     <<<nblks, nthrs, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((SDDMMCooKernel<Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>),
+     nblks, nthrs, 0, thr_entry->stream,
lhs_data, rhs_data, out_data,
row, col, edge_map,
coo.num_rows, coo.num_cols, nnz, reduce_dim,
lhs_off, rhs_off,
- lhs_len, rhs_len, len
- );
+ lhs_len, rhs_len, len);
});
}

@@ -233,14 +232,13 @@ void SDDMMCsr(
const bool use_idx = !IsNullArray(csr.data);

BCAST_IDX_CTX_SWITCH(bcast, use_idx, out->ctx, lhs_off, rhs_off, {
- SDDMMCsrKernel<Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>
-     <<<nblks, nthrs, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((SDDMMCsrKernel<Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>),
+     nblks, nthrs, 0, thr_entry->stream,
lhs_data, rhs_data, out_data,
indptr, indices, edge_map,
N, M, E, reduce_dim,
lhs_off, rhs_off,
- lhs_len, rhs_len, len
- );
+ lhs_len, rhs_len, len);
});
}

36 changes: 23 additions & 13 deletions src/array/cuda/spmat_op_impl_csr.cu
@@ -61,7 +61,8 @@ bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
IdArray out = aten::NewIdArray(1, ctx, sizeof(IdType) * 8);
const IdType* data = nullptr;
// TODO(minjie): use binary search for sorted csr
- _LinearSearchKernel<<<1, 1, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_LinearSearchKernel,
+     1, 1, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), data,
rows.Ptr<IdType>(), cols.Ptr<IdType>(),
1, 1, 1,
@@ -88,7 +89,8 @@ NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
const int nb = (rstlen + nt - 1) / nt;
const IdType* data = nullptr;
// TODO(minjie): use binary search for sorted csr
- _LinearSearchKernel<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_LinearSearchKernel,
+     nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), data,
row.Ptr<IdType>(), col.Ptr<IdType>(),
row_stride, col_stride, rstlen,
@@ -134,7 +136,8 @@ bool CSRHasDuplicate(CSRMatrix csr) {
int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
- _SegmentHasNoDuplicate<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_SegmentHasNoDuplicate,
+     nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
csr.num_rows, flags);
bool ret = cuda::AllTrue(flags, csr.num_rows, ctx);
@@ -182,7 +185,8 @@ NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
IdType* rst_data = static_cast<IdType*>(rst->data);
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
- _CSRGetRowNNZKernel<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_CSRGetRowNNZKernel,
+     nb, nt, 0, thr_entry->stream,
vid_data, indptr_data, rst_data, len);
return rst;
}
@@ -281,13 +285,15 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
const int nb = (len + nt - 1) / nt;
// Copy indices.
IdArray ret_indices = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
- _SegmentCopyKernel<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_SegmentCopyKernel,
+     nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
rows.Ptr<IdType>(), 1, len,
ret_indptr.Ptr<IdType>(), ret_indices.Ptr<IdType>());
// Copy data.
IdArray ret_data = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
- _SegmentCopyKernel<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_SegmentCopyKernel,
+     nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), CSRHasData(csr)? csr.data.Ptr<IdType>() : nullptr,
rows.Ptr<IdType>(), 1, len,
ret_indptr.Ptr<IdType>(), ret_data.Ptr<IdType>());
@@ -321,7 +327,8 @@ IdArray CSRGetData(CSRMatrix csr, NDArray row, NDArray col) {
const int nt = cuda::FindNumThreads(rstlen);
const int nb = (rstlen + nt - 1) / nt;
// TODO(minjie): use binary search for sorted csr
- _LinearSearchKernel<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_LinearSearchKernel,
+     nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
CSRHasData(csr)? csr.data.Ptr<IdType>() : nullptr,
row.Ptr<IdType>(), col.Ptr<IdType>(),
@@ -422,7 +429,8 @@ std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray row, NDArray co
IdArray mask = Full(0, nnz, nbits, ctx);
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
- _SegmentMaskKernel<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_SegmentMaskKernel,
+     nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
row.Ptr<IdType>(), col.Ptr<IdType>(),
row_stride, col_stride, len,
@@ -437,7 +445,8 @@ std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray row, NDArray co
IdArray ret_row = NewIdArray(idx->shape[0], ctx, nbits);
const int nt2 = cuda::FindNumThreads(idx->shape[0]);
const int nb2 = (idx->shape[0] + nt - 1) / nt;
- _SortedSearchKernel<<<nb2, nt2, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_SortedSearchKernel,
+     nb2, nt2, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.num_rows,
idx.Ptr<IdType>(), idx->shape[0],
ret_row.Ptr<IdType>());
@@ -512,10 +521,11 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
IdArray count = NewIdArray(csr.num_rows, ctx, nbits);
const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
- _SegmentMaskColKernel<<<nb, nt, 0, thr_entry->stream>>>(
-     csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), csr.num_rows,
-     cols.Ptr<IdType>(), cols->shape[0],
-     mask.Ptr<IdType>(), count.Ptr<IdType>());
+ CUDA_KERNEL_CALL(_SegmentMaskColKernel,
+     nb, nt, 0, thr_entry->stream,
+     csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), csr.num_rows,
+     cols.Ptr<IdType>(), cols->shape[0],
+     mask.Ptr<IdType>(), count.Ptr<IdType>());

IdArray idx = AsNumBits(NonZero(mask), nbits);
if (idx->shape[0] == 0)
2 changes: 1 addition & 1 deletion src/array/cuda/spmm.cu
@@ -21,7 +21,7 @@ void _Fill(DType* ptr, size_t length, DType val) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = FindNumThreads(length);
int nb = (length + nt - 1) / nt; // on x-axis, no need to worry about upperbound.
- cuda::_FillKernel<<<nb, nt, 0, thr_entry->stream>>>(ptr, length, val);
+ CUDA_KERNEL_CALL(cuda::_FillKernel, nb, nt, 0, thr_entry->stream, ptr, length, val);
}

} // namespace
24 changes: 11 additions & 13 deletions src/array/cuda/spmm.cuh
@@ -224,7 +224,8 @@ void SpMMCoo(
int64_t out_size = out.NumElements();
const int nt = FindNumThreads(out_size);
const int nb = (out_size + nt - 1) / nt;
- _FillKernel<<<nb, nt, 0, thr_entry->stream>>>(out_data, out_size, ReduceOp::zero);
+ CUDA_KERNEL_CALL(_FillKernel, nb, nt, 0, thr_entry->stream,
+     out_data, out_size, ReduceOp::zero);

const int ntx = FindNumThreads(len);
const int nty = CUDA_MAX_NUM_THREADS / ntx;
@@ -236,23 +237,21 @@ void SpMMCoo(
const bool use_idx = !IsNullArray(coo.data);

BCAST_IDX_CTX_SWITCH(bcast, use_idx, ufeat->ctx, ubcast_off, ebcast_off, {
- SpMMCooKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>
-     <<<nblks, nthrs, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((SpMMCooKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>),
+     nblks, nthrs, 0, thr_entry->stream,
ufeat_data, efeat_data, out_data, argu_data, arge_data,
row, col, edge_map,
N, M, E,
ubcast_off, ebcast_off,
- lhs_len, rhs_len, len
- );
+ lhs_len, rhs_len, len);
if (ReduceOp::require_arg) {
- ArgSpMMCooKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>
-     <<<nblks, nthrs, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((ArgSpMMCooKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>),
+     nblks, nthrs, 0, thr_entry->stream,
ufeat_data, efeat_data, out_data, argu_data, arge_data,
row, col, edge_map,
N, M, E,
ubcast_off, ebcast_off,
- lhs_len, rhs_len, len
- );
+ lhs_len, rhs_len, len);
}
});
}
@@ -303,14 +302,13 @@ void SpMMCsr(
const bool use_idx = !IsNullArray(csr.data);

BCAST_IDX_CTX_SWITCH(bcast, use_idx, ufeat->ctx, ubcast_off, ebcast_off, {
- SpMMCsrKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>
-     <<<nblks, nthrs, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((SpMMCsrKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>),
+     nblks, nthrs, 0, thr_entry->stream,
ufeat_data, efeat_data, out_data, argu_data, arge_data,
indptr, indices, edge_map,
csr.num_rows, csr.num_cols, efeat->shape[0],
ubcast_off, ebcast_off,
- lhs_len, rhs_len, len
- );
+ lhs_len, rhs_len, len);
});
}

7 changes: 4 additions & 3 deletions src/geometry/cuda/geometry_op_impl.cu
@@ -108,9 +108,10 @@ void FarthestPointSampler(NDArray array, int64_t batch_size, int64_t sample_poin
// sample for each cloud in the batch
IdType* start_idx_data = static_cast<IdType*>(start_idx->data);

- fps_kernel<<<batch_size, THREADS, 0, thr_entry->stream>>>(
-     array_data, batch_size, sample_points,
-     point_in_batch, dim, start_idx_data, dist_data, ret_data);
+ CUDA_KERNEL_CALL(fps_kernel,
+     batch_size, THREADS, 0, thr_entry->stream,
+     array_data, batch_size, sample_points,
+     point_in_batch, dim, start_idx_data, dist_data, ret_data);
}

template void FarthestPointSampler<kDLGPU, float, int32_t>(
2 changes: 1 addition & 1 deletion src/kernel/cuda/utils.cu
@@ -25,7 +25,7 @@ void Fill(const DLContext& ctx, DType* ptr, size_t length, DType val) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = utils::FindNumThreads(length, 1024);
int nb = (length + nt - 1) / nt;
- _FillKernel<<<nb, nt, 0, thr_entry->stream>>>(ptr, length, val);
+ CUDA_KERNEL_CALL(_FillKernel, nb, nt, 0, thr_entry->stream, ptr, length, val);
}

template void Fill<kDLGPU, float>(const DLContext& ctx, float* ptr, size_t length, float val);
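
For reference, a self-contained usage sketch of the call-site pattern shared by the hunks above. The kernel and caller are hypothetical, and the sketch assumes cuda::FindNumThreads returns a nonzero thread count, so nb is zero exactly when len is zero.

// Hypothetical kernel and caller; not part of this commit.
template <typename DType>
__global__ void _CopyKernel(const DType* in, DType* out, int64_t len) {
  // Grid-stride loop in the style of the kernels touched by this diff.
  int64_t tx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x;
  while (tx < len) {
    out[tx] = in[tx];
    tx += stride;
  }
}

template <typename DType>
void Copy(const DType* in, DType* out, int64_t len, cudaStream_t stream) {
  const int nt = cuda::FindNumThreads(len);  // threads per block
  const int nb = (len + nt - 1) / nt;        // grid size; 0 when len == 0
  // A raw <<<nb, nt>>> launch with nb == 0 fails with an
  // invalid-configuration error; the macro skips the launch instead.
  CUDA_KERNEL_CALL((_CopyKernel<DType>), nb, nt, 0, stream, in, out, len);
}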