[hotfix] Skip CUDA kernel launch when number of blocks/threads is zero. (dmlc#2144)

* upd

* upd

* upd

* upd

* lint

* upd

* upd

* fmt

Co-authored-by: Quan (Andy) Gan <[email protected]>
yzh119 and BarclayII authored Sep 10, 2020
1 parent 5f44a4e commit 2c04ecb
Showing 14 changed files with 97 additions and 53 deletions.
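
Every change below follows the same recipe: a raw kernel<<<nb, nt, shmem, stream>>>(...) launch becomes CUDA_KERNEL_CALL(kernel, nb, nt, shmem, stream, args...). The macro's definition is not part of this diff; the sketch below is only a plausible reading of what it does, inferred from the commit title. The IsZeroDim helper and the streaming CHECK assertion are assumptions, not code taken from the repository.

/*
 * Minimal sketch of the wrapper, under the assumptions stated above.
 * The SpMM/SDDMM call sites pass dim3 launch shapes while the rest
 * pass plain ints, so the zero test is overloaded for both.
 */
inline bool IsZeroDim(int d) { return d == 0; }
inline bool IsZeroDim(dim3 d) { return d.x == 0 || d.y == 0 || d.z == 0; }

#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...)      \
  {                                                                     \
    /* CUDA rejects a launch whose grid or block has a zero        */   \
    /* dimension (cudaErrorInvalidConfiguration), so empty inputs  */   \
    /* now skip the launch instead of erroring out.                */   \
    if (!IsZeroDim(nblks) && !IsZeroDim(nthrs)) {                       \
      (kernel)<<<(nblks), (nthrs), (shmem), (stream)>>>(__VA_ARGS__);   \
      cudaError_t e = cudaGetLastError();                               \
      CHECK(e == cudaSuccess)                                           \
          << "CUDA kernel launch error: " << cudaGetErrorString(e);     \
    }                                                                   \
  }

One consequence is visible throughout the diff: template kernels are wrapped in an extra pair of parentheses, e.g. (_BinaryElewiseKernel<IdType, Op>). The preprocessor splits macro arguments on top-level commas and does not treat <...> as grouping, so the comma inside the template argument list has to be shielded.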
3 changes: 2 additions & 1 deletion src/array/cuda/array_index_select.cu
@@ -36,7 +36,8 @@ NDArray IndexSelect(NDArray array, IdArray index) {
DType* ret_data = static_cast<DType*>(ret->data);
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
- _IndexSelectKernel<<<nb, nt, 0, thr_entry->stream>>>(array_data, idx_data, len, ret_data);
+ CUDA_KERNEL_CALL(_IndexSelectKernel, nb, nt, 0, thr_entry->stream,
+     array_data, idx_data, len, ret_data);
return ret;
}

25 changes: 17 additions & 8 deletions src/array/cuda/array_op_impl.cu
@@ -36,7 +36,8 @@ IdArray BinaryElewise(IdArray lhs, IdArray rhs) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
- _BinaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>),
+     nb, nt, 0, thr_entry->stream,
lhs_data, rhs_data, ret_data, len);
return ret;
}
@@ -85,7 +86,8 @@ IdArray BinaryElewise(IdArray lhs, IdType rhs) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
- _BinaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>),
+     nb, nt, 0, thr_entry->stream,
lhs_data, rhs, ret_data, len);
return ret;
}
@@ -135,7 +137,8 @@ IdArray BinaryElewise(IdType lhs, IdArray rhs) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
- _BinaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>),
+     nb, nt, 0, thr_entry->stream,
lhs, rhs_data, ret_data, len);
return ret;
}
@@ -183,7 +186,8 @@ IdArray UnaryElewise(IdArray lhs) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
- _UnaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((_UnaryElewiseKernel<IdType, Op>),
+     nb, nt, 0, thr_entry->stream,
lhs_data, ret_data, len);
return ret;
}
@@ -211,7 +215,8 @@ IdArray Full(IdType val, int64_t length, DLContext ctx) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
- _FullKernel<IdType><<<nb, nt, 0, thr_entry->stream>>>(ret_data, length, val);
+ CUDA_KERNEL_CALL((_FullKernel<IdType>), nb, nt, 0, thr_entry->stream,
+     ret_data, length, val);
return ret;
}

@@ -242,7 +247,9 @@ IdArray Range(IdType low, IdType high, DLContext ctx) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
- _RangeKernel<IdType><<<nb, nt, 0, thr_entry->stream>>>(ret_data, low, length);
+ CUDA_KERNEL_CALL((_RangeKernel<IdType>),
+     nb, nt, 0, thr_entry->stream,
+     ret_data, low, length);
return ret;
}

@@ -270,10 +277,12 @@ IdArray AsNumBits(IdArray arr, uint8_t bits) {
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
if (bits == 32) {
- _CastKernel<IdType, int32_t><<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((_CastKernel<IdType, int32_t>),
+     nb, nt, 0, thr_entry->stream,
static_cast<IdType*>(arr->data), static_cast<int32_t*>(ret->data), length);
} else {
- _CastKernel<IdType, int64_t><<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((_CastKernel<IdType, int64_t>),
+     nb, nt, 0, thr_entry->stream,
static_cast<IdType*>(arr->data), static_cast<int64_t*>(ret->data), length);
}
return ret;
3 changes: 2 additions & 1 deletion src/array/cuda/array_scatter.cu
@@ -33,7 +33,8 @@ void Scatter_(IdArray index, NDArray value, NDArray out) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
- _ScatterKernel<<<nb, nt, 0, thr_entry->stream>>>(idx, val, len, outd);
+ CUDA_KERNEL_CALL(_ScatterKernel, nb, nt, 0, thr_entry->stream,
+     idx, val, len, outd);
}

template void Scatter_<kDLGPU, int32_t, int32_t>(IdArray, NDArray, NDArray);
3 changes: 2 additions & 1 deletion src/array/cuda/coo2csr.cu
@@ -131,7 +131,8 @@ CSRMatrix COOToCSR<kDLGPU, int64_t>(COOMatrix coo) {
const int nt = cuda::FindNumThreads(coo.num_rows);
const int nb = (coo.num_rows + nt - 1) / nt;
IdArray indptr = Full(0, coo.num_rows + 1, nbits, ctx);
- _SortedSearchKernelUpperBound<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_SortedSearchKernelUpperBound,
+     nb, nt, 0, thr_entry->stream,
coo.row.Ptr<int64_t>(), nnz,
rowids.Ptr<int64_t>(), coo.num_rows,
indptr.Ptr<int64_t>() + 1);
2 changes: 1 addition & 1 deletion src/array/cuda/coo_sort.cu
@@ -155,7 +155,7 @@ std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
int8_t* col_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
const int nt = cuda::FindNumThreads(nnz);
const int nb = (nnz + nt - 1) / nt;
- _COOIsSortedKernel<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_COOIsSortedKernel, nb, nt, 0, thr_entry->stream,
coo.row.Ptr<IdType>(), coo.col.Ptr<IdType>(),
nnz, row_flags, col_flags);

3 changes: 2 additions & 1 deletion src/array/cuda/csr2coo.cu
@@ -91,7 +91,8 @@ COOMatrix CSRToCOO<kDLGPU, int64_t>(CSRMatrix csr) {

const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
- _RepeatKernel<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_RepeatKernel,
+     nb, nt, 0, thr_entry->stream,
rowids.Ptr<int64_t>(), row_nnz.Ptr<int64_t>(),
csr.indptr.Ptr<int64_t>(), ret_row.Ptr<int64_t>(),
csr.num_rows);
3 changes: 2 additions & 1 deletion src/array/cuda/csr_sort.cu
@@ -44,7 +44,8 @@ bool CSRIsSorted(CSRMatrix csr) {
int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
- _SegmentIsSorted<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_SegmentIsSorted,
+     nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
csr.num_rows, flags);
bool ret = cuda::AllTrue(flags, csr.num_rows, ctx);
14 changes: 6 additions & 8 deletions src/array/cuda/sddmm.cuh
@@ -182,14 +182,13 @@ void SDDMMCoo(
const bool use_idx = !IsNullArray(coo.data);

BCAST_IDX_CTX_SWITCH(bcast, use_idx, out->ctx, lhs_off, rhs_off, {
- SDDMMCooKernel<Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>
-     <<<nblks, nthrs, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((SDDMMCooKernel<Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>),
+     nblks, nthrs, 0, thr_entry->stream,
lhs_data, rhs_data, out_data,
row, col, edge_map,
coo.num_rows, coo.num_cols, nnz, reduce_dim,
lhs_off, rhs_off,
- lhs_len, rhs_len, len
- );
+ lhs_len, rhs_len, len);
});
}

@@ -233,14 +232,13 @@ void SDDMMCsr(
const bool use_idx = !IsNullArray(csr.data);

BCAST_IDX_CTX_SWITCH(bcast, use_idx, out->ctx, lhs_off, rhs_off, {
- SDDMMCsrKernel<Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>
-     <<<nblks, nthrs, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((SDDMMCsrKernel<Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>),
+     nblks, nthrs, 0, thr_entry->stream,
lhs_data, rhs_data, out_data,
indptr, indices, edge_map,
N, M, E, reduce_dim,
lhs_off, rhs_off,
- lhs_len, rhs_len, len
- );
+ lhs_len, rhs_len, len);
});
}

36 changes: 23 additions & 13 deletions src/array/cuda/spmat_op_impl_csr.cu
@@ -61,7 +61,8 @@ bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
IdArray out = aten::NewIdArray(1, ctx, sizeof(IdType) * 8);
const IdType* data = nullptr;
// TODO(minjie): use binary search for sorted csr
- _LinearSearchKernel<<<1, 1, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_LinearSearchKernel,
+     1, 1, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), data,
rows.Ptr<IdType>(), cols.Ptr<IdType>(),
1, 1, 1,
@@ -88,7 +89,8 @@ NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
const int nb = (rstlen + nt - 1) / nt;
const IdType* data = nullptr;
// TODO(minjie): use binary search for sorted csr
- _LinearSearchKernel<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_LinearSearchKernel,
+     nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), data,
row.Ptr<IdType>(), col.Ptr<IdType>(),
row_stride, col_stride, rstlen,
@@ -134,7 +136,8 @@ bool CSRHasDuplicate(CSRMatrix csr) {
int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
- _SegmentHasNoDuplicate<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_SegmentHasNoDuplicate,
+     nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
csr.num_rows, flags);
bool ret = cuda::AllTrue(flags, csr.num_rows, ctx);
@@ -182,7 +185,8 @@ NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
IdType* rst_data = static_cast<IdType*>(rst->data);
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
- _CSRGetRowNNZKernel<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_CSRGetRowNNZKernel,
+     nb, nt, 0, thr_entry->stream,
vid_data, indptr_data, rst_data, len);
return rst;
}
@@ -281,13 +285,15 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
const int nb = (len + nt - 1) / nt;
// Copy indices.
IdArray ret_indices = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
- _SegmentCopyKernel<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_SegmentCopyKernel,
+     nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
rows.Ptr<IdType>(), 1, len,
ret_indptr.Ptr<IdType>(), ret_indices.Ptr<IdType>());
// Copy data.
IdArray ret_data = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
- _SegmentCopyKernel<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_SegmentCopyKernel,
+     nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), CSRHasData(csr)? csr.data.Ptr<IdType>() : nullptr,
rows.Ptr<IdType>(), 1, len,
ret_indptr.Ptr<IdType>(), ret_data.Ptr<IdType>());
@@ -321,7 +327,8 @@ IdArray CSRGetData(CSRMatrix csr, NDArray row, NDArray col) {
const int nt = cuda::FindNumThreads(rstlen);
const int nb = (rstlen + nt - 1) / nt;
// TODO(minjie): use binary search for sorted csr
- _LinearSearchKernel<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_LinearSearchKernel,
+     nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
CSRHasData(csr)? csr.data.Ptr<IdType>() : nullptr,
row.Ptr<IdType>(), col.Ptr<IdType>(),
@@ -422,7 +429,8 @@ std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray row, NDArray co
IdArray mask = Full(0, nnz, nbits, ctx);
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
- _SegmentMaskKernel<<<nb, nt, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_SegmentMaskKernel,
+     nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
row.Ptr<IdType>(), col.Ptr<IdType>(),
row_stride, col_stride, len,
@@ -437,7 +445,8 @@ std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray row, NDArray co
IdArray ret_row = NewIdArray(idx->shape[0], ctx, nbits);
const int nt2 = cuda::FindNumThreads(idx->shape[0]);
const int nb2 = (idx->shape[0] + nt - 1) / nt;
- _SortedSearchKernel<<<nb2, nt2, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL(_SortedSearchKernel,
+     nb2, nt2, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.num_rows,
idx.Ptr<IdType>(), idx->shape[0],
ret_row.Ptr<IdType>());
@@ -512,10 +521,11 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
IdArray count = NewIdArray(csr.num_rows, ctx, nbits);
const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
- _SegmentMaskColKernel<<<nb, nt, 0, thr_entry->stream>>>(
-     csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), csr.num_rows,
-     cols.Ptr<IdType>(), cols->shape[0],
-     mask.Ptr<IdType>(), count.Ptr<IdType>());
+ CUDA_KERNEL_CALL(_SegmentMaskColKernel,
+     nb, nt, 0, thr_entry->stream,
+     csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), csr.num_rows,
+     cols.Ptr<IdType>(), cols->shape[0],
+     mask.Ptr<IdType>(), count.Ptr<IdType>());

IdArray idx = AsNumBits(NonZero(mask), nbits);
if (idx->shape[0] == 0)
2 changes: 1 addition & 1 deletion src/array/cuda/spmm.cu
@@ -21,7 +21,7 @@ void _Fill(DType* ptr, size_t length, DType val) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = FindNumThreads(length);
int nb = (length + nt - 1) / nt; // on x-axis, no need to worry about upperbound.
- cuda::_FillKernel<<<nb, nt, 0, thr_entry->stream>>>(ptr, length, val);
+ CUDA_KERNEL_CALL(cuda::_FillKernel, nb, nt, 0, thr_entry->stream, ptr, length, val);
}

} // namespace
24 changes: 11 additions & 13 deletions src/array/cuda/spmm.cuh
@@ -224,7 +224,8 @@ void SpMMCoo(
int64_t out_size = out.NumElements();
const int nt = FindNumThreads(out_size);
const int nb = (out_size + nt - 1) / nt;
- _FillKernel<<<nb, nt, 0, thr_entry->stream>>>(out_data, out_size, ReduceOp::zero);
+ CUDA_KERNEL_CALL(_FillKernel, nb, nt, 0, thr_entry->stream,
+     out_data, out_size, ReduceOp::zero);

const int ntx = FindNumThreads(len);
const int nty = CUDA_MAX_NUM_THREADS / ntx;
@@ -236,23 +237,21 @@ void SpMMCoo(
const bool use_idx = !IsNullArray(coo.data);

BCAST_IDX_CTX_SWITCH(bcast, use_idx, ufeat->ctx, ubcast_off, ebcast_off, {
- SpMMCooKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>
-     <<<nblks, nthrs, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((SpMMCooKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>),
+     nblks, nthrs, 0, thr_entry->stream,
ufeat_data, efeat_data, out_data, argu_data, arge_data,
row, col, edge_map,
N, M, E,
ubcast_off, ebcast_off,
- lhs_len, rhs_len, len
- );
+ lhs_len, rhs_len, len);
if (ReduceOp::require_arg) {
- ArgSpMMCooKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>
-     <<<nblks, nthrs, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((ArgSpMMCooKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>),
+     nblks, nthrs, 0, thr_entry->stream,
ufeat_data, efeat_data, out_data, argu_data, arge_data,
row, col, edge_map,
N, M, E,
ubcast_off, ebcast_off,
- lhs_len, rhs_len, len
- );
+ lhs_len, rhs_len, len);
}
});
}
@@ -303,14 +302,13 @@ void SpMMCsr(
const bool use_idx = !IsNullArray(csr.data);

BCAST_IDX_CTX_SWITCH(bcast, use_idx, ufeat->ctx, ubcast_off, ebcast_off, {
- SpMMCsrKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>
-     <<<nblks, nthrs, 0, thr_entry->stream>>>(
+ CUDA_KERNEL_CALL((SpMMCsrKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>),
+     nblks, nthrs, 0, thr_entry->stream,
ufeat_data, efeat_data, out_data, argu_data, arge_data,
indptr, indices, edge_map,
csr.num_rows, csr.num_cols, efeat->shape[0],
ubcast_off, ebcast_off,
- lhs_len, rhs_len, len
- );
+ lhs_len, rhs_len, len);
});
}

7 changes: 4 additions & 3 deletions src/geometry/cuda/geometry_op_impl.cu
@@ -108,9 +108,10 @@ void FarthestPointSampler(NDArray array, int64_t batch_size, int64_t sample_poin
// sample for each cloud in the batch
IdType* start_idx_data = static_cast<IdType*>(start_idx->data);

- fps_kernel<<<batch_size, THREADS, 0, thr_entry->stream>>>(
-     array_data, batch_size, sample_points,
-     point_in_batch, dim, start_idx_data, dist_data, ret_data);
+ CUDA_KERNEL_CALL(fps_kernel,
+     batch_size, THREADS, 0, thr_entry->stream,
+     array_data, batch_size, sample_points,
+     point_in_batch, dim, start_idx_data, dist_data, ret_data);
}

template void FarthestPointSampler<kDLGPU, float, int32_t>(
2 changes: 1 addition & 1 deletion src/kernel/cuda/utils.cu
@@ -25,7 +25,7 @@ void Fill(const DLContext& ctx, DType* ptr, size_t length, DType val) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = utils::FindNumThreads(length, 1024);
int nb = (length + nt - 1) / nt;
- _FillKernel<<<nb, nt, 0, thr_entry->stream>>>(ptr, length, val);
+ CUDA_KERNEL_CALL(_FillKernel, nb, nt, 0, thr_entry->stream, ptr, length, val);
}

template void Fill<kDLGPU, float>(const DLContext& ctx, float* ptr, size_t length, float val);
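
For reference, a self-contained usage sketch of the call-site pattern shared by the hunks above. The kernel and caller are hypothetical, and the sketch assumes cuda::FindNumThreads returns a nonzero thread count, so nb is zero exactly when len is zero.

// Hypothetical kernel and caller; not part of this commit.
template <typename DType>
__global__ void _CopyKernel(const DType* in, DType* out, int64_t len) {
  // Grid-stride loop in the style of the kernels touched by this diff.
  int64_t tx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x;
  while (tx < len) {
    out[tx] = in[tx];
    tx += stride;
  }
}

template <typename DType>
void Copy(const DType* in, DType* out, int64_t len, cudaStream_t stream) {
  const int nt = cuda::FindNumThreads(len);  // threads per block
  const int nb = (len + nt - 1) / nt;        // grid size; 0 when len == 0
  // A raw <<<nb, nt>>> launch with nb == 0 fails with an
  // invalid-configuration error; the macro skips the launch instead.
  CUDA_KERNEL_CALL((_CopyKernel<DType>), nb, nt, 0, stream, in, out, len);
}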