Commit

[Doc][Dataloading] Expand documentation of AsyncTransferer (dmlc#2313)
* Update docs

* Make non-default streams non-blocking
nv-dlasalle authored Nov 2, 2020
1 parent f673fc2 commit d453d72
Showing 3 changed files with 28 additions and 3 deletions.
5 changes: 5 additions & 0 deletions docs/source/api/python/dgl.dataloading.rst
@@ -56,6 +56,11 @@ Async Copying to/from GPUs
Data can be copied from the CPU to the GPU, or from the GPU to the CPU,
while the GPU is being used for
computation, using the :class:`AsyncTransferer`.
For the transfer to be fully asynchronous, the context the
:class:`AsyncTransferer`
is created with must be a GPU context, and the input tensor must be in
pinned memory.


.. autoclass:: AsyncTransferer
:members: __init__, async_copy
23 changes: 21 additions & 2 deletions python/dgl/dataloading/async_transferer.py
@@ -38,7 +38,21 @@ def wait(self):

class AsyncTransferer(object):
""" Class for initiating asynchronous copies to/from the GPU on a second
GPU stream. """
GPU stream.
To initiate a transfer to a GPU:
>>> tensor_cpu = torch.ones(100000).pin_memory()
>>> transferer = dgl.dataloading.AsyncTransferer(torch.device(0))
>>> future = transferer.async_copy(tensor_cpu, torch.device(0))
Then, to wait for the transfer to finish and get a copy of the tensor on
the GPU:
>>> tensor_gpu = future.wait()
"""
def __init__(self, device):
""" Create a new AsyncTransferer object.
@@ -55,7 +69,12 @@ def __init__(self, device):
self._handle = _CAPI_DGLAsyncTransfererCreate(ctx)

def async_copy(self, tensor, device):
""" Initiate an asynchronous copy on the internal stream.
""" Initiate an asynchronous copy on the internal stream. For this call
to be asynchronous, the context the AsyncTransferer is created with must
be a GPU context, and the input tensor must be in pinned memory.
Currently, transfers from the GPU to the CPU, and CPU to CPU, will
be synchronous.
Parameters
----------
3 changes: 2 additions & 1 deletion src/runtime/cuda/cuda_device_api.cc
@@ -138,7 +138,8 @@ class CUDADeviceAPI final : public DeviceAPI {
DGLStreamHandle CreateStream(DGLContext ctx) {
CUDA_CALL(cudaSetDevice(ctx.device_id));
cudaStream_t retval;
CUDA_CALL(cudaStreamCreate(&retval));
// make sure the legacy default stream won't block on this stream
CUDA_CALL(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking));
return static_cast<DGLStreamHandle>(retval);
}

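For context on the stream change above: a stream created with `cudaStreamNonBlocking` does not implicitly synchronize with the legacy default stream, so default-stream work cannot serialize DGL's transfer stream. PyTorch's side streams are created non-blocking in the same way, so the overlap pattern this enables can be illustrated in Python (an illustration using PyTorch's stream API, not DGL's C++ code):

```python
import torch

if torch.cuda.is_available():
    copy_stream = torch.cuda.Stream()  # non-blocking w.r.t. the default stream
    x = torch.ones(1 << 20).pin_memory()
    with torch.cuda.stream(copy_stream):
        # Runs on copy_stream; kernels queued on the default stream do not
        # implicitly block this copy, and vice versa.
        x_gpu = x.to("cuda", non_blocking=True)
    # Explicitly order the default stream after the copy before using x_gpu.
    torch.cuda.current_stream().wait_stream(copy_stream)
```

Without the non-blocking flag, a legacy default-stream operation would act as a barrier and defeat the purpose of the second stream.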
