diff --git a/docs/source/api/python/udf.rst b/docs/source/api/python/udf.rst index 0c85db912640..c668ab6869cf 100644 --- a/docs/source/api/python/udf.rst +++ b/docs/source/api/python/udf.rst @@ -4,6 +4,16 @@ User-defined function related data structures ================================================== .. currentmodule:: dgl.udf +.. automodule:: dgl.udf + +There are two types of user-defined functions in DGL: + +* **Node UDF** of signature ``NodeBatch -> dict``. The argument represents + a batch of nodes. The returned dictionary should have ``str`` type key and ``tensor`` + type values. +* **Edge UDF** of signature ``EdgeBatch -> dict``. The argument represents + a batch of edges. The returned dictionary should have ``str`` type key and ``tensor`` + type values. EdgeBatch --------- diff --git a/python/dgl/graph.py b/python/dgl/graph.py index fd5ad35ffec0..6d4e18e2837e 100644 --- a/python/dgl/graph.py +++ b/python/dgl/graph.py @@ -81,7 +81,11 @@ class DGLGraph(object): >>> G.add_edges([2, 6, 8], 5) # three edges: 2->5, 6->5, 8->5 - or multiple edges using tensor type (demo in pytorch syntax). + or multiple edges using tensor type + + .. note:: Here we use pytorch syntax for demo. The general idea applies + to other frameworks with minor syntax change (e.g. replace + ``torch.tensor`` with ``mxnet.ndarray``). >>> import torch as th >>> G.add_edges(th.tensor([3, 4, 5]), 1) # three edges: 3->1, 4->1, 5->1 @@ -132,6 +136,39 @@ class DGLGraph(object): >>> G.edata {'y' : tensor([[2., 2., 2., 2.], [1., 1., 1., 1.]])} + + **Message Passing:** + + One common operation for updating node features is message passing, + where the source nodes send messages through edges to the destinations. + With :class:`DGLGraph`, we can do this with :func:`send` and :func:`recv`. + + In the example below, the source nodes add 1 to their node features as + the messages and send the messages to the destinations. + + >>> # Define the function for sending messages. + >>> def send_source(edges): return {'m': edges.src['x'] + 1} + >>> # Set the function defined to be the default message function. + >>> G.register_message_func(send_source) + >>> # Send messages through all edges. + >>> G.send(G.edges()) + + Just like you need to go to your mailbox for retrieving mails, the destination + nodes also need to receive the messages and potentially update their features. + + >>> # Define a function for summing messages received and replacing the original feature. + >>> def simple_reduce(nodes): return {'x': nodes.mailbox['m'].sum(1)} + >>> # Set the function defined to be the default message reduce function. + >>> G.register_reduce_func(simple_reduce) + >>> # All existing edges have node 2 as the destination. + >>> # Receive the messages for node 2 and update its feature. + >>> G.recv(v=2) + >>> G.ndata + {'x': tensor([[1., 1., 1., 1., 1.], + [0., 0., 0., 0., 0.], + [3., 3., 3., 3., 3.]])} # 3 = (1 + 1) + (0 + 1) + + For more examples about message passing, please read our tutorials. """ def __init__(self, graph_data=None, @@ -188,7 +225,12 @@ def add_nodes(self, num, data=None): >>> g.number_of_nodes() 5 - Adding new nodes with features (using PyTorch as example): + Adding new nodes with features: + + .. note:: Here we use pytorch syntax for demo. The general idea applies + to other frameworks with minor syntax change (e.g. replace + ``torch.tensor`` with ``mxnet.ndarray``). 
+ >>> import torch as th >>> g.add_nodes(2, {'x': th.ones(2, 4)}) # default zero initializer >>> g.ndata['x'] @@ -235,6 +277,11 @@ def add_edge(self, u, v, data=None): >>> G.add_edge(0, 1) Adding new edge with features + + .. note:: Here we use pytorch syntax for demo. The general idea applies + to other frameworks with minor syntax change (e.g. replace + ``torch.tensor`` with ``mxnet.ndarray``). + >>> import torch as th >>> G.add_edge(0, 2, {'x': th.ones(1, 4)}) >>> G.edges() @@ -284,6 +331,11 @@ def add_edges(self, u, v, data=None): >>> G.add_edges([0, 2], [1, 3]) # add edges (0, 1) and (2, 3) Adding new edges with features + + .. note:: Here we use pytorch syntax for demo. The general idea applies + to other frameworks with minor syntax change (e.g. replace + ``torch.tensor`` with ``mxnet.ndarray``). + >>> import torch as th >>> G.add_edges([1, 3], [2, 0], {'x': th.ones(2, 4)}) # (1, 2), (3, 0) >>> G.edata['x'] @@ -1356,56 +1408,133 @@ def pop_e_repr(self, key): def register_message_func(self, func): """Register global message function. + Once registered, ``func`` will be used as the default + message function in message passing operations, including + :func:`send`, :func:`send_and_recv`, :func:`pull`, + :func:`push`, :func:`update_all`. + Parameters ---------- func : callable - Message function on the edge. + Message function on the edge. The function should be + an :mod:`Edge UDF `. + + See Also + -------- + send + send_and_recv + pull + push + update_all """ self._message_func = func def register_reduce_func(self, func): """Register global message reduce function. + Once registered, ``func`` will be used as the default + message reduce function in message passing operations, including + :func:`recv`, :func:`send_and_recv`, :func:`push`, :func:`pull`, + :func:`update_all`. + Parameters ---------- - func : str or callable - Reduce function on incoming edges. + func : callable + Reduce function on the node. The function should be + a :mod:`Node UDF `. + + See Also + -------- + recv + send_and_recv + push + pull + update_all """ self._reduce_func = func def register_apply_node_func(self, func): """Register global node apply function. + Once registered, ``func`` will be used as the default apply + node function. Related operations include :func:`apply_nodes`, + :func:`recv`, :func:`send_and_recv`, :func:`push`, :func:`pull`, + :func:`update_all`. + Parameters ---------- func : callable - Apply function on the node. + Apply function on the nodes. The function should be + a :mod:`Node UDF `. + + See Also + -------- + apply_nodes + register_apply_edge_func """ self._apply_node_func = func def register_apply_edge_func(self, func): """Register global edge apply function. + Once registered, ``func`` will be used as the default apply + edge function in :func:`apply_edges`. + Parameters ---------- - edge_func : callable - Apply function on the edge. + func : callable + Apply function on the edge. The function should be + an :mod:`Edge UDF `. + + See Also + -------- + apply_edges + register_apply_node_func """ self._apply_edge_func = func def apply_nodes(self, func="default", v=ALL, inplace=False): - """Apply the function on the node features. + """Apply the function on the nodes to update their features. - Applying a None function will be ignored. + If None is provided for ``func``, nothing will happen. Parameters ---------- - func : callable, optional - The UDF applied on the node features. + func : callable or None, optional + Apply function on the nodes. 
The function should be + a :mod:`Node UDF `. v : int, iterable of int, tensor, optional - The node id(s). - inplace: bool, optional + The node (ids) on which to apply ``func``. The default + value is all the nodes. + inplace : bool, optional If True, update will be done in place, but autograd will break. + + Examples + -------- + + .. note:: Here we use pytorch syntax for demo. The general idea applies + to other frameworks with minor syntax change (e.g. replace + ``torch.tensor`` with ``mxnet.ndarray``). + + >>> import dgl + >>> import torch as th + + >>> g = dgl.DGLGraph() + >>> g.add_nodes(3) + >>> g.ndata['x'] = th.ones(3, 1) + + >>> # Increment the node feature by 1. + >>> def increment_feature(nodes): return {'x': nodes.data['x'] + 1} + >>> g.apply_nodes(func=increment_feature, v=[0, 2]) # Apply func to nodes 0, 2 + >>> g.ndata + {'x': tensor([[2.], + [1.], + [2.]])} + + See Also + -------- + register_apply_node_func + apply_edges """ if func == "default": func = self._apply_node_func @@ -1421,22 +1550,51 @@ def apply_nodes(self, func="default", v=ALL, inplace=False): Runtime.run(prog) def apply_edges(self, func="default", edges=ALL, inplace=False): - """Apply the function on the edge features. + """Apply the function on the edges to update their features. + + If None is provided for ``func``, nothing will happen. Parameters ---------- func : callable, optional - The UDF applied on the edge features. - edges : edges, optional - Edges can be a pair of endpoint nodes (u, v), or a - tensor of edge ids. The default value is all the edges. + Apply function on the edge. The function should be + an :mod:`Edge UDF `. + edges : valid edges type, optional + Edges on which to apply ``func``. See :func:`send` for valid + edges type. Default is all the edges. inplace: bool, optional If True, update will be done in place, but autograd will break. Notes ----- - On multigraphs, if u and v are specified, then all the edges - between u and v will be updated. + On multigraphs, if :math:`u` and :math:`v` are specified, then all the edges + between :math:`u` and :math:`v` will be updated. + + Examples + -------- + + .. note:: Here we use pytorch syntax for demo. The general idea applies + to other frameworks with minor syntax change (e.g. replace + ``torch.tensor`` with ``mxnet.ndarray``). + + >>> import dgl + >>> import torch as th + + >>> g = dgl.DGLGraph() + >>> g.add_nodes(3) + >>> g.add_edges([0, 1], [1, 2]) # 0 -> 1, 1 -> 2 + >>> g.edata['y'] = th.ones(2, 1) + + >>> # Doubles the edge feature. + >>> def double_feature(edges): return {'y': edges.data['y'] * 2} + >>> g.apply_edges(func=double_feature, edges=0) # Apply func to the first edge. + >>> g.edata + {'y': tensor([[2.], # 2 * 1 + [1.]])} + + See Also + -------- + apply_nodes """ if func == "default": func = self._apply_edge_func @@ -1464,21 +1622,41 @@ def apply_edges(self, func="default", edges=ALL, inplace=False): inplace=inplace) Runtime.run(prog) - def send(self, edges, message_func="default"): + def send(self, edges=ALL, message_func="default"): """Send messages along the given edges. + ``edges`` can be any of the following types: + + * ``int`` : Specify one edge using its edge id. + * ``pair of int`` : Specify one edge using its endpoints. + * ``int iterable`` / ``tensor`` : Specify multiple edges using their edge ids. + * ``pair of int iterable`` / ``pair of tensors`` : + Specify multiple edges using their endpoints. + + The UDF returns messages on the edges and can be later fetched in + the destination node's ``mailbox``. 
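Each message is a dictionary mapping ``str`` feature names to tensors, with one row
+ per edge in the batch (see :mod:`dgl.udf`).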
Receiving will consume the messages. + See :func:`recv` for example. + + If multiple ``send`` are triggered on the same edge without ``recv``, messages + generated by the later ``send`` will overwrite previous messages. + Parameters ---------- - edges : edges, optional - Edges can be a pair of endpoint nodes (u, v), or a - tensor of edge ids. + edges : valid edges type, optional + Edges on which to apply ``message_func``. Default is sending along all + the edges. message_func : callable - The message function. + Message function on the edges. The function should be + an :mod:`Edge UDF `. Notes ----- - On multigraphs, if u and v are specified, then the messages will be sent - along all edges between u and v. + On multigraphs, if :math:`u` and :math:`v` are specified, then the messages will be sent + along all edges between :math:`u` and :math:`v`. + + Examples + -------- + See the *message passing* example in :class:`DGLGraph` or :func:`recv`. """ if message_func == "default": message_func = self._message_func @@ -1505,26 +1683,84 @@ def send(self, edges, message_func="default"): self._msg_graph.add_edges(u, v) def recv(self, - v, + v=ALL, reduce_func="default", apply_node_func="default", inplace=False): - """Receive and reduce in-coming messages and update representation on node v. + """Receive and reduce incoming messages and update the features of node(s) :math:`v`. + + Optionally, apply a function to update the node features after receive. - TODO(minjie): document on zero-in-degree case - TODO(minjie): document on how returned new features are merged with the old features - TODO(minjie): document on how many times UDFs will be called + * ``reduce_func`` will be skipped for nodes with no incoming message. + * If all ``v`` have no incoming message, this will fall back to an :func:`apply_nodes` call. + * If some ``v`` have no incoming message, their new feature value will be calculated + by the column initializer (see :func:`set_n_initializer`). The feature shapes and + dtypes will be inferred. + + The node features will be updated by the result of the ``reduce_func``. + + Messages are consumed once received. + + The provided UDF may be called multiple times, so it is recommended to provide + a function with no side effects. Parameters ---------- - v : node, container or tensor - The node to be updated. - reduce_func : callable - The reduce function. - apply_node_func : callable, optional - The update function. + v : node, container or tensor, optional + The node to be updated. Default is receiving all the nodes. + reduce_func : callable, optional + Reduce function on the node. The function should be + a :mod:`Node UDF `. + apply_node_func : callable, optional + Apply function on the nodes. The function should be + a :mod:`Node UDF `. inplace: bool, optional - If True, update will be done in place, but autograd will break. + If True, update will be done in place, but autograd will break. + + Examples + -------- + Create a graph object for demo. + + .. note:: Here we use pytorch syntax for demo. The general idea applies + to other frameworks with minor syntax change (e.g. replace + ``torch.tensor`` with ``mxnet.ndarray``). + + >>> import dgl + >>> import torch as th + >>> g = dgl.DGLGraph() + >>> g.add_nodes(3) + >>> g.ndata['x'] = th.tensor([[1.], [2.], [3.]]) + >>> g.add_edges([0, 1], [1, 2]) + + >>> # Define the function for sending node features as messages. + >>> def send_source(edges): return {'m': edges.src['x']} + >>> # Set the function defined to be the default message function. 
+ >>> g.register_message_func(send_source) + + >>> # Sum the messages received and use this to replace the original node feature. + >>> def simple_reduce(nodes): return {'x': nodes.mailbox['m'].sum(1)} + >>> # Set the function defined to be the default message reduce function. + >>> g.register_reduce_func(simple_reduce) + + Send and receive messages. Note that although node :math:`0` has no incoming edges, + its feature gets changed from :math:`1` to :math:`0` as it is also included in + ``g.nodes()``. + + >>> g.send(g.edges()) + >>> g.recv(g.nodes()) + >>> g.ndata['x'] + tensor([[0.], + [1.], + [2.]]) + + Once messages are received, one will need to call :func:`send` again before + another call of :func:`recv`. Otherwise, nothing will happen. + + >>> g.recv(g.nodes()) + >>> g.ndata['x'] + tensor([[0.], + [1.], + [2.]]) """ if reduce_func == "default": reduce_func = self._reduce_func @@ -1536,8 +1772,7 @@ def recv(self, # no message has ever been sent return - v_is_all = is_all(v) - if v_is_all: + if is_all(v): v = F.arange(0, self.number_of_nodes()) elif isinstance(v, int): v = [v] @@ -1563,29 +1798,76 @@ def send_and_recv(self, reduce_func="default", apply_node_func="default", inplace=False): - """Send messages along edges and receive them on the targets. + """Send messages along edges and let destinations receive them. + + Optionally, apply a function to update the node features after receive. + + This is a convenient combination for performing + ``send(self, edges, message_func)`` and + ``recv(self, dst, reduce_func, apply_node_func)``, where ``dst`` + are the destinations of the ``edges``. Parameters ---------- - edges : edges - Edges can be a pair of endpoint nodes (u, v), or a - tensor of edge ids. The default value is all the edges. + edges : valid edges type + Edges on which to apply ``message_func``. See :func:`send` for valid + edges type. message_func : callable, optional - The message function. Registered function will be used if not - specified. + Message function on the edges. The function should be + an :mod:`Edge UDF `. reduce_func : callable, optional - The reduce function. Registered function will be used if not - specified. + Reduce function on the node. The function should be + a :mod:`Node UDF `. apply_node_func : callable, optional - The update function. Registered function will be used if not - specified. + Apply function on the nodes. The function should be + a :mod:`Node UDF `. inplace: bool, optional If True, update will be done in place, but autograd will break. + Examples + -------- + + .. note:: Here we use pytorch syntax for demo. The general idea applies + to other frameworks with minor syntax change (e.g. replace + ``torch.tensor`` with ``mxnet.ndarray``). + + >>> import dgl + >>> import torch as th + >>> g = dgl.DGLGraph() + >>> g.add_nodes(3) + >>> g.ndata['x'] = th.tensor([[1.], [2.], [3.]]) + >>> g.add_edges([0, 1], [1, 2]) + + >>> # Define the function for sending node features as messages. + >>> def send_source(edges): return {'m': edges.src['x']} + >>> # Set the function defined to be the default message function. + >>> g.register_message_func(send_source) + + >>> # Sum the messages received and use this to replace the original node feature. + >>> def simple_reduce(nodes): return {'x': nodes.mailbox['m'].sum(1)} + >>> # Set the function defined to be the default message reduce function. + >>> g.register_reduce_func(simple_reduce) + + Send and receive messages. 
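+ For this graph, this is the same as calling ``g.send(g.edges())`` followed by
+ ``g.recv([1, 2])``, since nodes :math:`1` and :math:`2` are the destinations of
+ the two edges.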
+ + >>> g.send_and_recv(g.edges()) + >>> g.ndata['x'] + tensor([[1.], + [1.], + [2.]]) + + Note that the feature of node :math:`0` remains the same as it has no + incoming edges. + Notes ----- On multigraphs, if u and v are specified, then the messages will be sent and received along all edges between u and v. + + See Also + -------- + send + recv """ if message_func == "default": message_func = self._message_func @@ -1626,20 +1908,81 @@ def pull(self, reduce_func="default", apply_node_func="default", inplace=False): - """Pull messages from the node's predecessors and then update it. + """Pull messages from the node(s)' predecessors and then update their features. + + Optionally, apply a function to update the node features after receive. + + * ``reduce_func`` will be skipped for nodes with no incoming message. + * If all ``v`` have no incoming message, this will fall back to an :func:`apply_nodes` call. + * If some ``v`` have no incoming message, their new feature value will be calculated + by the column initializer (see :func:`set_n_initializer`). The feature shapes and + dtypes will be inferred. Parameters ---------- - v : node, container or tensor - The node to be updated. - message_func : callable - The message function. - reduce_func : callable - The reduce function. + v : int, iterable of int, or tensor + The node(s) to be updated. + message_func : callable, optional + Message function on the edges. The function should be + an :mod:`Edge UDF `. + reduce_func : callable, optional + Reduce function on the node. The function should be + a :mod:`Node UDF `. apply_node_func : callable, optional - The update function. + Apply function on the nodes. The function should be + a :mod:`Node UDF `. inplace: bool, optional - If True, update will be done in place, but autograd will break. + If True, update will be done in place, but autograd will break. + + Examples + -------- + .. note:: Here we use pytorch syntax for demo. The general idea applies + to other frameworks with minor syntax change (e.g. replace + ``torch.tensor`` with ``mxnet.ndarray``). + + Create a graph for demo. + + >>> import dgl + >>> import torch as th + >>> g = dgl.DGLGraph() + >>> g.add_nodes(3) + >>> g.ndata['x'] = th.tensor([[0.], [1.], [2.]]) + + Use the built-in message function :func:`~dgl.function.copy_src` for copying + node features as the message. + + >>> m_func = dgl.function.copy_src('x', 'm') + >>> g.register_message_func(m_func) + + Use the built-in reduce function :func:`~dgl.function.sum`, which + sums the messages received and replaces the old node features with the result. + + >>> m_reduce_func = dgl.function.sum('m', 'x') + >>> g.register_reduce_func(m_reduce_func) + + As no edges exist, nothing happens. + + >>> g.pull(g.nodes()) + >>> g.ndata['x'] + tensor([[0.], + [1.], + [2.]]) + + Add edges ``0 -> 1, 1 -> 2``. Pull messages for node :math:`2`. + + >>> g.add_edges([0, 1], [1, 2]) + >>> g.pull(2) + >>> g.ndata['x'] + tensor([[0.], + [1.], + [1.]]) + + The feature of node :math:`2` changes but the feature of node :math:`1` + remains the same as we did not :func:`pull` (and reduce) messages for it. + + See Also + -------- + push """ if message_func == "default": message_func = self._message_func @@ -1669,20 +2012,75 @@ def push(self, reduce_func="default", apply_node_func="default", inplace=False): - """Send message from the node to its successors and update them. + """Send message from the node(s) to their successors and update them. 
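+ + Roughly speaking, ``push(u)`` computes messages on the out-edges of ``u`` and then
+ lets the destination nodes of those edges receive and reduce them.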
+ + Optionally, apply a function to update the node features after receive. Parameters ---------- - u : node, container or tensor - The node that sends out messages. - message_func : callable - The message function. - reduce_func : callable - The reduce function. - apply_node_func : callable - The update function. + u : int, iterable of int, or tensor + The node(s) to push messages out from. + message_func : callable, optional + Message function on the edges. The function should be + an :mod:`Edge UDF `. + reduce_func : callable, optional + Reduce function on the node. The function should be + a :mod:`Node UDF `. + apply_node_func : callable, optional + Apply function on the nodes. The function should be + a :mod:`Node UDF `. inplace: bool, optional - If True, update will be done in place, but autograd will break. + If True, update will be done in place, but autograd will break. + + Examples + -------- + .. note:: Here we use pytorch syntax for demo. The general idea applies + to other frameworks with minor syntax change (e.g. replace + ``torch.tensor`` with ``mxnet.ndarray``). + + Create a graph for demo. + + >>> import dgl + >>> import torch as th + >>> g = dgl.DGLGraph() + >>> g.add_nodes(3) + >>> g.ndata['x'] = th.tensor([[1.], [2.], [3.]]) + + Use the built-in message function :func:`~dgl.function.copy_src` for copying + node features as the message. + + >>> m_func = dgl.function.copy_src('x', 'm') + >>> g.register_message_func(m_func) + + Use the built-in reduce function :func:`~dgl.function.sum`, which + sums the messages received and replaces the old node features with the result. + + >>> m_reduce_func = dgl.function.sum('m', 'x') + >>> g.register_reduce_func(m_reduce_func) + + As no edges exist, nothing happens. + + >>> g.push(g.nodes()) + >>> g.ndata['x'] + tensor([[1.], + [2.], + [3.]]) + + Add edges ``0 -> 1, 1 -> 2``. Send messages from node :math:`1` and update. + + >>> g.add_edges([0, 1], [1, 2]) + >>> g.push(1) + >>> g.ndata['x'] + tensor([[1.], + [2.], + [2.]]) + + The feature of node :math:`2` changes but the feature of node :math:`1` + remains the same as we did not :func:`push` for node :math:`0`. + + See Also + -------- + pull """ if message_func == "default": message_func = self._message_func @@ -1710,16 +2108,30 @@ def update_all(self, message_func="default", reduce_func="default", apply_node_func="default"): - """Send messages through all the edges and update all nodes. + """Send messages through all edges and update all nodes. + + Optionally, apply a function to update the node features after receive. + + This is a convenient combination for performing + ``send(self, self.edges(), message_func)`` and + ``recv(self, self.nodes(), reduce_func, apply_node_func)``. Parameters ---------- - message_func : callable - The message function. - reduce_func : callable - The reduce function. + message_func : callable, optional + Message function on the edges. The function should be + an :mod:`Edge UDF `. + reduce_func : callable, optional + Reduce function on the node. The function should be + a :mod:`Node UDF `. apply_node_func : callable, optional - The update function. + Apply function on the nodes. The function should be + a :mod:`Node UDF `. + + See Also + -------- + send + recv """ if message_func == "default": message_func = self._message_func @@ -1742,7 +2154,8 @@ def prop_nodes(self, message_func="default", reduce_func="default", apply_node_func="default"): - """Propagate messages using graph traversal by triggering pull() on nodes. 
+ """Propagate messages using graph traversal by triggering + :func:`pull()` on nodes. The traversal order is specified by the ``nodes_generator``. It generates node frontiers, which is a list or a tensor of nodes. The nodes in the @@ -1751,14 +2164,64 @@ def prop_nodes(self, Parameters ---------- - node_generators : generator - The generator of node frontiers. + node_generators : iterable, each element is a list or a tensor of node ids + The generator of node frontiers. It specifies which nodes perform + :func:`pull` at each timestep. message_func : callable, optional - The message function. + Message function on the edges. The function should be + an :mod:`Edge UDF `. reduce_func : callable, optional - The reduce function. + Reduce function on the node. The function should be + a :mod:`Node UDF `. apply_node_func : callable, optional - The update function. + Apply function on the nodes. The function should be + a :mod:`Node UDF `. + + Examples + -------- + .. note:: Here we use pytorch syntax for demo. The general idea applies + to other frameworks with minor syntax change (e.g. replace + ``torch.tensor`` with ``mxnet.ndarray``). + + Create a graph for demo. + + >>> import dgl + >>> import torch as th + >>> g = dgl.DGLGraph() + >>> g.add_nodes(4) + >>> g.ndata['x'] = th.tensor([[1.], [2.], [3.], [4.]]) + >>> g.add_edges([0, 1, 1, 2], [1, 2, 3, 3]) + + Prepare message function and message reduce function for demo. + + >>> def send_source(edges): return {'m': edges.src['x']} + >>> g.register_message_func(send_source) + >>> def simple_reduce(nodes): return {'x': nodes.mailbox['m'].sum(1)} + >>> g.register_reduce_func(simple_reduce) + + First pull messages for nodes :math:`1, 2` with edges ``0 -> 1`` and + ``1 -> 2``; and then pull messages for node :math:`3` with edges + ``1 -> 3`` and ``2 -> 3``. + + >>> g.prop_nodes([[1, 2], [3]]) + >>> g.ndata['x'] + tensor([[1.], + [1.], + [2.], + [3.]]) + + In the first stage, we pull messages for nodes :math:`1, 2`. + The feature of node :math:`1` is replaced by that of node :math:`0`, i.e. 1 + The feature of node :math:`2` is replaced by that of node :math:`1`, i.e. 2. + Both of the replacement happen simultaneously. + + In the second stage, we pull messages for node :math:`3`. + The feature of node :math:`3` becomes the sum of node :math:`1`'s feature and + :math:`2`'s feature, i.e. 1 + 2 = 3. + + See Also + -------- + prop_edges """ for node_frontier in nodes_generator: self.pull(node_frontier, @@ -1769,24 +2232,79 @@ def prop_edges(self, message_func="default", reduce_func="default", apply_node_func="default"): - """Propagate messages using graph traversal by triggering send_and_recv() on edges. + """Propagate messages using graph traversal by triggering + :func:`send_and_recv()` on edges. + + The traversal order is specified by the ``edges_generator``. It generates + edge frontiers. The edge frontiers should be of *valid edges type*. + See :func:`send` for more details. - The traversal order is specified by the ``edges_generator``. It - generates edge frontiers, which is a list or a tensor of edge ids or - end points. The edges in the same frontier will be triggered together, - while edges in different frontiers will be triggered according to the - generating order. + Edges in the same frontier will be triggered together, while edges in + different frontiers will be triggered according to the generating order. Parameters ---------- edges_generator : generator The generator of edge frontiers. 
message_func : callable, optional - The message function. + Message function on the edges. The function should be + an :mod:`Edge UDF `. reduce_func : callable, optional - The reduce function. + Reduce function on the node. The function should be + a :mod:`Node UDF `. apply_node_func : callable, optional - The update function. + Apply function on the nodes. The function should be + a :mod:`Node UDF `. + + Examples + -------- + .. note:: Here we use pytorch syntax for demo. The general idea applies + to other frameworks with minor syntax change (e.g. replace + ``torch.tensor`` with ``mxnet.ndarray``). + + Create a graph for demo. + + >>> import dgl + >>> import torch as th + >>> g = dgl.DGLGraph() + >>> g.add_nodes(4) + >>> g.ndata['x'] = th.tensor([[1.], [2.], [3.], [4.]]) + >>> g.add_edges([0, 1, 1, 2], [1, 2, 3, 3]) + + Prepare message function and message reduce function for demo. + + >>> def send_source(edges): return {'m': edges.src['x']} + >>> g.register_message_func(send_source) + >>> def simple_reduce(nodes): return {'x': nodes.mailbox['m'].sum(1)} + >>> g.register_reduce_func(simple_reduce) + + First propagate messages for edges ``0 -> 1``, ``1 -> 3`` and then + propagate messages for edges ``1 -> 2``, ``2 -> 3``. + + >>> g.prop_edges([([0, 1], [1, 3]), ([1, 2], [2, 3])]) + >>> g.ndata['x'] + tensor([[1.], + [1.], + [1.], + [3.]]) + + In the first stage, the following happens simultaneously. + + - The feature of node :math:`1` is replaced by that of + node :math:`0`, i.e. 1. + - The feature of node :math:`3` is replaced by that of + node :math:`1`, i.e. 2. + + In the second stage, the following happens simultaneously. + + - The feature of node :math:`2` is replaced by that of + node :math:`1`, i.e. 1. + - The feature of node :math:`3` is replaced by that of + node :math:`2`, i.e. 3. + + See Also + -------- + prop_nodes """ for edge_frontier in edges_generator: self.send_and_recv(edge_frontier, @@ -1843,7 +2361,9 @@ def subgraphs(self, nodes): nodes in the list. Equivalent to + `` [self.subgraph(nodes_list) for nodes_list in nodes] + `` Parameters ---------- @@ -1943,27 +2463,29 @@ def incidence_matrix(self, type, ctx=F.cpu()): or not. There are three types of an incidence matrix :math:`I`: - - * "in": - - - :math:`I[v, e] = 1` if e is the in-edge of v (or v is the dst node of e); - - :math:`I[v, e] = 0` otherwise. - - * "out": - - - :math:`I[v, e] = 1` if e is the out-edge of v (or v is the src node of e); - - :math:`I[v, e] = 0` otherwise. - - * "both": - - - :math:`I[v, e] = 1` if e is the in-edge of v; - - :math:`I[v, e] = -1` if e is the out-edge of v; - - :math:`I[v, e] = 0` otherwise (including self-loop). + + * ``in``: + + - :math:`I[v, e] = 1` if :math:`e` is the in-edge of :math:`v` + (or :math:`v` is the dst node of :math:`e`); + - :math:`I[v, e] = 0` otherwise. + + * ``out``: + + - :math:`I[v, e] = 1` if :math:`e` is the out-edge of :math:`v` + (or :math:`v` is the src node of :math:`e`); + - :math:`I[v, e] = 0` otherwise. + + * ``both``: + + - :math:`I[v, e] = 1` if :math:`e` is the in-edge of :math:`v`; + - :math:`I[v, e] = -1` if :math:`e` is the out-edge of :math:`v`; + - :math:`I[v, e] = 0` otherwise (including self-loop). Parameters ---------- type : str - Can be either "in", "out" or "both" + Can be either ``in``, ``out`` or ``both`` ctx : context, optional (default=cpu) The context of returned incidence matrix. 
@@ -2000,16 +2522,45 @@ def filter_nodes(self, predicate, nodes=ALL): Parameters ---------- predicate : callable - The predicate should take in a NodeBatch object, and return a - boolean tensor with N elements indicating which node satisfy - the predicate. - nodes : container or tensor - The nodes to filter on + A function of signature ``func(nodes) -> tensor``. + ``nodes`` are :class:`NodeBatch` objects as in :mod:`~dgl.udf`. + The ``tensor`` returned should be a 1-D boolean tensor with + each element indicating whether the corresponding node in + the batch satisfies the predicate. + nodes : int, iterable or tensor of ints + The nodes to filter on. Default value is all the nodes. Returns ------- tensor - The filtered nodes + The filtered nodes. + + Examples + -------- + .. note:: Here we use pytorch syntax for demo. The general idea applies + to other frameworks with minor syntax change (e.g. replace + ``torch.tensor`` with ``mxnet.ndarray``). + + Construct a graph object for demo. + + >>> import dgl + >>> import torch as th + >>> g = dgl.DGLGraph() + >>> g.add_nodes(3) + >>> g.ndata['x'] = th.tensor([[1.], [-1.], [1.]]) + + Define a function for filtering nodes with feature :math:`1`. + + >>> def has_feature_one(nodes): return (nodes.data['x'] == 1).squeeze(1) + + Filter the nodes with feature :math:`1`. + + >>> g.filter_nodes(has_feature_one) + tensor([0, 2]) + + See Also + -------- + filter_edges """ if is_all(nodes): v = utils.toindex(slice(0, self.number_of_nodes())) @@ -2032,17 +2583,48 @@ def filter_edges(self, predicate, edges=ALL): Parameters ---------- predicate : callable - The predicate should take in an EdgeBatch object, and return a - boolean tensor with E elements indicating which edge satisfy - the predicate. - edges : edges - Edges can be a pair of endpoint nodes (u, v), or a - tensor of edge ids. The default value is all the edges. + A function of signature ``func(edges) -> tensor``. + ``edges`` are :class:`EdgeBatch` objects as in :mod:`~dgl.udf`. + The ``tensor`` returned should be a 1-D boolean tensor with + each element indicating whether the corresponding edge in + the batch satisfies the predicate. + edges : valid edges type + Edges on which to apply ``func``. See :func:`send` for valid + edges type. Default value is all the edges. Returns ------- tensor - The filtered edges + The filtered edges represented by their ids. + + Examples + -------- + + .. note:: Here we use pytorch syntax for demo. The general idea applies + to other frameworks with minor syntax change (e.g. replace + ``torch.tensor`` with ``mxnet.ndarray``). + Construct a graph object for demo. + + >>> import dgl + >>> import torch as th + >>> g = dgl.DGLGraph() + >>> g.add_nodes(3) + >>> g.ndata['x'] = th.tensor([[1.], [-1.], [1.]]) + >>> g.add_edges([0, 1, 2], [2, 2, 1]) + + Define a function for filtering edges whose destinations have + node feature :math:`1`. + + >>> def has_dst_one(edges): return (edges.dst['x'] == 1).squeeze(1) + + Filter the edges whose destination nodes have feature :math:`1`. + + >>> g.filter_edges(has_dst_one) + tensor([0, 1]) + + See Also + -------- + filter_nodes """ if is_all(edges): eid = ALL