diff --git a/examples/pytorch/graphsage/load_graph.py b/examples/pytorch/graphsage/load_graph.py
index 742c25146809..e090420a65c4 100644
--- a/examples/pytorch/graphsage/load_graph.py
+++ b/examples/pytorch/graphsage/load_graph.py
@@ -23,7 +23,9 @@ def load_reddit():
 def load_ogb(name):
     from ogb.nodeproppred import DglNodePropPredDataset
 
+    print('load', name)
     data = DglNodePropPredDataset(name=name)
+    print('finish loading', name)
     splitted_idx = data.get_idx_split()
     graph, labels = data[0]
     labels = labels[:, 0]
@@ -31,7 +33,7 @@ def load_ogb(name):
     graph.ndata['features'] = graph.ndata['feat']
     graph.ndata['labels'] = labels
     in_feats = graph.ndata['features'].shape[1]
-    num_labels = len(th.unique(labels))
+    num_labels = len(th.unique(labels[th.logical_not(th.isnan(labels))]))
 
     # Find the node IDs in the training, validation, and test set.
     train_nid, val_nid, test_nid = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
@@ -44,7 +46,8 @@ def load_ogb(name):
     graph.ndata['train_mask'] = train_mask
     graph.ndata['val_mask'] = val_mask
     graph.ndata['test_mask'] = test_mask
-    return graph, len(th.unique(graph.ndata['labels']))
+    print('finish constructing', name)
+    return graph, num_labels
 
 def inductive_split(g):
     """Split the graph into training graph, validation graph, and test graph by training
diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py
index a660a821e6dc..a724f266586e 100644
--- a/python/dgl/distributed/partition.py
+++ b/python/dgl/distributed/partition.py
@@ -79,6 +79,7 @@
 
 import json
 import os
+import time
 import numpy as np
 
 from .. import backend as F
@@ -274,6 +275,7 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method=
 
     # Let's calculate edge assignment.
     # TODO(zhengda) we should replace int64 with int16. int16 should be sufficient.
+    start = time.time()
     if not reshuffle:
         edge_parts = np.zeros((g.number_of_edges(),), dtype=np.int64) - 1
         num_edges = 0
@@ -294,6 +296,7 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method=
             ledges_list.append(local_edges)
         assert num_edges == g.number_of_edges()
         assert num_nodes == g.number_of_nodes()
+    print('Calculate edge assignment: {:.3f} seconds'.format(time.time() - start))
 
     os.makedirs(out_path, mode=0o775, exist_ok=True)
     tot_num_inner_edges = 0
@@ -317,6 +320,7 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method=
         node_map_val = [g.number_of_nodes()]
         edge_map_val = [g.number_of_edges()]
 
+    start = time.time()
     part_metadata = {'graph_name': graph_name,
                      'num_nodes': g.number_of_nodes(),
                      'num_edges': g.number_of_edges(),
@@ -363,6 +367,7 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method=
 
     with open('{}/{}.json'.format(out_path, graph_name), 'w') as outfile:
         json.dump(part_metadata, outfile, sort_keys=True, indent=4)
+    print('Save partitions: {:.3f} seconds'.format(time.time() - start))
 
     num_cuts = g.number_of_edges() - tot_num_inner_edges
     if num_parts == 1:
diff --git a/python/dgl/transform.py b/python/dgl/transform.py
index 5d9fda8a73f8..976c06bbbe36 100644
--- a/python/dgl/transform.py
+++ b/python/dgl/transform.py
@@ -2,8 +2,10 @@
 
 from collections.abc import Iterable, Mapping
 from collections import defaultdict
+import time
 import numpy as np
 from scipy import sparse
+
 from ._ffi.function import _init_api
 from .graph import DGLGraph
 from .heterograph import DGLHeteroGraph
@@ -949,6 +951,7 @@ def partition_graph_with_halo(g, node_part, extra_cached_hops, reshuffle=False):
     assert len(node_part) == g.number_of_nodes()
     node_part = utils.toindex(node_part)
     if reshuffle:
+        start = time.time()
         node_part = node_part.tousertensor()
         sorted_part, new2old_map = F.sort_1d(node_part)
         new_node_ids = np.zeros((g.number_of_nodes(),), dtype=np.int64)
@@ -960,10 +963,14 @@ def partition_graph_with_halo(g, node_part, extra_cached_hops, reshuffle=False):
         orig_eids = _CAPI_DGLReassignEdges(g._graph, True)
         orig_eids = utils.toindex(orig_eids)
         g.edata['orig_id'] = orig_eids.tousertensor()
+        print('Reshuffle nodes and edges: {:.3f} seconds'.format(time.time() - start))
 
+    start = time.time()
     subgs = _CAPI_DGLPartitionWithHalo(g._graph, node_part.todgltensor(), extra_cached_hops)
+    print('Split the graph: {:.3f} seconds'.format(time.time() - start))
     subg_dict = {}
     node_part = node_part.tousertensor()
+    start = time.time()
     for i, subg in enumerate(subgs):
         inner_node = _get_halo_subgraph_inner_node(subg)
         subg = g._create_subgraph(subg, subg.induced_nodes, subg.induced_edges)
@@ -986,6 +993,7 @@ def partition_graph_with_halo(g, node_part, extra_cached_hops, reshuffle=False):
             inner_edge = F.ones((subg.number_of_edges(),), F.int64, F.cpu())
         subg.edata['inner_edge'] = inner_edge
         subg_dict[i] = subg
+    print('Construct subgraphs: {:.3f} seconds'.format(time.time() - start))
     return subg_dict
 
 def metis_partition_assignment(g, k, balance_ntypes=None, balance_edges=False):
@@ -1021,7 +1029,9 @@ def metis_partition_assignment(g, k, balance_ntypes=None, balance_edges=False):
     '''
     # METIS works only on symmetric graphs.
     # The METIS runs on the symmetric graph to generate the node assignment to partitions.
+    start = time.time()
     sym_g = to_bidirected_stale(g, readonly=True)
+    print('Convert a graph into a bidirected graph: {:.3f} seconds'.format(time.time() - start))
     vwgt = []
     # To balance the node types in each partition, we can take advantage of the vertex weights
     # in Metis. When vertex weights are provided, Metis will tries to generate partitions with
@@ -1033,6 +1043,7 @@ def metis_partition_assignment(g, k, balance_ntypes=None, balance_edges=False):
     # if a node belongs to the first node type, its weight is set to 1; otherwise, 0.
     # Similary, we set the second weight for the second node type and so on. The number
     # of weights is the same as the number of node types.
+    start = time.time()
     if balance_ntypes is not None:
         assert len(balance_ntypes) == g.number_of_nodes(), \
             "The length of balance_ntypes should be equal to #nodes in the graph"
@@ -1051,11 +1062,14 @@ def metis_partition_assignment(g, k, balance_ntypes=None, balance_edges=False):
         shape = (np.prod(F.shape(vwgt),),)
         vwgt = F.reshape(vwgt, shape)
         vwgt = F.zerocopy_to_dgl_ndarray(vwgt)
+        print('Construct multi-constraint weights: {:.3f} seconds'.format(time.time() - start))
     else:
         vwgt = F.zeros((0,), F.int64, F.cpu())
         vwgt = F.zerocopy_to_dgl_ndarray(vwgt)
 
+    start = time.time()
     node_part = _CAPI_DGLMetisPartition(sym_g._graph, k, vwgt)
+    print('Metis partitioning: {:.3f} seconds'.format(time.time() - start))
     if len(node_part) == 0:
         return None
     else:
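A quick illustration of why the `num_labels` change in `load_ogb` matters: some OGB node-property datasets store labels as floats and mark unlabeled nodes with NaN, so counting unique raw label values over-counts the classes. The snippet below is a minimal, self-contained sketch with a toy label tensor; it is not part of the patch, just the same expression applied to made-up data.

```python
import torch as th

# Toy label tensor: two real classes plus NaN entries for unlabeled nodes.
labels = th.tensor([0., 1., float('nan'), 1., float('nan')])

# Counting unique raw values lets the NaN entries leak into the count
# (NaN != NaN, so they are not collapsed into a single value).
naive = len(th.unique(labels))

# Dropping NaN first, as the patched load_ogb does, counts only real classes.
fixed = len(th.unique(labels[th.logical_not(th.isnan(labels))]))

print(naive, fixed)  # naive > 2 because of the NaN entries; fixed == 2
```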
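For context on the multi-constraint weights that `metis_partition_assignment` builds when `balance_ntypes` is given: each node gets one 0/1 weight per node type, as the in-code comments describe, so METIS tries to balance every node type across partitions. Below is a small sketch of that one-hot construction using a hypothetical `balance_ntypes` tensor; it only illustrates the idea and is not the DGL code path, which goes through the backend `F` API.

```python
import torch as th

# Hypothetical node-type assignment for 6 nodes and 2 node types.
balance_ntypes = th.tensor([0, 0, 1, 1, 1, 0])

# One 0/1 weight column per node type: column j is 1 exactly where the node
# has type j, giving METIS one balance constraint per type.
ntypes = th.unique(balance_ntypes)
vwgt = th.stack([(balance_ntypes == t).long() for t in ntypes], dim=1)

print(vwgt.tolist())
# [[1, 0], [1, 0], [0, 1], [0, 1], [0, 1], [1, 0]]
```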