diff --git a/examples/pytorch/ogb/cluster-gat/main.py b/examples/pytorch/ogb/cluster-gat/main.py
index 52dbabba00b0..4c8dec642f2c 100644
--- a/examples/pytorch/ogb/cluster-gat/main.py
+++ b/examples/pytorch/ogb/cluster-gat/main.py
@@ -95,8 +95,6 @@ def inference(self, g, x, batch_size, device):
                 drop_last=False,
                 num_workers=args.num_workers)
 
-            layer.fc_src = layer.fc
-            layer.fc_dst = layer.fc
             for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
                 block = blocks[0].to(device)
                 h = x[input_nodes].to(device)
diff --git a/python/dgl/heterograph_index.py b/python/dgl/heterograph_index.py
index e0610013090e..cbf60ae349e2 100644
--- a/python/dgl/heterograph_index.py
+++ b/python/dgl/heterograph_index.py
@@ -522,7 +522,6 @@ def out_edges(self, etype, v):
         eid = F.from_dgl_nd(edge_array(2))
         return src, dst, eid
 
-    @utils.cached_member(cache='_cache', prefix='edges')
    def edges(self, etype, order=None):
         """Return all the edges
 
@@ -821,7 +820,6 @@ def edge_subgraph(self, induced_edges, preserve_nodes):
         eids = [F.to_dgl_nd(edges) for edges in induced_edges]
         return _CAPI_DGLHeteroEdgeSubgraph(self, eids, preserve_nodes)
 
-    @utils.cached_member(cache='_cache', prefix='unitgraph')
     def get_unitgraph(self, etype, ctx):
         """Create a unitgraph graph from given edge type and copy to the given device
         context.
@@ -912,7 +910,6 @@ def create_format_(self):
         """Create all sparse matrices allowed for the graph."""
         return _CAPI_DGLHeteroCreateFormat(self)
 
-    @utils.cached_member(cache='_cache', prefix='reverse')
     def reverse(self):
         """Reverse the heterogeneous graph adjacency
 
diff --git a/python/dgl/nn/pytorch/conv/gatconv.py b/python/dgl/nn/pytorch/conv/gatconv.py
index 8b56edd207f4..b593320806cc 100644
--- a/python/dgl/nn/pytorch/conv/gatconv.py
+++ b/python/dgl/nn/pytorch/conv/gatconv.py
@@ -26,13 +26,8 @@ class GATConv(nn.Module):
 
     Parameters
     ----------
-    in_feats : int, or pair of ints
+    in_feats : int
         Input feature size.
-
-        If the layer is to be applied to a unidirectional bipartite graph, ``in_feats``
-        specifies the input feature size on both the source and destination nodes. If
-        a scalar is given, the source and destination node feature size would take the
-        same value.
     out_feats : int
         Output feature size.
     num_heads : int
@@ -62,14 +57,8 @@ def __init__(self,
         self._num_heads = num_heads
         self._in_src_feats, self._in_dst_feats = expand_as_pair(in_feats)
         self._out_feats = out_feats
-        if isinstance(in_feats, tuple):
-            self.fc_src = nn.Linear(
-                self._in_src_feats, out_feats * num_heads, bias=False)
-            self.fc_dst = nn.Linear(
-                self._in_dst_feats, out_feats * num_heads, bias=False)
-        else:
-            self.fc = nn.Linear(
-                self._in_src_feats, out_feats * num_heads, bias=False)
+        self.fc = nn.Linear(
+            self._in_src_feats, out_feats * num_heads, bias=False)
         self.attn_l = nn.Parameter(th.FloatTensor(size=(1, num_heads, out_feats)))
         self.attn_r = nn.Parameter(th.FloatTensor(size=(1, num_heads, out_feats)))
         self.feat_drop = nn.Dropout(feat_drop)
@@ -89,11 +78,7 @@ def __init__(self,
     def reset_parameters(self):
         """Reinitialize learnable parameters."""
         gain = nn.init.calculate_gain('relu')
-        if hasattr(self, 'fc'):
-            nn.init.xavier_normal_(self.fc.weight, gain=gain)
-        else: # bipartite graph neural networks
-            nn.init.xavier_normal_(self.fc_src.weight, gain=gain)
-            nn.init.xavier_normal_(self.fc_dst.weight, gain=gain)
+        nn.init.xavier_normal_(self.fc.weight, gain=gain)
         nn.init.xavier_normal_(self.attn_l, gain=gain)
         nn.init.xavier_normal_(self.attn_r, gain=gain)
         if isinstance(self.res_fc, nn.Linear):
@@ -122,8 +107,8 @@ def forward(self, graph, feat):
             if isinstance(feat, tuple):
                 h_src = self.feat_drop(feat[0])
                 h_dst = self.feat_drop(feat[1])
-                feat_src = self.fc_src(h_src).view(-1, self._num_heads, self._out_feats)
-                feat_dst = self.fc_dst(h_dst).view(-1, self._num_heads, self._out_feats)
+                feat_src = self.fc(h_src).view(-1, self._num_heads, self._out_feats)
+                feat_dst = self.fc(h_dst).view(-1, self._num_heads, self._out_feats)
             else:
                 h_src = h_dst = self.feat_drop(feat)
                 feat_src = feat_dst = self.fc(h_src).view(
diff --git a/src/array/cuda/csr_sort.cu b/src/array/cuda/csr_sort.cu
index a2f4172916e2..b9aeaa72178d 100644
--- a/src/array/cuda/csr_sort.cu
+++ b/src/array/cuda/csr_sort.cu
@@ -140,6 +140,9 @@ void CSRSort_(CSRMatrix* csr) {
   csr->sorted = true;
   csr->indices = new_indices;
   csr->data = new_data;
+
+  // free resources
+  device->FreeWorkspace(ctx, workspace);
 }
 
 template void CSRSort_(CSRMatrix* csr);
diff --git a/tests/pytorch/test_nn.py b/tests/pytorch/test_nn.py
index deee8b6a3f9c..d92dc2017e19 100644
--- a/tests/pytorch/test_nn.py
+++ b/tests/pytorch/test_nn.py
@@ -497,8 +497,8 @@ def test_gat_conv(g, idtype):
 def test_gat_conv_bi(g, idtype):
     g = g.astype(idtype).to(F.ctx())
     ctx = F.ctx()
-    gat = nn.GATConv((5, 10), 2, 4)
-    feat = (F.randn((g.number_of_src_nodes(), 5)), F.randn((g.number_of_dst_nodes(), 10)))
+    gat = nn.GATConv(5, 2, 4)
+    feat = (F.randn((g.number_of_src_nodes(), 5)), F.randn((g.number_of_dst_nodes(), 5)))
     gat = gat.to(ctx)
     h = gat(g, feat)
     assert h.shape == (g.number_of_dst_nodes(), 4, 2)
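
Usage note (not part of the patch): with the bipartite-specific fc_src/fc_dst branches removed, GATConv takes a single int for in_feats and applies one shared fc projection to both ends of an edge, so on a unidirectional bipartite graph the source and destination features must now have the same input size. A minimal sketch under that assumption; the graph, node types, and sizes below are illustrative only and mirror the updated test, not code from the patch:

    import torch
    import dgl
    from dgl.nn.pytorch import GATConv

    # illustrative bipartite graph: 4 'user' source nodes, 3 'item' destination nodes
    g = dgl.heterograph({('user', 'clicks', 'item'): ([0, 1, 2, 3], [0, 1, 2, 0])})

    conv = GATConv(5, 2, num_heads=4)              # in_feats is a plain int after this change
    feat = (torch.randn(4, 5), torch.randn(3, 5))  # src and dst features share input size 5
    out = conv(g, feat)                            # out.shape == (3, 4, 2): (num_dst, heads, out_feats)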