[Feature] Make to_heterogeneous(to_homogeneous(hg)) return hg (dmlc#2958)

* make to_heterogeneous and to_homogeneous invertible
* docstring
* oops

Co-authored-by: Jinjing Zhou <[email protected]>
skyve2012 · Jun 3, 2021 · 2df4a95 · 2df4a95
1 parent 6042627
commit 2df4a95
Show file tree

Hide file tree

Showing 4 changed files with 76 additions and 45 deletions.
diff --git a/python/dgl/convert.py b/python/dgl/convert.py
@@ -631,8 +631,10 @@ def to_heterogeneous(G, ntypes, etypes, ntype_field=NTYPE,
 
     Notes
     -----
-    The returned node and edge types may not necessarily be in the same order as
-    ``ntypes`` and ``etypes``.
+    * The returned node and edge types may not necessarily be in the same order as
+      ``ntypes`` and ``etypes``.
+    * Calling :func:`~dgl.to_homogeneous` then calling :func:`~dgl.to_heterogeneous` again
+      yields the same result.
 
     Examples
     --------
@@ -705,7 +707,7 @@ def to_heterogeneous(G, ntypes, etypes, ntype_field=NTYPE,
     # relabel nodes to per-type local IDs
     ntype_count = np.bincount(ntype_ids, minlength=num_ntypes)
     ntype_offset = np.insert(np.cumsum(ntype_count), 0, 0)
-    ntype_ids_sortidx = np.argsort(ntype_ids)
+    ntype_ids_sortidx = np.argsort(ntype_ids, kind='stable')
     ntype_local_ids = np.zeros_like(ntype_ids)
     node_groups = []
     for i in range(num_ntypes):
@@ -848,6 +850,8 @@ def to_homogeneous(G, ndata=None, edata=None, store_type=True, return_count=Fals
       to its memory efficiency.
     * The ``ntype_count`` and ``etype_count`` lists can help speed up some operations.
       See :class:`~dgl.nn.pytorch.conv.RelGraphConv` for such an example.
+    * Calling :func:`~dgl.to_homogeneous` then calling :func:`~dgl.to_heterogeneous` again
+      yields the same result.
 
     Examples
     --------

diff --git a/tests/compute/test_heterograph.py b/tests/compute/test_heterograph.py
@@ -10,6 +10,7 @@
 from dgl import DGLError
 import test_utils
 from test_utils import parametrize_dtype, get_cases
+from utils import assert_is_identical_hetero
 from scipy.sparse import rand
 
 def create_test_heterograph(idtype):
@@ -1111,6 +1112,14 @@ def test_to_homo2(idtype):
     for i, count in enumerate(etype_count):
         assert count == hg.num_edges(hg.canonical_etypes[i])
 
+@parametrize_dtype
+def test_invertible_conversion(idtype):
+    # Test whether to_homogeneous and to_heterogeneous are invertible
+    hg = create_test_heterograph(idtype)
+    g = dgl.to_homogeneous(hg)
+    hg2 = dgl.to_heterogeneous(g, hg.ntypes, hg.etypes)
+    assert_is_identical_hetero(hg, hg2, True)
+
 @parametrize_dtype
 def test_metagraph_reachable(idtype):
     g = create_test_heterograph(idtype)

diff --git a/tests/compute/test_pickle.py b/tests/compute/test_pickle.py
@@ -11,46 +11,7 @@
 import unittest, pytest
 import test_utils
 from test_utils import parametrize_dtype, get_cases
-
-def _assert_is_identical(g, g2):
-    assert g.is_readonly == g2.is_readonly
-    assert g.number_of_nodes() == g2.number_of_nodes()
-    src, dst = g.all_edges(order='eid')
-    src2, dst2 = g2.all_edges(order='eid')
-    assert F.array_equal(src, src2)
-    assert F.array_equal(dst, dst2)
-
-    assert len(g.ndata) == len(g2.ndata)
-    assert len(g.edata) == len(g2.edata)
-    for k in g.ndata:
-        assert F.allclose(g.ndata[k], g2.ndata[k])
-    for k in g.edata:
-        assert F.allclose(g.edata[k], g2.edata[k])
-
-def _assert_is_identical_hetero(g, g2):
-    assert g.is_readonly == g2.is_readonly
-    assert g.ntypes == g2.ntypes
-    assert g.canonical_etypes == g2.canonical_etypes
-
-    # check if two metagraphs are identical
-    for edges, features in g.metagraph().edges(keys=True).items():
-        assert g2.metagraph().edges(keys=True)[edges] == features
-
-    # check if node ID spaces and feature spaces are equal
-    for ntype in g.ntypes:
-        assert g.number_of_nodes(ntype) == g2.number_of_nodes(ntype)
-        assert len(g.nodes[ntype].data) == len(g2.nodes[ntype].data)
-        for k in g.nodes[ntype].data:
-            assert F.allclose(g.nodes[ntype].data[k], g2.nodes[ntype].data[k])
-
-    # check if edge ID spaces and feature spaces are equal
-    for etype in g.canonical_etypes:
-        src, dst = g.all_edges(etype=etype, order='eid')
-        src2, dst2 = g2.all_edges(etype=etype, order='eid')
-        assert F.array_equal(src, src2)
-        assert F.array_equal(dst, dst2)
-        for k in g.edges[etype].data:
-            assert F.allclose(g.edges[etype].data[k], g2.edges[etype].data[k])
+from utils import assert_is_identical, assert_is_identical_hetero
 
 def _assert_is_identical_nodeflow(nf1, nf2):
     assert nf1.is_readonly == nf2.is_readonly
@@ -74,13 +35,13 @@ def _assert_is_identical_nodeflow(nf1, nf2):
             assert F.allclose(nf1.blocks[i].data[k], nf2.blocks[i].data[k])
 
 def _assert_is_identical_batchedgraph(bg1, bg2):
-    _assert_is_identical(bg1, bg2)
+    assert_is_identical(bg1, bg2)
     assert bg1.batch_size == bg2.batch_size
     assert bg1.batch_num_nodes == bg2.batch_num_nodes
     assert bg1.batch_num_edges == bg2.batch_num_edges
 
 def _assert_is_identical_batchedhetero(bg1, bg2):
-    _assert_is_identical_hetero(bg1, bg2)
+    assert_is_identical_hetero(bg1, bg2)
     for ntype in bg1.ntypes:
         assert bg1.batch_num_nodes(ntype) == bg2.batch_num_nodes(ntype)
     for canonical_etype in bg1.canonical_etypes:

diff --git a/tests/compute/utils.py b/tests/compute/utils.py
@@ -1,5 +1,7 @@
 import pytest
 import backend as F
+import dgl
+from dgl.base import is_internal_column
 
 if F._default_context_str == 'cpu':
     parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32, F.int64])
@@ -13,3 +15,58 @@ def check_fail(fn, *args, **kwargs):
         return False
     except:
         return True
+
+def assert_is_identical(g, g2):
+    assert g.is_readonly == g2.is_readonly
+    assert g.number_of_nodes() == g2.number_of_nodes()
+    src, dst = g.all_edges(order='eid')
+    src2, dst2 = g2.all_edges(order='eid')
+    assert F.array_equal(src, src2)
+    assert F.array_equal(dst, dst2)
+
+    assert len(g.ndata) == len(g2.ndata)
+    assert len(g.edata) == len(g2.edata)
+    for k in g.ndata:
+        assert F.allclose(g.ndata[k], g2.ndata[k])
+    for k in g.edata:
+        assert F.allclose(g.edata[k], g2.edata[k])
+
+def assert_is_identical_hetero(g, g2, ignore_internal_data=False):
+    assert g.is_readonly == g2.is_readonly
+    assert g.ntypes == g2.ntypes
+    assert g.canonical_etypes == g2.canonical_etypes
+
+    # check if two metagraphs are identical
+    for edges, features in g.metagraph().edges(keys=True).items():
+        assert g2.metagraph().edges(keys=True)[edges] == features
+
+    # check if node ID spaces and feature spaces are equal
+    for ntype in g.ntypes:
+        assert g.number_of_nodes(ntype) == g2.number_of_nodes(ntype)
+        if ignore_internal_data:
+            for k in list(g.nodes[ntype].data.keys()):
+                if is_internal_column(k):
+                    del g.nodes[ntype].data[k]
+            for k in list(g2.nodes[ntype].data.keys()):
+                if is_internal_column(k):
+                    del g2.nodes[ntype].data[k]
+        assert len(g.nodes[ntype].data) == len(g2.nodes[ntype].data)
+        for k in g.nodes[ntype].data:
+            assert F.allclose(g.nodes[ntype].data[k], g2.nodes[ntype].data[k])
+
+    # check if edge ID spaces and feature spaces are equal
+    for etype in g.canonical_etypes:
+        src, dst = g.all_edges(etype=etype, order='eid')
+        src2, dst2 = g2.all_edges(etype=etype, order='eid')
+        assert F.array_equal(src, src2)
+        assert F.array_equal(dst, dst2)
+        if ignore_internal_data:
+            for k in list(g.edges[etype].data.keys()):
+                if is_internal_column(k):
+                    del g.edges[etype].data[k]
+            for k in list(g2.edges[etype].data.keys()):
+                if is_internal_column(k):
+                    del g2.edges[etype].data[k]
+        assert len(g.edges[etype].data) == len(g2.edges[etype].data)
+        for k in g.edges[etype].data:
+            assert F.allclose(g.edges[etype].data[k], g2.edges[etype].data[k])