
Updated documentation relative to the use of node attributes and relative tests
LucaCappelletti94 committed Mar 21, 2023
1 parent 5d6b90f commit a430e85
Showing 6 changed files with 89 additions and 25 deletions.
16 changes: 10 additions & 6 deletions karateclub/graph_embedding/gl2vec.py
@@ -1,3 +1,4 @@
+from typing import Optional
 import numpy as np
 import networkx as nx
 from typing import List
@@ -14,10 +15,6 @@ class GL2Vec(Estimator):
     these features a document (graph) - feature co-occurrence matrix is decomposed in order
     to generate representations for the graphs.
-    The procedure assumes that nodes have no string feature present and the WL-hashing
-    defaults to the degree centrality. However, if a node feature with the key "feature"
-    is supported for the nodes the feature extraction happens based on the values of this key.
     Args:
         wl_iterations (int): Number of Weisfeiler-Lehman iterations. Default is 2.
         dimensions (int): Dimensionality of embedding. Default is 128.
@@ -27,6 +24,7 @@ class GL2Vec(Estimator):
         learning_rate (float): HogWild! learning rate. Default is 0.025.
         min_count (int): Minimal count of graph feature occurrences. Default is 5.
         seed (int): Random seed for the model. Default is 42.
+        erase_base_features (bool): Whether to delete the base features.
     """

     def __init__(
@@ -79,7 +77,10 @@ def fit(self, graphs: List[nx.classes.graph.Graph]):
         graphs = [self._create_line_graph(graph) for graph in graphs]
         documents = [
             WeisfeilerLehmanHashing(
-                graph, self.wl_iterations, False, self.erase_base_features
+                graph=graph,
+                wl_iterations=self.wl_iterations,
+                use_node_attribute=None,
+                erase_base_features=self.erase_base_features,
             )
             for graph in graphs
         ]
@@ -125,7 +126,10 @@ def infer(self, graphs) -> np.array:
         graphs = [self._create_line_graph(graph) for graph in graphs]
         documents = [
             WeisfeilerLehmanHashing(
-                graph, self.wl_iterations, False, self.erase_base_features
+                graph=graph,
+                wl_iterations=self.wl_iterations,
+                use_node_attribute=None,
+                erase_base_features=self.erase_base_features,
             )
             for graph in graphs
         ]
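As a quick illustration of why GL2Vec passes use_node_attribute=None here: the model embeds line graphs built from the inputs, so node attributes of the original graphs are never consulted. A minimal usage sketch, not part of this commit; the graph generator and sizes are arbitrary:

import networkx as nx
from karateclub.graph_embedding import GL2Vec

# Karate Club expects node labels to be consecutive integers starting at 0.
graphs = [nx.newman_watts_strogatz_graph(50, 5, 0.3) for _ in range(20)]

model = GL2Vec(wl_iterations=2, dimensions=128)
model.fit(graphs)
embedding = model.get_embedding()  # expected shape: (20, 128)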
22 changes: 14 additions & 8 deletions karateclub/graph_embedding/graph2vec.py
@@ -1,6 +1,6 @@
 import numpy as np
 import networkx as nx
-from typing import List
+from typing import List, Optional
 from karateclub.estimator import Estimator
 from gensim.models.doc2vec import Doc2Vec, TaggedDocument
 from karateclub.utils.treefeatures import WeisfeilerLehmanHashing
@@ -14,12 +14,12 @@ class Graph2Vec(Estimator):
     to generate representations for the graphs.
     The procedure assumes that nodes have no string feature present and the WL-hashing
-    defaults to the degree centrality. However, if a node feature with the key "feature"
-    is supported for the nodes the feature extraction happens based on the values of this key.
+    defaults to the degree centrality. However, if the parameter `use_node_attribute` is
+    provided, the feature extraction happens based on the values of this attribute.
     Args:
         wl_iterations (int): Number of Weisfeiler-Lehman iterations. Default is 2.
-        attributed (bool): Presence of graph attributes. Default is False.
+        use_node_attribute (Optional[str]): The node attribute name from which to load node features. Default is None.
         dimensions (int): Dimensionality of embedding. Default is 128.
         workers (int): Number of cores. Default is 4.
         down_sampling (float): Down sampling frequency. Default is 0.0001.
@@ -33,7 +33,7 @@ def __init__(
     def __init__(
         self,
         wl_iterations: int = 2,
-        attributed: bool = False,
+        use_node_attribute: Optional[str] = None,
         dimensions: int = 128,
         workers: int = 4,
         down_sampling: float = 0.0001,
@@ -45,7 +45,7 @@ def __init__(
     ):

         self.wl_iterations = wl_iterations
-        self.attributed = attributed
+        self.use_node_attribute = use_node_attribute
         self.dimensions = dimensions
         self.workers = workers
         self.down_sampling = down_sampling
@@ -66,7 +66,10 @@ def fit(self, graphs: List[nx.classes.graph.Graph]):
         graphs = self._check_graphs(graphs)
         documents = [
             WeisfeilerLehmanHashing(
-                graph, self.wl_iterations, self.attributed, self.erase_base_features
+                graph=graph,
+                wl_iterations=self.wl_iterations,
+                use_node_attribute=self.use_node_attribute,
+                erase_base_features=self.erase_base_features,
             )
             for graph in graphs
         ]
@@ -111,7 +114,10 @@ def infer(self, graphs) -> np.array:
         graphs = self._check_graphs(graphs)
         documents = [
             WeisfeilerLehmanHashing(
-                graph, self.wl_iterations, self.attributed, self.erase_base_features
+                graph=graph,
+                wl_iterations=self.wl_iterations,
+                use_node_attribute=self.use_node_attribute,
+                erase_base_features=self.erase_base_features,
             )
             for graph in graphs
         ]
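The new keyword replaces the old attributed flag with an explicit attribute name. A hedged sketch of the resulting API, mirroring the updated test further below; the generator and sizes are illustrative, not from this commit:

import networkx as nx
from karateclub.graph_embedding import Graph2Vec

graphs = []
for _ in range(20):
    graph = nx.newman_watts_strogatz_graph(75, 5, 0.3)
    # Every node must carry the named attribute, or fit() raises ValueError.
    nx.set_node_attributes(graph, {j: str(j) for j in range(75)}, "feature")
    graphs.append(graph)

model = Graph2Vec(use_node_attribute="feature")
model.fit(graphs)
embedding = model.get_embedding()  # expected shape: (20, model.dimensions)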
2 changes: 1 addition & 1 deletion karateclub/node_embedding/structural/role2vec.py
@@ -126,7 +126,7 @@ def fit(self, graph: nx.classes.graph.Graph):
         hasher = WeisfeilerLehmanHashing(
             graph=graph,
             wl_iterations=self.wl_iterations,
-            attributed=False,
+            use_node_attribute=None,
             erase_base_features=self.erase_base_features,
         )
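Role2Vec only renames the keyword it forwards; its structural embedding still ignores node attributes. A minimal sketch of the unchanged public behavior, with the import path assumed from the file location above:

import networkx as nx
from karateclub.node_embedding.structural import Role2Vec

graph = nx.newman_watts_strogatz_graph(100, 10, 0.2)
model = Role2Vec()
model.fit(graph)
embedding = model.get_embedding()  # expected shape: (100, model.dimensions)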
4 changes: 4 additions & 0 deletions karateclub/utils/__init__.py
@@ -1,3 +1,7 @@
 from .walker import RandomWalker
 from .diffuser import EulerianDiffuser
 from .treefeatures import WeisfeilerLehmanHashing
+
+__all__ = [
+    "RandomWalker", "EulerianDiffuser", "WeisfeilerLehmanHashing"
+]
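With the explicit __all__, the utilities are re-exported at the package level, so the hashing class used throughout this commit can be imported as:

from karateclub.utils import WeisfeilerLehmanHashing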
59 changes: 51 additions & 8 deletions karateclub/utils/treefeatures.py
@@ -1,6 +1,7 @@
 import hashlib
 import networkx as nx
-from typing import List, Dict
+from tqdm.auto import tqdm
+from typing import List, Dict, Optional


 class WeisfeilerLehmanHashing(object):
@@ -10,23 +11,23 @@ class WeisfeilerLehmanHashing(object):
     Args:
         graph (NetworkX graph): NetworkX graph for which we do WL hashing.
         wl_iterations (int): Number of WL iterations.
-        attributed (bool): Presence of attributes.
+        use_node_attribute (Optional[str]): Optional name of the node attribute to be used.
         erase_base_features (bool): Whether to delete the base features.
     """

     def __init__(
         self,
         graph: nx.classes.graph.Graph,
         wl_iterations: int,
-        attributed: bool,
+        use_node_attribute: Optional[str],
         erase_base_features: bool,
     ):
         """
         Initialization method which also executes feature extraction.
         """
         self.wl_iterations = wl_iterations
         self.graph = graph
-        self.attributed = attributed
+        self.use_node_attribute = use_node_attribute
         self.erase_base_features = erase_base_features
         self._set_features()
         self._do_recursions()
@@ -35,13 +36,54 @@ def _set_features(self):
     def _set_features(self):
         """
         Creating the features.
         """
-        if self.attributed:
-            self.features = nx.get_node_attributes(self.graph, "feature")
+        if self.use_node_attribute is not None:
+            # We retrieve the features of the nodes under the requested
+            # attribute name and collect them into a dictionary with structure:
+            # {node_a_name: feature_of_node_a}
+            # Nodes without this attribute will not appear in the dictionary.
+            features = nx.get_node_attributes(self.graph, self.use_node_attribute)
+
+            # We check whether all nodes have the requested feature.
+            if len(features) != self.graph.number_of_nodes():
+                missing_nodes = []
+                # We collect up to five missing nodes so as to build
+                # a more informative error message.
+                for node in tqdm(
+                    self.graph.nodes,
+                    total=self.graph.number_of_nodes(),
+                    leave=False,
+                    dynamic_ncols=True,
+                    desc="Searching for missing nodes",
+                ):
+                    if node not in features:
+                        missing_nodes.append(node)
+                    if len(missing_nodes) > 5:
+                        break
+                raise ValueError(
+                    (
+                        "We expected ALL graph nodes to have a node "
+                        "attribute named `{}` to be used as part of "
+                        "the requested embedding algorithm, but only {} "
+                        "out of {} nodes have the correct attribute. "
+                        "Consider checking for typos and missing values, "
+                        "and use an imputation technique as necessary. "
+                        "Some of the nodes without the requested attribute "
+                        "are: {}"
+                    ).format(
+                        self.use_node_attribute,
+                        len(features),
+                        self.graph.number_of_nodes(),
+                        missing_nodes,
+                    )
+                )
+            # If all nodes have the attribute, we assign the feature set.
+            self.features = features
         else:
             self.features = {
                 node: self.graph.degree(node) for node in self.graph.nodes()
             }
-        self.extracted_features = {k: [str(v)] for k, v in self.features.items()}
+        self.extracted_features = {k: [str(v)]
+                                   for k, v in self.features.items()}

     def _erase_base_features(self):
         """
@@ -61,7 +103,8 @@ def _do_a_recursion(self):
         for node in self.graph.nodes():
             nebs = self.graph.neighbors(node)
             degs = [self.features[neb] for neb in nebs]
-            features = [str(self.features[node])] + sorted([str(deg) for deg in degs])
+            features = [str(self.features[node])] + \
+                sorted([str(deg) for deg in degs])
             features = "_".join(features)
             hash_object = hashlib.md5(features.encode())
             hashing = hash_object.hexdigest()
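A small sketch of the new validation path (internal API; the graph and attribute values are illustrative): a single node lacking the requested attribute now fails fast with the informative ValueError above, where it would previously surface as a KeyError deeper in the WL recursions.

import networkx as nx
from karateclub.utils.treefeatures import WeisfeilerLehmanHashing

graph = nx.path_graph(4)
# Node 3 deliberately lacks the "feature" attribute.
nx.set_node_attributes(graph, {0: "a", 1: "b", 2: "c"}, "feature")

try:
    WeisfeilerLehmanHashing(
        graph=graph,
        wl_iterations=2,
        use_node_attribute="feature",
        erase_base_features=False,
    )
except ValueError as error:
    print(error)  # names the attribute and lists up to five missing nodes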
11 changes: 9 additions & 2 deletions test/graph_embedding_test.py
@@ -1,5 +1,6 @@
 import numpy as np
 import networkx as nx
+import pytest
 from karateclub.graph_embedding import Graph2Vec, FGSD, GL2Vec, SF, IGE, LDP
 from karateclub.graph_embedding import NetLSD, GeoScattering, FeatherGraph

@@ -146,7 +147,7 @@ def test_graph2vec():
         nx.set_node_attributes(graph, {j: str(j) for j in range(75)}, "feature")
         new_graphs.append(graph)

-    model = Graph2Vec(attributed=True)
+    model = Graph2Vec(use_node_attribute="feature")

     model.fit(graphs)
     embedding = model.get_embedding()
@@ -161,6 +162,13 @@ def test_graph2vec():
     assert new_embedding.shape[1] == model.dimensions
     assert type(new_embedding) == np.ndarray

+    with pytest.raises(ValueError):
+        model = Graph2Vec(
+            use_node_attribute="missing_feature",
+        )
+
+        model.fit(new_graphs)


 def test_gl2vec():
     """
@@ -197,7 +205,6 @@ def test_gl2vec():
     assert type(new_embedding) == np.ndarray


-
 def test_ldp():
     """
     Test the LDP embedding.
