diff --git a/karateclub/graph_embedding/gl2vec.py b/karateclub/graph_embedding/gl2vec.py
index ec5f9038..1ede4e6c 100644
--- a/karateclub/graph_embedding/gl2vec.py
+++ b/karateclub/graph_embedding/gl2vec.py
@@ -1,3 +1,4 @@
+from typing import Optional
 import numpy as np
 import networkx as nx
 from typing import List
@@ -14,10 +15,6 @@ class GL2Vec(Estimator):
     these features a document (graph) - feature co-occurrence matrix is decomposed in order
     to generate representations for the graphs.
 
-    The procedure assumes that nodes have no string feature present and the WL-hashing
-    defaults to the degree centrality. However, if a node feature with the key "feature"
-    is supported for the nodes the feature extraction happens based on the values of this key.
-
     Args:
         wl_iterations (int): Number of Weisfeiler-Lehman iterations. Default is 2.
         dimensions (int): Dimensionality of embedding. Default is 128.
@@ -27,6 +24,7 @@ class GL2Vec(Estimator):
         learning_rate (float): HogWild! learning rate. Default is 0.025.
         min_count (int): Minimal count of graph feature occurrences. Default is 5.
         seed (int): Random seed for the model. Default is 42.
+        erase_base_features (bool): Whether to delete the base features.
     """
 
     def __init__(
@@ -79,7 +77,10 @@ def fit(self, graphs: List[nx.classes.graph.Graph]):
         graphs = [self._create_line_graph(graph) for graph in graphs]
         documents = [
             WeisfeilerLehmanHashing(
-                graph, self.wl_iterations, False, self.erase_base_features
+                graph=graph,
+                wl_iterations=self.wl_iterations,
+                use_node_attribute=None,
+                erase_base_features=self.erase_base_features,
             )
             for graph in graphs
         ]
@@ -125,7 +126,10 @@ def infer(self, graphs) -> np.array:
         graphs = [self._create_line_graph(graph) for graph in graphs]
         documents = [
             WeisfeilerLehmanHashing(
-                graph, self.wl_iterations, False, self.erase_base_features
+                graph=graph,
+                wl_iterations=self.wl_iterations,
+                use_node_attribute=None,
+                erase_base_features=self.erase_base_features,
             )
             for graph in graphs
         ]
diff --git a/karateclub/graph_embedding/graph2vec.py b/karateclub/graph_embedding/graph2vec.py
index cbb147bf..c95ad4d2 100644
--- a/karateclub/graph_embedding/graph2vec.py
+++ b/karateclub/graph_embedding/graph2vec.py
@@ -1,6 +1,6 @@
 import numpy as np
 import networkx as nx
-from typing import List
+from typing import List, Optional
 from karateclub.estimator import Estimator
 from gensim.models.doc2vec import Doc2Vec, TaggedDocument
 from karateclub.utils.treefeatures import WeisfeilerLehmanHashing
@@ -14,12 +14,12 @@ class Graph2Vec(Estimator):
     to generate representations for the graphs.
 
     The procedure assumes that nodes have no string feature present and the WL-hashing
-    defaults to the degree centrality. However, if a node feature with the key "feature"
-    is supported for the nodes the feature extraction happens based on the values of this key.
+    defaults to the degree centrality. However, if the parameter `use_node_attribute` is
+    provided, the feature extraction happens based on the values of that node attribute.
 
     Args:
         wl_iterations (int): Number of Weisfeiler-Lehman iterations. Default is 2.
-        attributed (bool): Presence of graph attributes. Default is False.
+        use_node_attribute (Optional[str]): The node attribute name from which to load node features. Default is None.
         dimensions (int): Dimensionality of embedding. Default is 128.
         workers (int): Number of cores. Default is 4.
         down_sampling (float): Down sampling frequency. Default is 0.0001.
@@ -33,7 +33,7 @@ class Graph2Vec(Estimator):
     def __init__(
         self,
         wl_iterations: int = 2,
-        attributed: bool = False,
+        use_node_attribute: Optional[str] = None,
         dimensions: int = 128,
         workers: int = 4,
         down_sampling: float = 0.0001,
@@ -45,7 +45,7 @@ def __init__(
     ):
 
         self.wl_iterations = wl_iterations
-        self.attributed = attributed
+        self.use_node_attribute = use_node_attribute
         self.dimensions = dimensions
         self.workers = workers
         self.down_sampling = down_sampling
@@ -66,7 +66,10 @@ def fit(self, graphs: List[nx.classes.graph.Graph]):
         graphs = self._check_graphs(graphs)
         documents = [
             WeisfeilerLehmanHashing(
-                graph, self.wl_iterations, self.attributed, self.erase_base_features
+                graph=graph,
+                wl_iterations=self.wl_iterations,
+                use_node_attribute=self.use_node_attribute,
+                erase_base_features=self.erase_base_features,
             )
             for graph in graphs
         ]
@@ -111,7 +114,10 @@ def infer(self, graphs) -> np.array:
         graphs = self._check_graphs(graphs)
         documents = [
             WeisfeilerLehmanHashing(
-                graph, self.wl_iterations, self.attributed, self.erase_base_features
+                graph=graph,
+                wl_iterations=self.wl_iterations,
+                use_node_attribute=self.use_node_attribute,
+                erase_base_features=self.erase_base_features,
             )
             for graph in graphs
         ]
diff --git a/karateclub/node_embedding/structural/role2vec.py b/karateclub/node_embedding/structural/role2vec.py
index 8100631f..d84230a8 100644
--- a/karateclub/node_embedding/structural/role2vec.py
+++ b/karateclub/node_embedding/structural/role2vec.py
@@ -126,7 +126,7 @@ def fit(self, graph: nx.classes.graph.Graph):
         hasher = WeisfeilerLehmanHashing(
             graph=graph,
             wl_iterations=self.wl_iterations,
-            attributed=False,
+            use_node_attribute=None,
             erase_base_features=self.erase_base_features,
         )
 
diff --git a/karateclub/utils/__init__.py b/karateclub/utils/__init__.py
index 67fe2946..91b5525d 100644
--- a/karateclub/utils/__init__.py
+++ b/karateclub/utils/__init__.py
@@ -1,3 +1,7 @@
 from .walker import RandomWalker
 from .diffuser import EulerianDiffuser
 from .treefeatures import WeisfeilerLehmanHashing
+
+__all__ = [
+    "RandomWalker", "EulerianDiffuser", "WeisfeilerLehmanHashing"
+]
\ No newline at end of file
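The sketch below (not part of the diff) illustrates the new validation added to `WeisfeilerLehmanHashing` in the next file: a partially attributed graph should now fail loudly instead of silently hashing a feature dictionary that is missing some nodes. The graph construction and attribute values are illustrative assumptions; the constructor signature follows the diff.

```python
# Illustrative sketch: exercising the new missing-attribute check.
import networkx as nx
from karateclub.utils import WeisfeilerLehmanHashing

graph = nx.path_graph(4)
# Only two of the four nodes receive the "feature" attribute.
nx.set_node_attributes(graph, {0: "a", 1: "b"}, "feature")

try:
    # Hashing runs in __init__, so the error surfaces immediately.
    WeisfeilerLehmanHashing(
        graph=graph,
        wl_iterations=2,
        use_node_attribute="feature",
        erase_base_features=False,
    )
except ValueError as error:
    print(error)  # reports 2 out of 4 nodes and lists the missing ones
```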
""" - if self.attributed: - self.features = nx.get_node_attributes(self.graph, "feature") + if self.use_node_attribute is not None: + # We retrieve the features of the nodes with the attribute name + # `feature` and assign them into a dictionary with structure: + # {node_a_name: feature_of_node_a} + # Nodes without this feature will not appear in the dictionary. + features = nx.get_node_attributes(self.graph, self.use_node_attribute) + + # We check whether all nodes have the requested feature + if len(features) != self.graph.number_of_nodes(): + missing_nodes = [] + # We find up to five missing nodes so to make + # a more informative error message. + for node in tqdm( + self.graph.nodes, + total=self.graph.number_of_nodes(), + leave=False, + dynamic_ncols=True, + desc="Searching for missing nodes" + ): + if node not in features: + missing_nodes.append(node) + if len(missing_nodes) > 5: + break + raise ValueError( + ( + "We expected for ALL graph nodes to have a node " + "attribute name `{}` to be used as part of " + "the requested embedding algorithm, but only {} " + "out of {} nodes has the correct attribute. " + "Consider checking for typos and missing values, " + "and use some imputation technique as necessary. " + "Some of the nodes without the requested attribute " + "are: {}" + ).format( + self.use_node_attribute, + len(features), + self.graph.number_of_nodes(), + missing_nodes + ) + ) + # If so, we assign the feature set. + self.features = features else: self.features = { node: self.graph.degree(node) for node in self.graph.nodes() } - self.extracted_features = {k: [str(v)] for k, v in self.features.items()} + self.extracted_features = {k: [str(v)] + for k, v in self.features.items()} def _erase_base_features(self): """ @@ -61,7 +103,8 @@ def _do_a_recursion(self): for node in self.graph.nodes(): nebs = self.graph.neighbors(node) degs = [self.features[neb] for neb in nebs] - features = [str(self.features[node])] + sorted([str(deg) for deg in degs]) + features = [str(self.features[node])] + \ + sorted([str(deg) for deg in degs]) features = "_".join(features) hash_object = hashlib.md5(features.encode()) hashing = hash_object.hexdigest() diff --git a/test/graph_embedding_test.py b/test/graph_embedding_test.py index f4f7261a..f9d8c667 100644 --- a/test/graph_embedding_test.py +++ b/test/graph_embedding_test.py @@ -1,5 +1,6 @@ import numpy as np import networkx as nx +import pytest from karateclub.graph_embedding import Graph2Vec, FGSD, GL2Vec, SF, IGE, LDP from karateclub.graph_embedding import NetLSD, GeoScattering, FeatherGraph @@ -146,7 +147,7 @@ def test_graph2vec(): nx.set_node_attributes(graph, {j: str(j) for j in range(75)}, "feature") new_graphs.append(graph) - model = Graph2Vec(attributed=True) + model = Graph2Vec(use_node_attribute="feature") model.fit(graphs) embedding = model.get_embedding() @@ -161,6 +162,13 @@ def test_graph2vec(): assert new_embedding.shape[1] == model.dimensions assert type(new_embedding) == np.ndarray + with pytest.raises(ValueError): + model = Graph2Vec( + use_node_attribute="missing_feature", + ) + + model.fit(new_graphs) + def test_gl2vec(): """ @@ -197,7 +205,6 @@ def test_gl2vec(): assert type(new_embedding) == np.ndarray - def test_ldp(): """ Test the LDP embedding.