
Updated documentation relative to the use of node attributes and relative tests
LucaCappelletti94 committed Mar 21, 2023
1 parent 5d6b90f commit a430e85
Showing 6 changed files with 89 additions and 25 deletions.
16 changes: 10 additions & 6 deletions karateclub/graph_embedding/gl2vec.py
@@ -1,3 +1,4 @@
+from typing import Optional
 import numpy as np
 import networkx as nx
 from typing import List
@@ -14,10 +15,6 @@ class GL2Vec(Estimator):
     these features a document (graph) - feature co-occurrence matrix is decomposed in order
     to generate representations for the graphs.
-    The procedure assumes that nodes have no string feature present and the WL-hashing
-    defaults to the degree centrality. However, if a node feature with the key "feature"
-    is supported for the nodes the feature extraction happens based on the values of this key.
     Args:
         wl_iterations (int): Number of Weisfeiler-Lehman iterations. Default is 2.
         dimensions (int): Dimensionality of embedding. Default is 128.
@@ -27,6 +24,7 @@ class GL2Vec(Estimator):
         learning_rate (float): HogWild! learning rate. Default is 0.025.
         min_count (int): Minimal count of graph feature occurrences. Default is 5.
         seed (int): Random seed for the model. Default is 42.
+        erase_base_features (bool): Whether to delete the base features.
     """

     def __init__(
@@ -79,7 +77,10 @@ def fit(self, graphs: List[nx.classes.graph.Graph]):
         graphs = [self._create_line_graph(graph) for graph in graphs]
         documents = [
             WeisfeilerLehmanHashing(
-                graph, self.wl_iterations, False, self.erase_base_features
+                graph=graph,
+                wl_iterations=self.wl_iterations,
+                use_node_attribute=None,
+                erase_base_features=self.erase_base_features,
             )
             for graph in graphs
         ]
@@ -125,7 +126,10 @@ def infer(self, graphs) -> np.array:
         graphs = [self._create_line_graph(graph) for graph in graphs]
         documents = [
             WeisfeilerLehmanHashing(
-                graph, self.wl_iterations, False, self.erase_base_features
+                graph=graph,
+                wl_iterations=self.wl_iterations,
+                use_node_attribute=None,
+                erase_base_features=self.erase_base_features,
             )
             for graph in graphs
         ]
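As a quick illustration of why GL2Vec passes use_node_attribute=None here: the model embeds line graphs built from the inputs, so node attributes of the original graphs are never consulted. A minimal usage sketch, not part of this commit; the graph generator and sizes are arbitrary:

import networkx as nx
from karateclub.graph_embedding import GL2Vec

# Karate Club expects node labels to be consecutive integers starting at 0.
graphs = [nx.newman_watts_strogatz_graph(50, 5, 0.3) for _ in range(20)]

model = GL2Vec(wl_iterations=2, dimensions=128)
model.fit(graphs)
embedding = model.get_embedding()  # expected shape: (20, 128)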
22 changes: 14 additions & 8 deletions karateclub/graph_embedding/graph2vec.py
@@ -1,6 +1,6 @@
 import numpy as np
 import networkx as nx
-from typing import List
+from typing import List, Optional
 from karateclub.estimator import Estimator
 from gensim.models.doc2vec import Doc2Vec, TaggedDocument
 from karateclub.utils.treefeatures import WeisfeilerLehmanHashing
@@ -14,12 +14,12 @@ class Graph2Vec(Estimator):
     to generate representations for the graphs.
     The procedure assumes that nodes have no string feature present and the WL-hashing
-    defaults to the degree centrality. However, if a node feature with the key "feature"
-    is supported for the nodes the feature extraction happens based on the values of this key.
+    defaults to the degree centrality. However, if the parameter `use_node_attribute` is
+    provided, the feature extraction happens based on the values of this attribute.
     Args:
         wl_iterations (int): Number of Weisfeiler-Lehman iterations. Default is 2.
-        attributed (bool): Presence of graph attributes. Default is False.
+        use_node_attribute (Optional[str]): The node attribute name from which to load node features. Default is None.
         dimensions (int): Dimensionality of embedding. Default is 128.
         workers (int): Number of cores. Default is 4.
         down_sampling (float): Down sampling frequency. Default is 0.0001.
@@ -33,7 +33,7 @@ def __init__(
     def __init__(
         self,
         wl_iterations: int = 2,
-        attributed: bool = False,
+        use_node_attribute: Optional[str] = None,
         dimensions: int = 128,
         workers: int = 4,
         down_sampling: float = 0.0001,
@@ -45,7 +45,7 @@ def __init__(
     ):

         self.wl_iterations = wl_iterations
-        self.attributed = attributed
+        self.use_node_attribute = use_node_attribute
         self.dimensions = dimensions
         self.workers = workers
         self.down_sampling = down_sampling
@@ -66,7 +66,10 @@ def fit(self, graphs: List[nx.classes.graph.Graph]):
         graphs = self._check_graphs(graphs)
         documents = [
             WeisfeilerLehmanHashing(
-                graph, self.wl_iterations, self.attributed, self.erase_base_features
+                graph=graph,
+                wl_iterations=self.wl_iterations,
+                use_node_attribute=self.use_node_attribute,
+                erase_base_features=self.erase_base_features,
             )
             for graph in graphs
         ]
@@ -111,7 +114,10 @@ def infer(self, graphs) -> np.array:
         graphs = self._check_graphs(graphs)
         documents = [
             WeisfeilerLehmanHashing(
-                graph, self.wl_iterations, self.attributed, self.erase_base_features
+                graph=graph,
+                wl_iterations=self.wl_iterations,
+                use_node_attribute=self.use_node_attribute,
+                erase_base_features=self.erase_base_features,
             )
             for graph in graphs
         ]
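The new keyword replaces the old attributed flag with an explicit attribute name. A hedged sketch of the resulting API, mirroring the updated test further below; the generator and sizes are illustrative, not from this commit:

import networkx as nx
from karateclub.graph_embedding import Graph2Vec

graphs = []
for _ in range(20):
    graph = nx.newman_watts_strogatz_graph(75, 5, 0.3)
    # Every node must carry the named attribute, or fit() raises ValueError.
    nx.set_node_attributes(graph, {j: str(j) for j in range(75)}, "feature")
    graphs.append(graph)

model = Graph2Vec(use_node_attribute="feature")
model.fit(graphs)
embedding = model.get_embedding()  # expected shape: (20, model.dimensions)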
2 changes: 1 addition & 1 deletion karateclub/node_embedding/structural/role2vec.py
@@ -126,7 +126,7 @@ def fit(self, graph: nx.classes.graph.Graph):
         hasher = WeisfeilerLehmanHashing(
             graph=graph,
             wl_iterations=self.wl_iterations,
-            attributed=False,
+            use_node_attribute=None,
             erase_base_features=self.erase_base_features,
         )
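Role2Vec only renames the keyword it forwards; its structural embedding still ignores node attributes. A minimal sketch of the unchanged public behavior, with the import path assumed from the file location above:

import networkx as nx
from karateclub.node_embedding.structural import Role2Vec

graph = nx.newman_watts_strogatz_graph(100, 10, 0.2)
model = Role2Vec()
model.fit(graph)
embedding = model.get_embedding()  # expected shape: (100, model.dimensions)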
4 changes: 4 additions & 0 deletions karateclub/utils/__init__.py
@@ -1,3 +1,7 @@
 from .walker import RandomWalker
 from .diffuser import EulerianDiffuser
 from .treefeatures import WeisfeilerLehmanHashing
+
+__all__ = [
+    "RandomWalker", "EulerianDiffuser", "WeisfeilerLehmanHashing"
+]
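With the explicit __all__, the utilities are re-exported at the package level, so the hashing class used throughout this commit can be imported as:

from karateclub.utils import WeisfeilerLehmanHashing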
59 changes: 51 additions & 8 deletions karateclub/utils/treefeatures.py
@@ -1,6 +1,7 @@
 import hashlib
 import networkx as nx
-from typing import List, Dict
+from tqdm.auto import tqdm
+from typing import List, Dict, Optional


 class WeisfeilerLehmanHashing(object):
@@ -10,23 +11,23 @@ class WeisfeilerLehmanHashing(object):
     Args:
         graph (NetworkX graph): NetworkX graph for which we do WL hashing.
         wl_iterations (int): Number of WL iterations.
-        attributed (bool): Presence of attributes.
+        use_node_attribute (Optional[str]): Optional name of the node attribute to be used.
         erase_base_features (bool): Whether to delete the base features.
     """

     def __init__(
         self,
         graph: nx.classes.graph.Graph,
         wl_iterations: int,
-        attributed: bool,
+        use_node_attribute: Optional[str],
         erase_base_features: bool,
     ):
         """
         Initialization method which also executes feature extraction.
         """
         self.wl_iterations = wl_iterations
         self.graph = graph
-        self.attributed = attributed
+        self.use_node_attribute = use_node_attribute
         self.erase_base_features = erase_base_features
         self._set_features()
         self._do_recursions()
@@ -35,13 +36,54 @@ def _set_features(self):
     def _set_features(self):
         """
         Creating the features.
         """
-        if self.attributed:
-            self.features = nx.get_node_attributes(self.graph, "feature")
+        if self.use_node_attribute is not None:
+            # We retrieve the features of the nodes under the requested
+            # attribute name and collect them into a dictionary with structure:
+            # {node_a_name: feature_of_node_a}
+            # Nodes without this attribute will not appear in the dictionary.
+            features = nx.get_node_attributes(self.graph, self.use_node_attribute)
+
+            # We check whether all nodes have the requested feature.
+            if len(features) != self.graph.number_of_nodes():
+                missing_nodes = []
+                # We collect up to five missing nodes so as to build
+                # a more informative error message.
+                for node in tqdm(
+                    self.graph.nodes,
+                    total=self.graph.number_of_nodes(),
+                    leave=False,
+                    dynamic_ncols=True,
+                    desc="Searching for missing nodes",
+                ):
+                    if node not in features:
+                        missing_nodes.append(node)
+                    if len(missing_nodes) > 5:
+                        break
+                raise ValueError(
+                    (
+                        "We expected ALL graph nodes to have a node "
+                        "attribute named `{}` to be used as part of "
+                        "the requested embedding algorithm, but only {} "
+                        "out of {} nodes have the correct attribute. "
+                        "Consider checking for typos and missing values, "
+                        "and use an imputation technique as necessary. "
+                        "Some of the nodes without the requested attribute "
+                        "are: {}"
+                    ).format(
+                        self.use_node_attribute,
+                        len(features),
+                        self.graph.number_of_nodes(),
+                        missing_nodes,
+                    )
+                )
+            # If all nodes have the attribute, we assign the feature set.
+            self.features = features
         else:
             self.features = {
                 node: self.graph.degree(node) for node in self.graph.nodes()
             }
-        self.extracted_features = {k: [str(v)] for k, v in self.features.items()}
+        self.extracted_features = {k: [str(v)]
+                                   for k, v in self.features.items()}

     def _erase_base_features(self):
         """
@@ -61,7 +103,8 @@ def _do_a_recursion(self):
         for node in self.graph.nodes():
             nebs = self.graph.neighbors(node)
             degs = [self.features[neb] for neb in nebs]
-            features = [str(self.features[node])] + sorted([str(deg) for deg in degs])
+            features = [str(self.features[node])] + \
+                sorted([str(deg) for deg in degs])
             features = "_".join(features)
             hash_object = hashlib.md5(features.encode())
             hashing = hash_object.hexdigest()
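A small sketch of the new validation path (internal API; the graph and attribute values are illustrative): a single node lacking the requested attribute now fails fast with the informative ValueError above, where it would previously surface as a KeyError deeper in the WL recursions.

import networkx as nx
from karateclub.utils.treefeatures import WeisfeilerLehmanHashing

graph = nx.path_graph(4)
# Node 3 deliberately lacks the "feature" attribute.
nx.set_node_attributes(graph, {0: "a", 1: "b", 2: "c"}, "feature")

try:
    WeisfeilerLehmanHashing(
        graph=graph,
        wl_iterations=2,
        use_node_attribute="feature",
        erase_base_features=False,
    )
except ValueError as error:
    print(error)  # names the attribute and lists up to five missing nodes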
11 changes: 9 additions & 2 deletions test/graph_embedding_test.py
@@ -1,5 +1,6 @@
 import numpy as np
 import networkx as nx
+import pytest
 from karateclub.graph_embedding import Graph2Vec, FGSD, GL2Vec, SF, IGE, LDP
 from karateclub.graph_embedding import NetLSD, GeoScattering, FeatherGraph

@@ -146,7 +147,7 @@ def test_graph2vec():
         nx.set_node_attributes(graph, {j: str(j) for j in range(75)}, "feature")
         new_graphs.append(graph)

-    model = Graph2Vec(attributed=True)
+    model = Graph2Vec(use_node_attribute="feature")

     model.fit(graphs)
     embedding = model.get_embedding()
@@ -161,6 +162,13 @@ def test_graph2vec():
     assert new_embedding.shape[1] == model.dimensions
     assert type(new_embedding) == np.ndarray

+    with pytest.raises(ValueError):
+        model = Graph2Vec(
+            use_node_attribute="missing_feature",
+        )
+
+        model.fit(new_graphs)


 def test_gl2vec():
     """
@@ -197,7 +205,6 @@ def test_gl2vec():
     assert type(new_embedding) == np.ndarray


-
 def test_ldp():
     """
     Test the LDP embedding.
