"""MUSAE model class."""
import json
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from gensim.models.doc2vec import Doc2Vec
from walkers import FirstOrderRandomWalker, SecondOrderRandomWalker
from utils import load_graph, load_features, create_documents


class MUSAE:
    """
    Multi-Scale Attributed Embedding class.
    For details see the paper:
    Multi-scale Attributed Node Embedding, Benedek Rozemberczki, Carl Allen, Rik Sarkar
    https://arxiv.org/abs/1909.13021
    """
    def __init__(self, args):
        """
        MUSAE and AE machine constructor.
        :param args: Arguments object with the model hyperparameters.
        """
        self.args = args
        self.log = dict()
        self.graph = load_graph(args.graph_input)
        self.features = load_features(args.features_input)

    def do_sampling(self):
        """
        Running a first or second-order random walk sampler.
        Measuring the sampling runtime.
        """
        self.log["walk_start_time"] = time.time()
        if self.args.sampling == "second":
            self.sampler = SecondOrderRandomWalker(self.graph,
                                                   self.args.P,
                                                   self.args.Q,
                                                   self.args.walk_number,
                                                   self.args.walk_length)
        else:
            self.sampler = FirstOrderRandomWalker(self.graph,
                                                  self.args.walk_number,
                                                  self.args.walk_length)
        self.walks = self.sampler.walks
        del self.sampler
        self.log["walk_end_time"] = time.time()

    def _create_single_embedding(self, features):
        """
        Learning an embedding from a feature hash table.
        :param features: A hash table with node keys and feature list values.
        :return embedding: Numpy array of the embedding.
        """
        print("\nLearning the embedding.")
        document_collections = create_documents(features)
        model = Doc2Vec(document_collections,
                        vector_size=self.args.dimensions,
                        window=0,
                        min_count=self.args.min_count,
                        alpha=self.args.alpha,
                        dm=0,  # PV-DBOW training
                        negative=self.args.negative_samples,
                        ns_exponent=self.args.exponent,
                        min_alpha=self.args.min_alpha,
                        sample=self.args.down_sampling,
                        workers=self.args.workers,
                        epochs=self.args.epochs)
        emb = np.array([model.docvecs[str(n)] for n in range(self.graph.number_of_nodes())])
        return emb

    def _create_documents(self, features):
        """
        Flattening the feature lists into a single document per node.
        :param features: Hash table with node keys and lists of feature lists.
        :return features_out: Hash table with node keys and flattened feature lists.
        """
        print("Creating documents.")
        features_out = {}
        for node, feature_set in tqdm(features.items(), total=len(features)):
            features_out[str(node)] = [feat for feat_elems in feature_set for feat in feat_elems]
        return features_out

    def _setup_musae_features(self, approximation):
        """
        Creating the MUSAE feature set.
        :param approximation: Approximation order.
        :return features: Feature hash table.
        """
        features = {str(node): [] for node in self.graph.nodes()}
        print("Processing attributed walks.")
        for walk in tqdm(self.walks):
            for i in range(len(walk)-approximation):
                source = walk[i]
                target = walk[i+approximation]
                features[str(source)].append(self.features[str(target)])
                features[str(target)].append(self.features[str(source)])
        return self._create_documents(features)

    def _setup_ae_features(self):
        """
        Create AE feature set.
        :return features: Feature set hash table.
        """
        features = {str(node): [] for node in self.graph.nodes()}
        print("Processing attributed walks.")
        for walk in tqdm(self.walks):
            for i in range(len(walk)-self.args.approximation_order):
                for j in range(self.args.approximation_order):
                    source = walk[i]
                    target = walk[i+j+1]
                    features[str(source)].append(self.features[str(target)])
                    features[str(target)].append(self.features[str(source)])
        return self._create_documents(features)

    def _print_approximation_order(self, approximation):
        """
        Nice printing of the approximation order for MUSAE.
        :param approximation: Approximation order.
        """
        print("\nApproximation order: " + str(approximation + 1) + ".\n")

    def _learn_musae_embedding(self):
        """
        Learning MUSAE embeddings up to the approximation order.
        """
        for approximation in range(self.args.approximation_order):
            self._print_approximation_order(approximation)
            features = self._setup_musae_features(approximation+1)
            embedding = self._create_single_embedding(features)
            self.embeddings.append(embedding)

    def _learn_ae_embedding(self):
        """
        Learning an AE embedding.
        """
        features = self._setup_ae_features()
        embedding = self._create_single_embedding(features)
        self.embeddings.append(embedding)

    def learn_embedding(self):
        """
        Learning the embeddings and measuring optimization runtime.
        """
        self.log["optim_start_time"] = time.time()
        self.embeddings = []
        if self.args.base_model == "null":
            embedding = self._create_single_embedding(self.features)
            self.embeddings.append(embedding)
        if self.args.model == "musae":
            self._learn_musae_embedding()
        else:
            self._learn_ae_embedding()
        self.embeddings = np.concatenate(self.embeddings, axis=1)
        self.log["optim_end_time"] = time.time()

    def save_embedding(self):
        """
        Method to save the embedding.
        """
        print("\nSaving embedding.\n")
        columns = ["id"] + ["x_"+str(x) for x in range(self.embeddings.shape[1])]
        ids = np.array(range(self.embeddings.shape[0])).reshape(-1, 1)
        self.embeddings = np.concatenate([ids, self.embeddings], axis=1)
        self.embeddings = pd.DataFrame(self.embeddings, columns=columns)
        self.embeddings.to_csv(self.args.output, index=None)

    def save_logs(self):
        """
        Method to save the logs.
        """
        print("Saving the logs.")
        with open(self.args.log, "w") as f:
            json.dump(self.log, f)
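

if __name__ == "__main__":
    # Minimal usage sketch. The field names below match the attributes this class
    # reads from `args`; the concrete values and file paths are illustrative
    # assumptions, not project defaults.
    from argparse import Namespace

    args = Namespace(graph_input="input/edges.csv",          # assumed path
                     features_input="input/features.json",   # assumed path
                     output="output/embedding.csv",          # assumed path
                     log="output/log.json",                  # assumed path
                     model="musae",        # "musae", or any other value for the AE variant
                     base_model="null",    # "null" also learns an embedding from the raw features
                     sampling="first",     # "first" or "second" order random walks
                     P=1.0,                # second-order walk return parameter
                     Q=1.0,                # second-order walk in-out parameter
                     walk_number=5,
                     walk_length=80,
                     approximation_order=3,
                     dimensions=32,
                     epochs=5,
                     workers=4,
                     min_count=1,
                     alpha=0.05,
                     min_alpha=0.025,
                     negative_samples=5,
                     exponent=0.75,
                     down_sampling=0.0001)

    model = MUSAE(args)
    model.do_sampling()        # sample first- or second-order random walks
    model.learn_embedding()    # learn and concatenate the Doc2Vec embeddings
    model.save_embedding()     # write the embedding table to args.output
    model.save_logs()          # write the runtime log to args.log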