# utils.py
"""
工具函数
1.csc2spars_torch
csc、csr、coo 稀疏格式转 torch.sparse格式
2.load_gnn_data
导入 gnn 的 Cora 、Citeseer 、 Pubmed数据集
3.re_process_adj
Gnn预处理邻接矩阵,提高稳定性
4.save_model_Pytorch
保存 Pytorch 模型
5.evaluate_Pytorch
求准确率 accuracy
6.save_csv
文件IO
7.save_pickle
保存字典等变量
8.early_stop
提前停止
9.cos_similarity
求余弦相似度
10.Sparse_dropout_pytorch
解决grad 变为Fasle问题
11.save_txt
12.gen_log
生成log日志文件
13.tensorboard
绘制loss, acc曲线
"""
def csc2spars_torch(data_csc):
    # data_csc is a scipy.sparse matrix in csc format (csr/coo also work)
    import numpy as np
    import torch
    data_coo = data_csc.tocoo()
    values = data_coo.data.astype(np.float32)
    indices = np.vstack((data_coo.row, data_coo.col)).astype(np.int64)
    i = torch.LongTensor(indices)
    v = torch.FloatTensor(values)
    shape = data_coo.shape
    return torch.sparse.FloatTensor(i, v, torch.Size(shape))
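# Illustrative usage of csc2spars_torch (a minimal sketch; the 3x3 identity
# matrix is just an example input, not part of the original code).
def _demo_csc2spars_torch():
    import scipy.sparse as sp
    eye_csc = sp.eye(3).tocsc()   # any scipy sparse matrix works
    t = csc2spars_torch(eye_csc)  # torch sparse COO tensor
    print(t.to_dense())           # dense 3x3 identity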
def load_gnn_data(data_path, dataset_str):
    """
    Loads input data from gcn/data directory
    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
        object;
    ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.
    All objects above must be saved using python pickle module.
    :param dataset_str: Dataset name
    :return: All data input files loaded (as well as the training/test data).
    """
    import numpy as np
    import pickle as pkl
    import networkx as nx
    import scipy.sparse as sp
    import sys

    def parse_index_file(filename):
        """Parse index file."""
        index = []
        for line in open(filename):
            index.append(int(line.strip()))
        return index

    def sample_mask(idx, l):
        """Create mask."""
        mask = np.zeros(l)
        mask[idx] = 1
        return np.array(mask, dtype=bool)  # np.bool is deprecated; use the builtin

    def sparse_to_tuple(sparse_mx):
        """Convert sparse matrix to tuple representation."""
        def to_tuple(mx):
            if not sp.isspmatrix_coo(mx):
                mx = mx.tocoo()
            coords = np.vstack((mx.row, mx.col)).transpose()
            values = mx.data
            shape = mx.shape
            return coords, values, shape
        # the original was missing this dispatch and return
        if isinstance(sparse_mx, list):
            for i in range(len(sparse_mx)):
                sparse_mx[i] = to_tuple(sparse_mx[i])
        else:
            sparse_mx = to_tuple(sparse_mx)
        return sparse_mx

    def preprocess_features(features):
        """Row-normalize feature matrix and convert to tuple representation"""
        rowsum = np.array(features.sum(1))
        r_inv = np.power(rowsum, -1).flatten()
        r_inv[np.isinf(r_inv)] = 0.
        r_mat_inv = sp.diags(r_inv)
        features = r_mat_inv.dot(features)
        return sparse_to_tuple(features)

    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open(data_path + "ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))
    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file(data_path + "ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)
    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]
    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y) + 500)
    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])
    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]
    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, idx_train, idx_test, idx_val
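# Illustrative usage of load_gnn_data (a minimal sketch; the 'data/' path and
# 'cora' dataset name are assumptions — point data_path at the directory that
# holds the ind.* pickle files from the Planetoid/GCN release).
def _demo_load_gnn_data():
    (adj, features, y_train, y_val, y_test,
     train_mask, val_mask, test_mask,
     idx_train, idx_test, idx_val) = load_gnn_data('data/', 'cora')
    print(adj.shape, features.shape, y_train.shape)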
def pre_process_adj(adj):
    import numpy as np
    import scipy.sparse as sp

    def sparse_to_tuple(sparse_mx):
        """Convert sparse matrix to tuple representation."""
        def to_tuple(mx):
            if not sp.isspmatrix_coo(mx):
                mx = mx.tocoo()
            coords = np.vstack((mx.row, mx.col)).transpose()
            values = mx.data
            shape = mx.shape
            return coords, values, shape
        if isinstance(sparse_mx, list):
            for i in range(len(sparse_mx)):
                sparse_mx[i] = to_tuple(sparse_mx[i])
        else:
            sparse_mx = to_tuple(sparse_mx)
        return sparse_mx

    def normalize_adj(adj):
        """Symmetrically normalize adjacency matrix."""
        adj = sp.coo_matrix(adj)
        rowsum = np.array(adj.sum(1))
        d_inv_sqrt = np.power(rowsum, -0.5).flatten()
        d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
        d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
        return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()

    def preprocess_adj(adj):
        """Preprocessing of adjacency matrix for simple GCN model and conversion to tuple representation."""
        adj_normalized = normalize_adj(adj + sp.eye(adj.shape[0]))
        return sparse_to_tuple(adj_normalized)

    # The original body only defined the helpers; returning the renormalized
    # adjacency D^-1/2 (A + I) D^-1/2 makes the function actually usable.
    return preprocess_adj(adj)
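# Illustrative usage of pre_process_adj (a minimal sketch; the 4-node ring
# adjacency below is just an example input).
def _demo_pre_process_adj():
    import numpy as np
    import scipy.sparse as sp
    ring = sp.csr_matrix(np.array([[0, 1, 0, 1],
                                   [1, 0, 1, 0],
                                   [0, 1, 0, 1],
                                   [1, 0, 1, 0]], dtype=np.float32))
    coords, values, shape = pre_process_adj(ring)
    print(coords.shape, values.shape, shape)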
def save_model_Pytorch(model, save_path, Model_structure):
    import os
    import torch
    # Method 1: save only the model parameters
    # save
    torch.save(model.state_dict(), os.path.join(save_path, 'parameter.pkl'))
    # load
    model = Model_structure()
    model.load_state_dict(torch.load(os.path.join(save_path, 'parameter.pkl')))
    # Method 2: save both the model structure and its parameters
    # save
    torch.save(model, os.path.join(save_path, 'model.pkl'))
    # load
    model = torch.load(os.path.join(save_path, 'model.pkl'))
    # Method 3: load from a checkpoint (see the sketch below)
    pass
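# A minimal sketch of "Method 3" above (saving/loading a training checkpoint).
# The dict keys and the 'checkpoint.pt' filename are assumptions, not part of
# the original code.
def save_checkpoint_sketch(model, optimizer, epoch, path='checkpoint.pt'):
    import torch
    torch.save({'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()}, path)

def load_checkpoint_sketch(model, optimizer, path='checkpoint.pt'):
    import torch
    ckpt = torch.load(path)
    model.load_state_dict(ckpt['model_state_dict'])
    optimizer.load_state_dict(ckpt['optimizer_state_dict'])
    return ckpt['epoch']  # resume training from the saved epoch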
def evaluate_Pytorch():
    # left as a stub in the original; see the accuracy sketch below
    pass
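# A minimal accuracy sketch for the evaluate_Pytorch stub above, assuming
# logits of shape [N, C] and integer class labels of shape [N] (these
# argument conventions are assumptions, not from the original code).
def evaluate_accuracy_sketch(logits, labels):
    import torch
    preds = logits.argmax(dim=1)          # predicted class per sample
    return (preds == labels).float().mean().item()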
def save_csv():
    # 1. numpy
    import numpy as np
    array = np.arange(6).reshape(2, 3)  # example array to write
    np.savetxt('frame', array, fmt='%d', delimiter=',')  # frame: file name, array: data to save
    # # load csv
    # np.loadtxt('frame', dtype=int, delimiter=',', unpack=False)  # frame: file name; unpack: if True, columns are read into separate variables
    # 2. pandas
    import pandas as pd
    # arbitrary lists of equal length
    a = [1, 2, 3]
    b = [4, 5, 6]
    # the dict keys become the csv column names
    dataframe = pd.DataFrame({'a_name': a, 'b_name': b})
    # save the DataFrame as csv; index controls whether the row index is written (default True)
    dataframe.to_csv("test.csv", index=False, sep=',', mode='a', header=None)
    # load csv
    # data = pd.read_csv('test.csv', sep=',')
def save_pickle(data, file_name):
    import pickle as pkl
    with open(file_name + '_indices.pkl', "wb") as f:
        pkl.dump(data, f)
    # load
    with open(file_name + '_indices.pkl', "rb") as f:
        data2 = pkl.load(f)
def early_stop():
    # Reference pseudocode, kept as comments in the original:
    # if train_loss < best_loss:
    #     if val_acc > best_val_acc:
    #         best_val_acc = val_acc
    #         best_acc = test_acc
    #         best_epoch = epoch
    #     best_loss = train_loss
    #     count = 0
    # count += 1
    # if count > 500:
    #     # print("best_epoch:{:d} | best_acc:{:.3f} | best_loss:{:.5f}".format(best_epoch, best_acc, best_loss))
    #     break
    # print("best_epoch:{:d} | best_loss:{:.5f} | best_acc:{:.3f}".format(best_epoch, best_loss, best_acc))
    pass
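# A runnable counterpart to the pseudocode above: a minimal early-stopping
# helper (a sketch; the class name, the patience default of 500, and the
# "higher metric is better" convention are assumptions, not from the original).
class EarlyStoppingSketch:
    def __init__(self, patience=500):
        self.patience = patience
        self.best_metric = float('-inf')
        self.count = 0

    def step(self, metric):
        """Return True when training should stop."""
        if metric > self.best_metric:
            self.best_metric = metric
            self.count = 0
        else:
            self.count += 1
        return self.count > self.patience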
def cos_similarity(vector_a, vector_b):
    """
    Compute the cosine similarity between two sets of row vectors.
    :param vector_a: matrix of row vectors a
    :param vector_b: matrix of row vectors b
    :return: sim
    """
    import numpy as np
    # np.mat is deprecated; plain 2-D arrays behave the same here
    vector_a = np.atleast_2d(np.asarray(vector_a))
    vector_b = np.atleast_2d(np.asarray(vector_b))
    num = np.matmul(vector_a, vector_b.T)  # e.g. [140,1433] * [1433,1000] = [140,1000]
    # np.linalg.norm(data, axis=1) computes the L2 norm of each row
    denom = np.matmul(np.linalg.norm(vector_a, axis=1)[:, np.newaxis],
                      np.linalg.norm(vector_b, axis=1)[np.newaxis, :])
    cos = num / denom
    sim = 0.5 + 0.5 * cos  # rescale cos from [-1, 1] to [0, 1]
    # sim = cos
    return sim
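# Illustrative usage of cos_similarity (a minimal sketch; the two 2x3 example
# matrices are arbitrary).
def _demo_cos_similarity():
    import numpy as np
    a = np.array([[1., 0., 0.], [0., 1., 0.]])
    b = np.array([[1., 0., 0.], [1., 1., 0.]])
    print(cos_similarity(a, b))  # 2x2 matrix of similarities in [0, 1]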
def Sparse_dropout_pytorch():
    import torch
    import torch.nn as nn
    """
    class SparseDropout(nn.Module):
        def __init__(self, p=0.5):
            super(SparseDropout, self).__init__()
            # p is the dropout ratio; convert to keep probability
            self.kprob = 1 - p

        def forward(self, x):
            if not self.training:
                return x
            mask = ((torch.rand(x._values().size()) + self.kprob).floor()).type(torch.bool)
            rc = x._indices()[:, mask]
            val = x._values()[mask] * (1.0 / self.kprob)
            return torch.sparse.FloatTensor(rc, val, x.shape).to(x.device)

    # This version handles dropout correctly under eval mode. The idea is to
    # zero out part of the input with probability p so the corresponding
    # weights have no effect, then rescale val so the remaining total matches
    # the pre-dropout input. However, when x has requires_grad=True, the
    # gradient is lost (grad becomes False).
    """
    def sparse_dropout(x, rate, noise_shape):
        """
        :param x: torch sparse tensor
        :param rate: dropout rate
        :param noise_shape: int scalar, number of non-zero entries in x
        :return: sparse tensor with dropout applied
        """
        random_tensor = 1 - rate
        random_tensor += torch.rand(noise_shape).to(x.device)
        dropout_mask = torch.floor(random_tensor).type(torch.bool)
        # Reading the values through a dense view keeps the gradient,
        # at the cost of slightly slower computation. (The indices must be
        # taken before densifying; _indices() only exists on sparse tensors.)
        i = x._indices()  # e.g. [2, 49216]
        x_dense = x.to_dense()
        v = x_dense[i[0], i[1]]
        # keep only the entries selected by the mask
        i = i[:, dropout_mask]
        v = v[dropout_mask]
        out = torch.sparse.FloatTensor(i, v, x.shape).to(x.device)
        out = out * (1. / (1 - rate))
        return out
    return sparse_dropout
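# Illustrative usage of the gradient-preserving sparse_dropout above (a
# minimal sketch; the 3x3 example tensor and the 0.5 rate are arbitrary).
def _demo_sparse_dropout():
    import torch
    sparse_dropout = Sparse_dropout_pytorch()
    i = torch.LongTensor([[0, 1, 2], [0, 1, 2]])
    v = torch.FloatTensor([1., 2., 3.])
    x = torch.sparse.FloatTensor(i, v, torch.Size([3, 3]))
    out = sparse_dropout(x, rate=0.5, noise_shape=3)  # noise_shape = nnz
    print(out.to_dense())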
def save_txt():
    # 'a' appends to the file, keeping the existing content and writing after
    # it; the file is created automatically if it does not exist. Other modes
    # ('w+', 'w', 'wb', etc.) can be used instead.
    f = open("data/model_Weight.txt", 'a')
    f.write("hello,sha")  # write a string to the file
    f.write("\n")         # newline
    f.close()
    # This image summarizes the file modes well:
    # https://www.runoob.com/wp-content/uploads/2013/11/2112205-861c05b2bdbc9c28.png
def gen_log():
    import logging
    # basicConfig must run before the first logging call; otherwise the root
    # logger gets auto-configured and basicConfig becomes a no-op.
    # The default output level is WARNING.
    log_fmt = "%(asctime)s|%(levelname)s|%(filename)s:%(lineno)s|%(message)s"
    logging.basicConfig(filename='debug.log', filemode='w', format=log_fmt, level=logging.DEBUG)
    logging.debug('this is debug message')
    logging.info('this is info message')
    logging.warning('this is warning message')
    logging.error('this is error message')
    logging.critical('this is critical message')
    # Without basicConfig, only WARNING and above would be printed to the
    # console, e.g.: WARNING:root:this is warning message
def tensorboard():
    from tensorboardX import SummaryWriter
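    # A minimal sketch of plotting loss/acc curves with tensorboardX (the
    # 'runs/demo' log dir and the placeholder metric values are assumptions,
    # not from the original code). View with: tensorboard --logdir runs
    writer = SummaryWriter('runs/demo')
    for epoch in range(3):
        writer.add_scalar('loss', 1.0 / (epoch + 1), epoch)
        writer.add_scalar('acc', epoch / 3.0, epoch)
    writer.close()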