-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathFeatureGraphDataset.py
96 lines (79 loc) · 3.85 KB
/
FeatureGraphDataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import numpy as np
import random
from torch import LongTensor, Tensor
class FeatureGraphDataset(object):
    '''Feature + graph dataset for semi-supervised node classification.

    Holds a feature matrix, one integer label per node and an adjacency
    mapping, and hands out random batches of labelled, unlabelled, test and
    neighbour nodes either as numpy arrays or as torch tensors.
    '''

    def __init__(self, features, label, adj, ratio=0.5):
        '''Initialization

        Manually initialize a feature and graph dataset, then blend every
        node's features with the average of its neighbours' features.

        NOTE: `features` is modified in place (the caller's array changes),
        and nodes are processed in `adj` iteration order, so a node visited
        later sees the already-blended features of nodes visited earlier.

        Args:
            features: numpy ndarray, [[f1, f2, ...], [f1, f2, ...]]
            label: numpy ndarray, [0, 1, 2, 0, ...], label[i] == -1 if its
                class is unknown
            adj: dict of (int, list of int), {0: [1, 2], 1: [0, 3], ...}
            ratio: weight kept for a node's own features; the neighbour
                average gets weight (1 - ratio)
        '''
        assert len(features) == len(label)
        assert isinstance(features, np.ndarray) and isinstance(label, np.ndarray)
        self.features, self.label = features, label
        self.n = len(features)      # num of instances
        self.m = np.max(label) + 1  # num of classes
        self.k = features.shape[1]  # num of features
        self.adj = adj
        for node, neighbors in adj.items():
            degree = len(neighbors)
            if degree == 0:
                # Nothing to average over: keep the node's own features
                # instead of failing on an empty sum / division by zero.
                continue
            # Row-wise sum replaces the Python-2-only builtin `reduce`.
            neighbor_sum = self.features[list(neighbors)].sum(axis=0)
            self.features[node] = (self.features[node] * ratio
                                   + neighbor_sum * (1 - ratio) / degree)

    def setting(self, label_num_per_class, test_num):
        '''Set label data and test set in semi-supervised learning

        Label data and test set should be settled at first.

        `test_num` nodes are drawn at random as the test set. From the
        remaining nodes (visited in ascending id order) up to
        `label_num_per_class` nodes of each class become labelled data.
        Every node that is not labelled, test nodes included, goes to
        `unlabel_ids`.

        Args:
            label_num_per_class: maximum number of labelled nodes per class
            test_num: number of nodes in the test set
        '''
        self.test_ids = random.sample(range(self.n), test_num)
        remains = set(range(self.n)) - set(self.test_ids)
        num_of_class = [0] * int(self.m)
        self.label_ids = []
        for i in sorted(remains):
            cls = self.label[i]
            if cls < 0:
                # Unknown class (-1): must never be used as labelled data;
                # a negative index would otherwise hit the last class.
                continue
            if num_of_class[cls] < label_num_per_class:
                self.label_ids.append(i)
                num_of_class[cls] += 1
        self.unlabel_ids = list(set(range(self.n)) - set(self.label_ids))
        self.test_num, self.label_num = test_num, sum(num_of_class)

    def _make_batch(self, ids, with_labels, tensor):
        '''Gather features (and optionally labels) for `ids` into a tuple.'''
        feats = self.features[ids]
        if with_labels:
            labels = self.label[ids]
            if tensor:
                return (LongTensor(ids), Tensor(feats), LongTensor(labels))
            return (ids, feats, labels)
        if tensor:
            return (LongTensor(ids), Tensor(feats))
        return (ids, feats)

    def label_batch(self, batch_size, tensor=True):
        '''Return a batch of label data features

        Random sample (without replacement) from label data.

        Args:
            batch_size: number of nodes, at most len(self.label_ids)
            tensor: return torch tensors if True, else a list of ids and
                numpy arrays
        Return:
            tuple: (ids, features, labels) for the sampled nodes
        '''
        assert(len(self.label_ids) >= batch_size)
        ids = random.sample(self.label_ids, batch_size)
        return self._make_batch(ids, True, tensor)

    def unlabel_batch(self, batch_size, tensor=True):
        '''Return a batch of unlabel data features

        Random sample (without replacement) from unlabel data; a batch_size
        of -1 returns all of it.

        Args:
            batch_size: number of nodes, or -1 for every unlabelled node
            tensor: return torch tensors if True, else a list of ids and a
                numpy array
        Return:
            tuple: (ids, features) for the selected nodes
        '''
        if batch_size == -1:
            ids = self.unlabel_ids
        else:
            ids = random.sample(self.unlabel_ids, batch_size)
        return self._make_batch(ids, False, tensor)

    def test_batch(self, batch_size=-1, tensor=True):
        '''Return a batch of test data

        Random sample (without replacement) from the test set; a batch_size
        of -1 (the default) returns the whole test set.

        Return:
            tuple: (ids, features, labels) for the selected nodes
        '''
        if batch_size == -1:
            ids = self.test_ids
        else:
            ids = random.sample(self.test_ids, batch_size)
        return self._make_batch(ids, True, tensor)

    def adj_batch(self, batch, tensor=True):
        '''Return one randomly chosen neighbour for every node in `batch`

        Args:
            batch: iterable of node ids; each must be a key of `adj` with
                at least one neighbour
            tensor: return torch tensors if True, else a list of ids and a
                numpy array
        Return:
            tuple: (neighbour ids, neighbour features)
        '''
        ids = [random.choice(self.adj[i]) for i in batch]
        return self._make_batch(ids, False, tensor)

    def read_embbedings(self, embbeding_file):
        '''read graph embbedings from file

        Read graph embbedings generated by OpenNE system. The first line
        holds "<num nodes> <dimension>"; every following non-blank line
        holds a node id followed by its embedding values. The result is
        stored in `self.embbedings` (shape (n, d)) and `self.d`.

        Args:
            embbeding_file: path of the embedding text file
        '''
        with open(embbeding_file, 'r') as f:
            n, self.d = [int(i) for i in f.readline().split()]
            assert n == self.n
            self.embbedings = np.zeros((n, self.d))
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                self.embbedings[int(fields[0])] = [float(i) for i in fields[1:]]