forked from jaywalnut310/Label-Propagation-with-Seq2Seq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbatch.py
121 lines (90 loc) · 3.47 KB
/
batch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import numpy as np
import networkx as nx
from embeddings_graph import EmbeddingsGraph
from data import KOEN
import unicodedata
graph = EmbeddingsGraph().graph
batch_size = 32
data = KOEN(batch_size, 'train')
data2 = KOEN(batch_size, 'train.mono')
with open('./data/raw/ko.train', 'r') as f:
ss_L = f.readlines()
with open('./data/raw/ko.train.mono', 'r') as f:
ss_U = f.readlines()
l = len(ss_L) #last index of labeled samples
u = l + len(ss_U) #last index of all samples
data.source.extend(data2.source)
data.source = np.array(data.source)
data.target = np.array(data.target)
def label(i):
if 0 <= i < l:
return data.target[i]
def next_batch(h_edges, start, finish):
"""
Helper function for the iterator, note that the neural graph machines,
due to its unique loss function, requires carefully crafted inputs
Refer to the Neural Graph Machines paper, section 3 and 3.3 for more details
"""
edges_ll = list()
edges_lu = list()
edges_uu = list()
weights_ll = list()
weights_lu = list()
weights_uu = list()
batch_edges = h_edges[start:finish]
batch_edges = np.asarray(batch_edges)
for i, j in batch_edges[:]:
if (0 <= i < l) and (0 <= j < l):
edges_ll.append((i, j))
weights_ll.append(graph.get_edge_data(i,j)['weight'])
elif (0 <= i < l) and (l <= j < u):
edges_lu.append((i, j))
weights_lu.append(graph.get_edge_data(i,j)['weight'])
else:
edges_uu.append((i, j))
weights_uu.append(graph.get_edge_data(i,j)['weight'])
if len(edges_ll)==0 or len(edges_lu)==0 or len(edges_uu)==0:
np.random.shuffle(h_edges[start:])
return next_batch(h_edges,start,finish)
u_ll = [e[0] for e in edges_ll]
# number of incident edges for nodes u
c_ull = [1 / len(graph.edges(n)) for n in u_ll]
v_ll = [e[1] for e in edges_ll]
c_vll = [1 / len(graph.edges(n)) for n in v_ll]
nodes_ll_u = data.source[u_ll]
if len(u_ll) != 0:
labels_ll_u = np.vstack([label(n) for n in u_ll])
else:
labels_ll_u = np.empty((0, data.target.shape[1]))
nodes_ll_v = data.source[v_ll]
if len(v_ll) != 0:
labels_ll_v = np.vstack([label(n) for n in v_ll])
else:
labels_ll_v = np.empty((0, data.target.shape[1]))
u_lu = [e[0] for e in edges_lu]
c_ulu = [1 / len(graph.edges(n)) for n in u_lu]
nodes_lu_u = data.source[u_lu]
nodes_lu_v = data.source[[e[1] for e in edges_lu]]
if len(u_lu) != 0:
labels_lu = np.vstack([label(n) for n in u_lu])
else:
labels_lu = np.empty((0, data.target.shape[1]))
nodes_uu_u = data.source[[e[0] for e in edges_uu]]
nodes_uu_v = data.source[[e[1] for e in edges_uu]]
return nodes_ll_u, nodes_ll_v, labels_ll_u, labels_ll_v, \
nodes_uu_u, nodes_uu_v, nodes_lu_u, nodes_lu_v, \
labels_lu, weights_ll, weights_lu, weights_uu, \
c_ull, c_vll, c_ulu
def batch_iter(batch_size):
"""
Generates a batch iterator for the dataset.
"""
data_size = len(graph.edges())
edges = np.random.permutation(graph.edges())
num_batches = int(data_size / batch_size)
if data_size % batch_size > 0:
num_batches = int(data_size / batch_size) + 1
for batch_num in range(num_batches):
start_index = batch_num * batch_size
end_index = min((batch_num + 1) * batch_size, data_size)
yield next_batch(edges,start_index,end_index)