from __future__ import print_function
from six.moves import xrange
import six.moves.cPickle as pickle
import gzip
import os
import numpy
import theano
from six import StringIO  # Python 2/3 compatible StringIO
def read_data_xy(readfilename):
    '''Read one .txt file in which every sample spans four lines: three
    space-separated index sequences followed by one line of three labels.'''
    f = open(readfilename, "r")  # .txt file
    x0 = []  # list of lists
    x1 = []  # list of lists
    x2 = []  # list of lists
    y0 = []  # list
    y1 = []  # list
    y_one_out = []  # list
    allLines = f.readlines()
    '''for tmp_line in f:
        oneList = map(int, tmp_line.split(' '))
        x.append(oneList[:-1])
        y.append(oneList[-1])'''
    f.close()
    i = 0
    while i < len(allLines):
        x0.append([int(w) for w in allLines[i].split()])
        i += 1
        x1.append([int(w) for w in allLines[i].split()])
        i += 1
        x2.append([int(w) for w in allLines[i].split()])
        i += 1
        labels = [int(w) for w in allLines[i].split()]
        y0.append(labels[0])
        y1.append(labels[1])
        y_one_out.append(labels[2])
        i += 1
    return x0, x1, x2, y0, y1, y_one_out
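
# A minimal usage sketch for read_data_xy. The exact file layout is an
# assumption inferred from the loop above: every sample occupies four
# consecutive lines, three space-separated index sequences (x0, x1, x2)
# followed by one line with three integer labels (y0, y1, y_one_out), e.g.:
#
#   12 7 4 98
#   3 55 0
#   8 8 21 6 40
#   1 0 1
#
# x0, x1, x2, y0, y1, y_one_out = read_data_xy("../train_idx.txt")
# print(len(x0), y0[0])
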
def produce_data(readfilenames, savefilename):
    '''Read the train/valid/test .txt files and pickle them as a single tuple.'''
    tr_x0, tr_x1, tr_x2, tr_y0, tr_y1, tr_y_one = read_data_xy(readfilenames[0])
    va_x0, va_x1, va_x2, va_y0, va_y1, va_y_one = read_data_xy(readfilenames[1])
    te_x0, te_x1, te_x2, te_y0, te_y1, te_y_one = read_data_xy(readfilenames[2])
    data = ((tr_x0, tr_x1, tr_x2, tr_y0, tr_y1, tr_y_one),
            (va_x0, va_x1, va_x2, va_y0, va_y1, va_y_one),
            (te_x0, te_x1, te_x2, te_y0, te_y1, te_y_one))  # tuple of tuples
    f = open(savefilename, 'wb')
    pickle.dump(data, f)
    f.close()
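
# produce_data writes an uncompressed pickle, while load_data below reads the
# file with gzip.open. A minimal bridging sketch (this step is an assumption
# about the intended workflow, not part of the original pipeline):
#
#   import shutil
#   with open('../mydata.pkl', 'rb') as src, gzip.open('../mydata.pkl.gz', 'wb') as dst:
#       shutil.copyfileobj(src, dst)
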
def prepare_data(seqs, addIdxNum=0, maxlen=None, win_size=1):
    """Create the matrices from the dataset.

    This pads each sequence to the same length: the length of the
    longest sequence or maxlen. If maxlen is set, all sequences are cut
    to this maximum length (the maxlen filtering is currently disabled;
    see the commented-out block below).
    """
    # seqs: a list of sentences, each a list of word indices
    lengths = [len(s) for s in seqs]
    '''if maxlen is not None:
        new_seqs = []
        new_labels = []
        new_lengths = []
        for l, s, y in zip(lengths, seqs, labels):
            if l < maxlen:
                new_seqs.append(s)
                new_labels.append(y)
                new_lengths.append(l)
        lengths = new_lengths
        labels = new_labels
        seqs = new_seqs
        if len(lengths) < 1:
            return None, None, None'''
    n_samples = len(seqs)  # number of sentences
    maxlen = numpy.max(lengths)
    x = numpy.zeros((maxlen, n_samples)).astype('int32')
    x_mask = numpy.zeros(((maxlen - addIdxNum) // win_size, n_samples)).astype(theano.config.floatX)
    for idx, s in enumerate(seqs):
        x[:lengths[idx], idx] = s
        x_mask[:((lengths[idx] - addIdxNum) // win_size), idx] = 1.
    #labels = numpy.asarray(labels).astype('int32')
    return x, x_mask, maxlen - addIdxNum
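
# A toy sketch of prepare_data with the default addIdxNum=0 and win_size=1
# (values are illustrative only):
#
#   seqs = [[4, 7, 2], [9, 1]]
#   x, x_mask, n_steps = prepare_data(seqs)
#   # x has shape (3, 2): one zero-padded column per sequence.
#   # x_mask has shape (3, 2): ones mark real tokens, zeros mark padding.
#   # n_steps == 3, i.e. maxlen minus addIdxNum.
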
def load_data(path, n_words, valid_portion=0.0, maxlen=None,
              sort_by_len=False):
    '''Loads the dataset.

    :type path: String
    :param path: The path to the dataset
    :type n_words: int
    :param n_words: The number of words to keep in the vocabulary.
        All extra words are set to unknown (1).
    :type valid_portion: float
    :param valid_portion: The proportion of the full train set used for
        the validation set.
    :type maxlen: None or positive int
    :param maxlen: the max sequence length we use in the train/valid set.
    :type sort_by_len: bool
    :param sort_by_len: Sort by sequence length for the train, valid and
        test set. This allows faster execution as it causes less padding
        per minibatch. Another mechanism must be used to shuffle the
        train set at each epoch.
    '''

    #############
    # LOAD DATA #
    #############
    f = gzip.open(path, 'rb')
    train_set, valid_set, test_set = pickle.load(f)
    f.close()

    '''if maxlen:
        new_train_set_x = []
        new_train_set_y = []
        for x, y in zip(train_set[0], train_set[1]):
            #if len(x) < maxlen:
            new_train_set_x.append(x)
            new_train_set_y.append(y)
        train_set = (new_train_set_x, new_train_set_y)
        del new_train_set_x, new_train_set_y'''

    def remove_unk(x):
        return [[1 if w >= n_words else w for w in sen] for sen in x]

    '''test_set_x0, test_set_x1, test_set_x2, test_set_x3, test_set_y0, test_set_y1 = test_set
    valid_set_x0, valid_set_x1, valid_set_x2, valid_set_x3, valid_set_y0, valid_set_y1 = valid_set
    train_set_x0, train_set_x1, train_set_x2, train_set_x3, train_set_y0, train_set_y1 = train_set'''
    '''train_set_x1 = remove_unk(train_set_x1)
    train_set_x2 = remove_unk(train_set_x2)
    train_set_x3 = remove_unk(train_set_x3)
    valid_set_x1 = remove_unk(valid_set_x1)
    valid_set_x2 = remove_unk(valid_set_x2)
    valid_set_x3 = remove_unk(valid_set_x3)
    test_set_x1 = remove_unk(test_set_x1)
    test_set_x2 = remove_unk(test_set_x2)
    test_set_x3 = remove_unk(test_set_x3)'''

    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    '''if sort_by_len:
        sorted_index = len_argsort(test_set_x)
        test_set_x0 = [test_set_x0[i] for i in sorted_index]
        test_set_x1 = [test_set_x1[i] for i in sorted_index]
        test_set_x2 = [test_set_x2[i] for i in sorted_index]
        test_set_x3 = [test_set_x3[i] for i in sorted_index]
        test_set_y0 = [test_set_y0[i] for i in sorted_index]
        test_set_y1 = [test_set_y1[i] for i in sorted_index]
        sorted_index = len_argsort(valid_set_x)
        valid_set_x0 = [valid_set_x0[i] for i in sorted_index]
        valid_set_x1 = [valid_set_x1[i] for i in sorted_index]
        valid_set_x2 = [valid_set_x2[i] for i in sorted_index]
        valid_set_x3 = [valid_set_x3[i] for i in sorted_index]
        valid_set_y0 = [valid_set_y0[i] for i in sorted_index]
        valid_set_y1 = [valid_set_y1[i] for i in sorted_index]
        sorted_index = len_argsort(train_set_x)
        train_set_x0 = [train_set_x0[i] for i in sorted_index]
        train_set_x1 = [train_set_x1[i] for i in sorted_index]
        train_set_x2 = [train_set_x2[i] for i in sorted_index]
        train_set_x3 = [train_set_x3[i] for i in sorted_index]
        train_set_y0 = [train_set_y0[i] for i in sorted_index]
        train_set_y1 = [train_set_y1[i] for i in sorted_index]
        train = (train_set_x0, train_set_x1, train_set_x2, train_set_x3, train_set_y0, train_set_y1)
        valid = (valid_set_x0, valid_set_x1, valid_set_x2, valid_set_x3, valid_set_y0, valid_set_y1)
        test = (test_set_x0, test_set_x1, test_set_x2, test_set_x3, test_set_y0, test_set_y1)'''

    return train_set, valid_set, test_set
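
# A usage sketch for load_data, assuming the pickle was gzip-compressed as
# sketched after produce_data (path and n_words are illustrative):
#
#   train, valid, test = load_data('../mydata.pkl.gz', n_words=10000)
#   tr_x0, tr_x1, tr_x2, tr_y0, tr_y1, tr_y_one = train
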
def read_embedding_file_to_get_matrix(filename, savefilename):
    '''Read a plain-text embedding file (one space-separated vector per line),
    pickle the resulting matrix to savefilename, and return it.'''
    file_obj = open(filename, "r")
    embeddings = []
    for tmp_line in file_obj:
        one_embedding = numpy.loadtxt(StringIO(tmp_line))  # one row of the matrix
        embeddings.append(one_embedding)
    matrix = numpy.asarray(embeddings)
    file_obj.close()
    f = open(savefilename, 'wb')
    pickle.dump(matrix, f)
    f.close()
    return matrix
def read_gz_file(filename):
    f = gzip.open(filename, 'rb')
    data = pickle.load(f)
    f.close()
    return data
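
# A usage sketch for read_gz_file: it returns whatever object was pickled into
# the gzip file, e.g. the dataset tuple produced above (path is illustrative):
#
#   train_set, valid_set, test_set = read_gz_file('../mydata.pkl.gz')
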
if __name__ == '__main__':
    #d = read_data_xy("../train_idx.txt")
    #print(type(d[3]))
    ##############################################################
    readfilename = ["../train_idx.txt",
                    "../valid_idx.txt",
                    "../test_idx.txt"]
    savefilename = '../mydata.pkl'
    produce_data(readfilename, savefilename)
    '''m_arr = read_embedding_file_to_get_matrix("../word_embed.txt",
                                                 "../../matrix.pkl")
    print(m_arr.shape)'''