process_gdelt.py
import os
import errno
from pathlib import Path
import pickle

import numpy as np
from collections import defaultdict

DATA_PATH = 'data/'


def prepare_dataset(path, name):
    """
    Given a path to a folder containing the tab-separated files
    train, valid, test in the format:
        (lhs)\t(rel)\t(rhs)\t(timestamp)\n
    maps each entity, relation and timestamp to a unique id, creates a
    corresponding folder under DATA_PATH with the mapped train/valid/test
    files, and also creates to_skip lists (lhs / rhs) for filtered metrics
    as well as ent_id / rel_id / ts_id files for analysis.
    """
    files = ['train', 'valid', 'test']
    entities, relations, timestamps = set(), set(), set()
    for f in files:
        file_path = os.path.join(path, f)
        to_read = open(file_path, 'r')
        for line in to_read.readlines():
            lhs, rel, rhs, timestamp = line.strip().split('\t')
            entities.add(lhs)
            entities.add(rhs)
            relations.add(rel)
            timestamps.add(timestamp)
        to_read.close()

    entities_to_id = {x: i for (i, x) in enumerate(sorted(entities))}
    relations_to_id = {x: i for (i, x) in enumerate(sorted(relations))}
    timestamps_to_id = {x: i for (i, x) in enumerate(sorted(timestamps))}
    print("{} entities, {} relations over {} timestamps".format(
        len(entities), len(relations), len(timestamps)))
    n_relations = len(relations)
    n_entities = len(entities)

    os.makedirs(os.path.join(DATA_PATH, name))

    # write ent to id / rel to id / ts to id
    for (dic, f) in zip(
            [entities_to_id, relations_to_id, timestamps_to_id],
            ['ent_id', 'rel_id', 'ts_id']):
        ff = open(os.path.join(DATA_PATH, name, f), 'w+')
        for (x, i) in dic.items():
            ff.write("{}\t{}\n".format(x, i))
        ff.close()

    # map train/valid/test to the ids
    for f in files:
        file_path = os.path.join(path, f)
        to_read = open(file_path, 'r')
        examples = []
        for line in to_read.readlines():
            lhs, rel, rhs, ts = line.strip().split('\t')
            try:
                examples.append([
                    entities_to_id[lhs], relations_to_id[rel],
                    entities_to_id[rhs], timestamps_to_id[ts]
                ])
            except KeyError:
                # dict lookups raise KeyError (not ValueError) on unseen keys
                continue
        to_read.close()
        out = open(Path(DATA_PATH) / name / (f + '.pickle'), 'wb')
        pickle.dump(np.array(examples).astype('uint64'), out)
        out.close()
print("creating filtering lists")
# create filtering files
to_skip = {'lhs': defaultdict(set), 'rhs': defaultdict(set)}
for f in files:
examples = pickle.load(open(Path(DATA_PATH) / name / (f + '.pickle'), 'rb'))
for lhs, rel, rhs, ts in examples:
to_skip['lhs'][(rhs, rel + n_relations, ts)].add(lhs) # reciprocals
to_skip['rhs'][(lhs, rel, ts)].add(rhs)
to_skip_final = {'lhs': {}, 'rhs': {}}
for kk, skip in to_skip.items():
for k, v in skip.items():
to_skip_final[kk][k] = sorted(list(v))
out = open(Path(DATA_PATH) / name / 'to_skip.pickle', 'wb')
pickle.dump(to_skip_final, out)
out.close()
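    # Note (not part of the original script): filtered ranking metrics are
    # presumably computed by discarding the candidates listed in to_skip for
    # each (entity, relation, timestamp) query; the rel + n_relations keys
    # cover queries phrased with reciprocal relations.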

    # entity frequencies over the training triples, normalised into distributions
    examples = pickle.load(open(Path(DATA_PATH) / name / 'train.pickle', 'rb'))
    counters = {
        'lhs': np.zeros(n_entities),
        'rhs': np.zeros(n_entities),
        'both': np.zeros(n_entities)
    }
    for lhs, rel, rhs, _ts in examples:
        counters['lhs'][lhs] += 1
        counters['rhs'][rhs] += 1
        counters['both'][lhs] += 1
        counters['both'][rhs] += 1
    for k, v in counters.items():
        counters[k] = v / np.sum(v)
    out = open(Path(DATA_PATH) / name / 'probas.pickle', 'wb')
    pickle.dump(counters, out)
    out.close()
if __name__ == "__main__":
datasets = ['GDELT']
for d in datasets:
print("Preparing dataset {}".format(d))
try:
prepare_dataset(
os.path.join(
os.path.dirname(os.path.realpath(__file__)), 'src_data', d
),
d
)
except OSError as e:
if e.errno == errno.EEXIST:
print(e)
print("File exists. skipping...")
else:
raise
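
# A minimal usage sketch (the directory and file names below follow this
# script; the dataset contents are assumptions): place raw tab-separated
# train/valid/test files under src_data/GDELT/ next to this script, then run
#
#     python process_gdelt.py
#
# which writes ent_id / rel_id / ts_id, the train/valid/test pickles,
# to_skip.pickle and probas.pickle under data/GDELT/. The pickles can be
# inspected with, e.g.:
#
#     import pickle
#     from pathlib import Path
#
#     train = pickle.load(open(Path('data/GDELT/train.pickle'), 'rb'))
#     print(train.shape)  # (n_train, 4): lhs, rel, rhs, timestamp ids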