# preprocess_ontonotes.py (forked from thunlp/PL-Marker)
import collections
import json
import re

import conll  # local helper module providing BEGIN_DOCUMENT_REGEX and get_doc_key

# Running tally of entity types seen across all splits.
counter = collections.Counter()
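# Input: CoNLL-2012 skeleton files (*.english.v4_gold_conll), one token per
# line with whitespace-separated columns; this script assumes column 3 holds
# the word form and column 10 the bracketed named-entity label (e.g. "(GPE)",
# "(PERSON*", "*", "*)"). Output: one JSON object per document with keys
# 'sentences' (a list of token lists), 'ner' (per-sentence lists of
# (start, end, type) spans using document-level token indices), and 'doc_key'.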
def normalize_word(word, language='english'):
    # Arabic OntoNotes tokens carry a "#"-delimited suffix; strip it if present.
    if language == "arabic":
        pos = word.find("#")
        if pos != -1:
            word = word[:pos]
    # "/." and "/?" are escaped sentence-final punctuation marks.
    if word == "/." or word == "/?":
        return word[1:]
    return word
def get_original_token(token):
    # Map PTB bracket escapes back to their original characters,
    # e.g. "-LRB-" -> "(".
    escape_to_original = {
        "-LRB-": "(",
        "-RRB-": ")",
        "-LSB-": "[",
        "-RSB-": "]",
        "-LCB-": "{",
        "-RCB-": "}",
    }
    return escape_to_original.get(token, token)
def process(prefix):
    input_path = prefix + '.english.v4_gold_conll'
    # First pass: split the file into (doc_key, lines) pairs on the
    # "#begin document" / "#end document" markers.
    documents = []
    with open(input_path, "r") as input_file:
        for line in input_file:
            begin_document_match = re.match(conll.BEGIN_DOCUMENT_REGEX, line)
            if begin_document_match:
                doc_key = conll.get_doc_key(begin_document_match.group(1),
                                            begin_document_match.group(2))
                documents.append((doc_key, []))
            elif line.startswith("#end document"):
                continue
            else:
                documents[-1][1].append(line)

    # Second pass: decode tokens and NE spans per document, one JSON per line.
    with open(prefix + '.jsonl', 'w') as output_w:
        for doc_key, lines in documents:
            sents = []  # sentences in this document (lists of tokens)
            ners = []   # per-sentence (start, end, type) spans using
                        # document-level token indices
            sent = []
            ner = []
            word_idx = 0         # document-level token index
            last_word_idx = -1   # start of the currently open entity, or -1
            ner_type = None
            for line in lines:
                tok_info = line.strip().split()
                if len(tok_info) == 0:
                    # Blank line = sentence boundary; no entity may stay open.
                    assert last_word_idx == -1
                    if len(sent) > 0:
                        sents.append(sent)
                        ners.append(ner)
                        sent = []
                        ner = []
                    continue
                word = normalize_word(get_original_token(tok_info[3]))
                label = tok_info[10]  # bracketed named-entity column
                if label != "*":
                    if label[0] == "(":
                        ner_type = label[1:-1]
                        if label[-1] == ')':
                            # Single-token entity, e.g. "(GPE)".
                            ner.append((word_idx, word_idx, ner_type))
                            counter[ner_type] += 1
                        else:
                            # Entity opens here, e.g. "(PERSON*".
                            last_word_idx = word_idx
                    elif label == '*)':
                        # Entity closes here.
                        ner.append((last_word_idx, word_idx, ner_type))
                        counter[ner_type] += 1
                        last_word_idx = -1
                    else:
                        assert False, 'unexpected NE label: ' + label
                sent.append(word)
                word_idx += 1
            # Documents end with a blank line, so the buffer must be empty.
            assert len(sent) == 0
            item = {'sentences': sents,
                    'ner': ners,
                    'doc_key': doc_key}
            output_w.write(json.dumps(item) + '\n')
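# A minimal, self-contained sketch (hypothetical labels, not read from any
# corpus file) of how the bracketed NE column decodes into (start, end, type)
# spans; it mirrors the state machine inside process().
def _demo_ne_decoding():
    labels = ["(PERSON)", "*", "(GPE*", "*", "*)", "*"]
    spans, start, ner_type = [], -1, None
    for idx, label in enumerate(labels):
        if label == "*":
            continue
        if label[0] == "(":
            ner_type = label[1:-1]
            if label[-1] == ")":
                spans.append((idx, idx, ner_type))  # single-token span
            else:
                start = idx                         # span opens
        else:
            assert label == "*)"
            spans.append((start, idx, ner_type))    # span closes
    assert spans == [(0, 0, "PERSON"), (2, 4, "GPE")]
    return spans

# Illustrative output record (invented sentence, not from the corpus): a
# one-sentence document "John visited Paris ." with PERSON and GPE spans
# serializes as
#   {"sentences": [["John", "visited", "Paris", "."]],
#    "ner": [[[0, 0, "PERSON"], [2, 2, "GPE"]]],
#    "doc_key": "..."}
# (json.dumps writes the span tuples as lists).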
if __name__ == '__main__':
    data_dir = 'ontonotes/'
    process(data_dir + 'dev')
    process(data_dir + 'test')
    process(data_dir + 'train')
    # Report how many entities of each type were found across all splits.
    print(counter)
    print(sorted(counter.keys()))
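# Usage sketch: running this file expects gold CoNLL-2012 files at
# ontonotes/dev.english.v4_gold_conll, ontonotes/test.english.v4_gold_conll
# and ontonotes/train.english.v4_gold_conll (per data_dir above), and writes
# dev.jsonl / test.jsonl / train.jsonl next to them; the final prints give
# per-type entity counts and the sorted label inventory.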