-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataset.py
134 lines (103 loc) · 4.21 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# !/usr/bin/env python
# coding:utf-8
# Author: Qiao Ruijie
import json
import re
# A text is skipped entirely when a run of four or more digits sits directly
# after "[" or directly before "]".
_LONG_BRACKET_NUMBER_RE = re.compile(r'\[\d{4,}|\d{4,}\]')

# Markup stripped from the texts that are kept: digits directly before "]",
# up to three digits directly after "[", the literal markers
# dis/sym/bod/ite, and the square brackets themselves.
_MARKUP_RE = re.compile(
    r'(\d+(?=\]))|((?<=\[)\d{,3})|(dis)|(sym)|(bod)|(ite)|[\[\]]')

# Features read from every symptom, in processing order, paired with the tag
# suffix they produce.  'self' spans are validated but never tagged.
_FEATURE_TAGS = (
    ('self', None),
    ('subject', 'SUB'),
    ('body', 'BOD'),
    ('decorate', 'DEC'),
    ('frequency', 'FRE'),
    ('item', 'ITE'),
    ('disease', 'DIS'),
)


def _load_records(content):
    """Parse the raw file content into a list of record dicts.

    The content is first parsed as ordinary JSON.  If that fails, or the
    top-level value is not a list, fall back to the legacy strategy of
    splitting on the literal ``text`` and re-assembling one JSON object per
    chunk, so files that only the old parser could read keep working.
    """
    try:
        parsed = json.loads(content)
    except ValueError:
        parsed = None
    if isinstance(parsed, list):
        return parsed

    # Legacy path: everything before the first 'text' is discarded, every
    # following chunk is trimmed of the separator that led into the next
    # record and turned back into a standalone JSON object.
    chunks = content.split('text')
    records = []
    for chunk in chunks[1:-1]:
        chunk = chunk.rstrip(',\n {\n "').lstrip('"')
        records.append(json.loads('{"text"' + chunk))
    records.append(json.loads('{"text' + chunks[-1].rstrip('\n]')))
    return records


def _tag_sentence(sent, sym_list):
    """Return the per-character BIO tags for *sent*, or None if malformed.

    A sample is malformed when some feature lists more (start, end) pairs in
    'pos' than it has whitespace-separated tokens in 'val'.
    """
    tag = ['O'] * len(sent)
    for symptom in sym_list.values():
        # Kept as an equality test (not truthiness) so that only True/1
        # count as "has a problem", exactly as before.
        if symptom['has_problem'] == True:  # noqa: E712
            continue
        for feature, abbr in _FEATURE_TAGS:
            pos = symptom[feature]['pos']
            val = symptom[feature]['val'].split()
            for n in range(len(pos) // 2):
                if n >= len(val):
                    # More spans than values: the annotation is unusable.
                    return None
                sub_val = val[n]
                start, end = pos[2 * n], pos[2 * n + 1]
                if end - start + 1 != len(sub_val):
                    continue  # span length disagrees with the value
                if sent[start:end + 1] != sub_val:
                    continue  # span does not point at the value in the text
                if abbr is None:
                    continue  # 'self' spans are checked but not tagged
                tag[start] = 'B-' + abbr
                for t in range(start + 1, end + 1):
                    tag[t] = 'I-' + abbr
    return tag


def data_trans(input_file):
    """Load annotated records and derive cleaned texts and BIO tag sequences.

    Each record is expected to be a dict with a 'text' string and a
    'symptom' dict; every symptom without ``has_problem`` set must provide,
    for each feature in ``_FEATURE_TAGS``, a dict with 'pos' (flat list of
    inclusive start/end indices) and 'val' (whitespace-separated values).

    Records are dropped when their text matches ``_LONG_BRACKET_NUMBER_RE``
    or when a feature has more position pairs than values.  The three
    returned lists are built in lockstep, so index ``i`` always refers to
    the same sample in all of them.  (Previously malformed samples were
    popped from the lists while iterating over them, which skipped the
    following sample and left ``tag_list`` misaligned.)

    Args:
        input_file: Path of the annotation file to read.

    Returns:
        A tuple ``(new_list, text_list, tag_list)``: the kept original
        records, their cleaned texts (markup and spaces removed), and one
        list of 'O' / 'B-XXX' / 'I-XXX' tags per cleaned text, one tag per
        character.

    Raises:
        KeyError: If a kept record lacks one of the expected keys.
        ValueError: If the content cannot be parsed by either strategy.
    """
    with open(input_file, 'r') as f:
        content = f.read()

    new_list = []
    text_list = []
    tag_list = []
    for record in _load_records(content):
        raw_text = record['text']
        # Drop texts where long digit runs touch the brackets.
        if _LONG_BRACKET_NUMBER_RE.search(raw_text):
            continue
        sent = _MARKUP_RE.sub('', raw_text).replace(' ', '')
        tag = _tag_sentence(sent, record['symptom'])
        if tag is None:
            continue
        new_list.append(record)
        text_list.append(sent)
        tag_list.append(tag)
    return new_list, text_list, tag_list
def spo_generate(new_list, text_list):
    """Build one ``{'text', 'spo_list'}`` dict per sample.

    For every symptom of a sample that is not flagged ``has_problem``, a
    triple ``[subject, relation, value]`` is emitted for each feature whose
    'val' is a non-empty string, where ``subject`` is the symptom's
    ``['self']['val']`` and ``relation`` is one of
    'SUB', 'BOD', 'DEC', 'FRE', 'ITE', 'DIS' (in that order).

    Symptoms flagged ``has_problem`` are skipped individually; they no
    longer cut off the remaining symptoms of the same sample, which matches
    how ``has_problem`` is treated when the tag sequences are built.

    Args:
        new_list: Records, each a dict with a 'symptom' dict whose values
            carry 'has_problem' and, when not flagged, the feature dicts
            'self', 'subject', 'body', 'decorate', 'frequency', 'item' and
            'disease', each with a 'val' string.
        text_list: Cleaned texts, index-aligned with ``new_list``.

    Returns:
        A list of dicts with keys 'text' (from ``text_list``) and
        'spo_list' (the triples collected for that sample).
    """
    relations = (
        ('subject', 'SUB'),
        ('body', 'BOD'),
        ('decorate', 'DEC'),
        ('frequency', 'FRE'),
        ('item', 'ITE'),
        ('disease', 'DIS'),
    )
    data_list = []
    for idx, value in enumerate(new_list):
        text = text_list[idx]
        spo_list = []
        for symptom in value['symptom'].values():
            # Equality test kept on purpose: only True/1 mark a problem.
            if symptom['has_problem'] == True:  # noqa: E712
                continue
            subject = symptom['self']['val']
            for feature, relation in relations:
                obj = symptom[feature]['val']
                if obj != '':
                    spo_list.append([subject, relation, obj])
        data_list.append({'text': text, 'spo_list': spo_list})
    return data_list