-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.py
executable file
·150 lines (134 loc) · 4.83 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 1 15:25:28 2020
@author: antonio
"""
import time
import os
import pandas as pd
def parse_ann(datapath, labels_to_ignore=None, with_notes=False):
    '''
    DESCRIPTION: parse information in every .ann file found under a folder.

    The folder is walked recursively and each file whose name ends in
    '.ann' is handed to parse_one_ann with ignore_related=False. The path
    and the elapsed time are printed to standard output.

    Parameters
    ----------
    datapath: str
        Route to the folder where the files are.
    labels_to_ignore: list, optional
        Brat labels I will NOT parse. None (the default) ignores no label.
    with_notes: bool
        Whether to store Brat AnnotatorNotes (#) or not.

    Returns
    -------
    df: pandas DataFrame
        It has information from ANN files. Columns: 'annotator', 'bunch',
        'filename', 'mark', 'label', 'offset1', 'offset2', 'span', plus
        'code' only when with_notes is true.
    '''
    # A fresh list per call: a mutable default would be shared between calls.
    if labels_to_ignore is None:
        labels_to_ignore = []

    start = time.time()
    info = []
    filenames = []
    print(datapath)

    ## Iterate over the files and parse them
    for root, _dirs, files in os.walk(datapath):
        for filename in files:
            # Check the real extension; comparing only the last three
            # characters would also accept names such as 'plann'.
            if not filename.endswith('.ann'):
                continue
            info, _ = parse_one_ann(info, filenames, root, filename,
                                    labels_to_ignore, ignore_related=False,
                                    with_notes=with_notes)

    cols = ['annotator', 'bunch', 'filename', 'mark', 'label', 'offset1',
            'offset2', 'span']
    if with_notes:
        cols.append('code')
    df = pd.DataFrame(info, columns=cols)

    end = time.time()
    print("Elapsed time: " + str(round(end-start, 2)) + 's')
    return df
def _report_tab_count(message, ann_path, line, fields):
    '''Print a diagnostic for an entity line with an unexpected field count.'''
    print(message)
    print(ann_path)
    print(line)
    print(fields)


def parse_one_ann(ann_info, filenames, root, filename, labels_to_ignore,
                  ignore_related=True, with_notes=False):
    '''
    Parse one ANN file and append one row per entity (T line) to ann_info.

    Parameters
    ----------
    ann_info: list
        Accumulator of parsed rows. It is extended in place.
    filenames: list
        Accumulator of parsed file names. filename is appended to it.
    root: str
        Folder that contains the file. Its last path component is stored
        as 'bunch' and the last character of its parent folder name as
        'annotator' (empty string when there is no parent component).
    filename: str
        Name of the .ann file inside root.
    labels_to_ignore: list
        Entity labels that are skipped.
    ignore_related: bool
        Whether to skip entities that are arguments of a relation (R line).
    with_notes: bool
        Whether to add the text of the note (# line) that targets each
        entity as an extra trailing field ('' when there is none).

    Returns
    -------
    ann_info: list
        The same list received, with rows [annotator, bunch, filename,
        mark, label, offset1, offset2, span] (plus code if with_notes).
    filenames: list
        The same list received, with filename appended.
    '''
    ann_path = os.path.join(root, filename)
    # Close the handle deterministically and read explicitly as UTF-8 (the
    # encoding brat uses for its files) instead of the platform default.
    with open(ann_path, encoding='utf-8') as ann_file:
        lines = ann_file.readlines()
    filenames.append(filename)

    # Get annotator and bunch. normpath drops trailing/duplicated separators
    # and os.path handles the platform separator, so the last two components
    # are found for any spelling of the same folder.
    norm_root = os.path.normpath(root)
    bunch = os.path.basename(norm_root)
    annotator = os.path.basename(os.path.dirname(norm_root))[-1:]

    # Parse Relations (R): collect the marks used as relation arguments.
    related_marks = set()
    if ignore_related:
        for line in lines:
            if not line.startswith('R'):
                continue
            rel_args = line.split('\t')[1].split(' ')
            # strip() removes the newline left on the last argument when the
            # line has no trailing tab.
            related_marks.add(rel_args[1].split(':')[1].strip())
            related_marks.add(rel_args[2].split(':')[1].strip())

    # Parse Notes (#): map the annotated mark to the note text.
    mark2code = {}
    if with_notes:
        for line in lines:
            if not line.startswith('#'):
                continue
            line_split = line.split('\t')
            mark2code[line_split[1].split(' ')[1]] = line_split[2].strip()

    # Parse Entities (T)
    for line in lines:
        if not line.startswith('T'):
            continue
        splitted = line.split('\t')
        if len(splitted) < 3:
            _report_tab_count('Line with less than 3 tabular splits:',
                              ann_path, line, splitted)
            # Without the third field there is no span to store: skip the
            # line after reporting it instead of failing on splitted[2].
            continue
        if len(splitted) > 3:
            _report_tab_count('Line with more than 3 tabular splits:',
                              ann_path, line, splitted)

        mark = splitted[0]
        if mark in related_marks:  # Ignore related Entities
            continue
        label_offset = splitted[1].split(' ')
        label = label_offset[0]
        if label in labels_to_ignore:  # Ignore labels
            continue
        offset = label_offset[1:]
        span = splitted[2].strip('\n')

        # First and last offset tokens: for a discontinuous annotation this
        # keeps the start of the first fragment and the end of the last one.
        row = [annotator, bunch, filename, mark, label,
               offset[0], offset[-1], span]
        if with_notes:
            row.append(mark2code.get(mark, ''))  # Add Notes (code) information
        ann_info.append(row)

    return ann_info, filenames
def parse_tsv(in_path):
    '''
    Get information from ann that was already stored in a TSV file.

    The file is read without a header row and the column names are assigned
    according to the number of columns found.

    Parameters
    ----------
    in_path: string
        path to a headerless TSV file with one of these layouts:
        9 columns: ['annotator', 'bunch', 'filename', 'mark', 'label',
                    'offset1', 'offset2', 'span', 'code']
        8 columns: the same as above without 'code'
        3 columns: ['code', 'span', 'label'] (in this order)

    Returns
    -------
    df_annot: pandas DataFrame
        The content of the file with the column names listed above. For the
        3 column layout an extra 'filename' column filled with 'xx' is added.

    Raises
    ------
    ValueError
        If the file does not have 3, 8 or 9 columns.
    '''
    full_layout = ['annotator', 'bunch', 'filename', 'mark',
                   'label', 'offset1', 'offset2', 'span', 'code']

    df_annot = pd.read_csv(in_path, sep='\t', header=None)
    n_cols = len(df_annot.columns)

    if n_cols == 9:
        df_annot.columns = full_layout
    elif n_cols == 8:
        df_annot.columns = full_layout[:-1]
    elif n_cols == 3:
        df_annot.columns = ['code', 'span', 'label']
        # This layout carries no file information: use a placeholder name.
        df_annot['filename'] = 'xx'
    else:
        # Fail with an explicit message instead of the length-mismatch error
        # pandas raises when the 3 column names are forced onto the frame.
        raise ValueError('Unsupported TSV layout in ' + str(in_path) +
                         ': expected 3, 8 or 9 columns, found ' +
                         str(n_cols))
    return df_annot