# Natural Language Toolkit: ISO GrAF Corpus Reader
#
# Copyright (C) 2001-2011 NLTK Project
# Author: Stephen Matysik <[email protected]>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
A reader for corpora that consist of documents in
the ISO GrAF format.
"""
from __future__ import print_function

import os.path

from .util import *
from .api import *


class MascCorpusReader(CorpusReader):
    """
    Reader for corpora that consist of documents from the MASC collection.
    Paragraphs, sentences, words, nouns, verbs, and other annotations
    are contained within MASC.
    """
    # The corpus view class used by this reader
    CorpusView = StreamBackedCorpusView

    def __init__(self, root, fileids, encoding):
        """
        Construct a new MASC corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = MascCorpusReader(root, r'(?!\.).*\.txt',
            ...                           encoding='utf-8')

        @param root: The root directory for this corpus.
        @param fileids: A list of fileids or a regexp specifying the
            fileids in this corpus.
        @param encoding: The encoding used for the text files in the corpus.
        """
        self._cur_file = ""
        self._cur_sents_file = ""
        self._cur_paras_file = ""
        self._cur_offsets = []
        self._cur_sents_offsets = []
        self._cur_paras_offsets = []
        self._char_to_byte = {}
        self._byte_to_char = {}
        self._file_end = 0

        # If the corpus is distributed as a zip archive, unpack it next to
        # the archive the first time the reader is constructed.
        if root.endswith('.zip'):
            unzipped = self._get_basename(root)
            if not os.path.exists(unzipped):
                import zipfile
                zfile = zipfile.ZipFile(root)
                base = os.path.dirname(unzipped)
                for name in zfile.namelist():
                    dirname, filename = os.path.split(name)
                    dirname = os.path.join(base, dirname)
                    if not os.path.exists(dirname):
                        os.mkdir(dirname)
                    try:
                        with open(os.path.join(dirname, filename), 'w') as f:
                            f.write(zfile.read(name))
                    except IOError:
                        pass
            root = unzipped

        CorpusReader.__init__(self, root, fileids, encoding)
        # Only the primary text documents (under 'data/') are exposed as
        # fileids; the GrAF annotation files are read indirectly.
        self._fileids = [f for f in self._fileids if f.startswith('data/')]

    def raw(self, fileids=None):
        """
        @return: the given file(s) as a single string.
        @rtype: C{str}
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, basestring):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])

    def words(self, fileids=None):
        """
        @return: the given file(s) as a list of words
            and punctuation symbols.
        @rtype: C{list} of C{str}
        """
        return concat([self.CorpusView(fileid, self._read_word_block,
                                       encoding='utf-8')
                       for fileid in self.abspaths(fileids)])

    def sents(self, fileids=None):
        """
        @return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        @rtype: C{list} of (C{list} of C{str})
        """
        return concat([self.CorpusView(fileid, self._read_sent_block,
                                       encoding='utf-8')
                       for fileid in self.abspaths(fileids)])

    def paras(self, fileids=None):
        """
        @return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        @rtype: C{list} of (C{list} of (C{list} of C{str}))
        """
        return concat([self.CorpusView(fileid, self._read_para_block,
                                       encoding='utf-8')
                       for fileid in self.abspaths(fileids)])

    def nouns(self, fileids=None):
        """
        @return: the given file(s) as a list of noun chunks.
        @rtype: C{list} of C{str}
        """
        return concat([self.CorpusView(fileid, self._read_noun_block,
                                       encoding='utf-8')
                       for fileid in self.abspaths(fileids)])

    def verbs(self, fileids=None):
        """
        @return: the given file(s) as a list of verb chunks.
        @rtype: C{list} of C{str}
        """
        return concat([self.CorpusView(fileid, self._read_verb_block,
                                       encoding='utf-8')
                       for fileid in self.abspaths(fileids)])

    def persons(self, fileids=None):
        """
        @return: the given file(s) as a list of person named entities.
        @rtype: C{list} of C{str}
        """
        return concat([self.CorpusView(fileid, self._read_person_block,
                                       encoding='utf-8')
                       for fileid in self.abspaths(fileids)])

    def _get_basename(self, file_):
        """
        @type file_: C{str}
        @param file_: full filename
        @return: the specified filename with its four-character
            extension (e.g. '.txt' or '.zip') stripped
        @rtype: C{str}
        """
        return file_[:-4]

    def _get_disc(self, stream):
        """
        Using the specified file stream, this method creates two
        discrepancy mappings, both as dictionaries:

        1. self._char_to_byte uses key = character number,
           entry = byte number
        2. self._byte_to_char uses key = byte number,
           entry = character number

        @type stream: StreamBackedCorpusView
        @param stream: file stream
        """
        self._char_to_byte = {}
        self._byte_to_char = {}
        stream.read()
        file_end = stream.tell()
        self._file_end = file_end
        stream.seek(0)
        # Walk the file one character at a time; wherever the character
        # index and the byte offset disagree (i.e. after any multi-byte
        # character), record the mapping in both directions.
        for i in range(file_end + 1):
            if i != stream.tell():
                self._char_to_byte[i] = stream.tell()
                self._byte_to_char[stream.tell()] = i
            stream.read(1)
        stream.seek(0)
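
    # Worked example of the discrepancy mapping (illustrative only): in a
    # UTF-8 file whose decoded text is u'a\xe9b', the character at index 2
    # ('b') starts at byte offset 3, because u'\xe9' occupies two bytes on
    # disk.  _get_disc would record _char_to_byte[2] == 3 and
    # _byte_to_char[3] == 2, and those mappings are what let the block
    # readers translate annotation character offsets into stream.read()
    # byte counts.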

    def _get_subset(self, offsets, offsets_start, offsets_end):
        """
        @type offsets: C{list} of C{int} pairs
        @param offsets: list of all offsets
        @type offsets_start: C{int}
        @param offsets_start: start of the requested set of offsets
        @type offsets_end: C{int}
        @param offsets_end: end of the requested set of offsets
        @return: a list of all offsets between offsets_start and offsets_end
        @rtype: C{list} of C{int} pairs
        """
        subset = []
        for i in offsets:
            # Keep non-empty annotations that start inside the requested
            # window, including those that run past its end.
            if (i[0] >= offsets_start and i[1] <= offsets_end and
                    i[0] != i[1]):
                subset.append(i)
            elif (i[0] >= offsets_start and i[1] > offsets_end and
                    i[0] != i[1] and i[0] <= offsets_end):
                subset.append(i)
        return subset

    def _get_read_size(self, subset, char_to_byte, slimit, offset_end):
        """
        @return: the byte size of text that should be read
            next from the file stream
        @rtype: C{int}
        """
        if len(subset) != 0:
            # Read up to the end of the last annotation in the window.
            last = subset[-1][1]
            last = char_to_byte.get(last, last)
            read_size = last - slimit
        else:
            # No annotations in the window: read the whole window.
            elimit = char_to_byte.get(offset_end, offset_end)
            read_size = elimit - slimit
        return read_size

    def _get_block(self, subset, text, offsets_start):
        """
        Retrieve the annotated text; the annotations are contained in subset.

        @type subset: C{list}
        @param subset: list of annotation offset pairs
        @type text: C{str}
        @param text: text read from the text stream
        @type offsets_start: C{int}
        @param offsets_start: integer used to correct for the discrepancy
            between offsets and text character numbers
        @return: list of annotated text
        @rtype: C{list} of C{str}
        """
        block = []
        for s in subset:
            start = s[0] - offsets_start
            end = s[1] - offsets_start
            chunk = text[start:end].encode('utf-8')
            chunk = self._remove_ws(chunk)
            block.append(chunk)
        return block

    def _read_block(self, stream, file_ending, label):
        """
        Generic method for retrieving annotated text from a file stream.

        @type stream: SeekableUnicodeStreamReader
        @param stream: file stream from StreamBackedCorpusView in
            corpus/reader/util.py
        @type file_ending: C{str}
        @param file_ending: suffix of the xml annotation file containing
            the annotation offsets
        @type label: C{str}
        @param label: label of the requested annotation
        @return: list of annotated text from a block of the file stream
        @rtype: C{list} of C{str}
        """
        file_ = self._get_basename(stream.name) + file_ending
        if file_ != self._cur_file:
            # New document: parse its annotation file and rebuild the
            # character/byte discrepancy maps.
            self._cur_file = file_
            offsets = self._get_annotation(file_, label)
            self._cur_offsets = offsets
            self._get_disc(stream)
            char_to_byte = self._char_to_byte
            byte_to_char = self._byte_to_char
        else:
            # Same document as the previous block: reuse the cached offsets.
            offsets = self._cur_offsets
            char_to_byte = self._char_to_byte
            byte_to_char = self._byte_to_char

        slimit = stream.tell()
        offset_slimit = byte_to_char.get(slimit, slimit)
        # Process the text in windows of 500 characters.
        offset_elimit = offset_slimit + 500
        subset = self._get_subset(offsets, offset_slimit, offset_elimit)
        read_size = self._get_read_size(subset, char_to_byte, slimit,
                                        offset_elimit)
        text = stream.read(read_size)
        block = self._get_block(subset, text, offset_slimit)
        return block

    def _read_person_block(self, stream):
        """Calls _read_block to retrieve 'person' tagged text."""
        return self._read_block(stream, '-ne.xml', 'person')

    def _read_verb_block(self, stream):
        """Calls _read_block to retrieve 'vchunk' (verb chunk)
        tagged text."""
        return self._read_block(stream, '-vc.xml', 'vchunk')

    def _read_noun_block(self, stream):
        """Calls _read_block to retrieve 'nchunk' (noun chunk)
        tagged text."""
        return self._read_block(stream, '-nc.xml', 'nchunk')

    def _read_word_block(self, stream):
        """Calls _read_block to retrieve 'tok' (word) tagged text."""
        return self._read_block(stream, '-penn.xml', 'tok')

    def _read_sent_block(self, stream):
        """
        Method for retrieving sentence annotations from text, and
        the tok annotations within each sentence.

        @type stream: SeekableUnicodeStreamReader
        @param stream: file stream from StreamBackedCorpusView
            in corpus/reader/util.py
        @return: list of sentences, each of which is a list of words,
            from a block of the file stream
        @rtype: C{list} of (C{list} of C{str})
        """
        file_ = self._get_basename(stream.name) + '-s.xml'
        words_file = self._get_basename(stream.name) + '-penn.xml'
        if not file_ == self._cur_sents_file:
            self._cur_sents_file = file_
            self._cur_words_file = words_file
            offsets = self._get_annotation(file_, 's', 'head', 'q')
            words_offsets = self._get_annotation(words_file, 'tok')
            self._cur_sents_offsets = offsets
            self._cur_words_offsets = words_offsets
            self._get_disc(stream)
            char_to_byte = self._char_to_byte
            byte_to_char = self._byte_to_char
        else:
            offsets = self._cur_sents_offsets
            words_offsets = self._cur_words_offsets
            char_to_byte = self._char_to_byte
            byte_to_char = self._byte_to_char

        slimit = stream.tell()
        offset_slimit = byte_to_char.get(slimit, slimit)
        offset_elimit = offset_slimit + 500
        subset = self._get_subset(offsets, offset_slimit, offset_elimit)
        read_size = self._get_read_size(subset, char_to_byte,
                                        slimit, offset_elimit)
        text = stream.read(read_size)

        block = []
        for s in subset:
            # Collect the word (tok) annotations that fall inside each
            # sentence annotation in this block.
            sent = []
            for w in words_offsets:
                if w[0] >= s[0] and w[1] <= s[1] and w[0] != w[1]:
                    start = w[0] - offset_slimit
                    end = w[1] - offset_slimit
                    chunk = text[start:end].encode('utf-8')
                    chunk = self._remove_ws(chunk)
                    sent.append(chunk)
            block.append(sent)
        return block

    def _read_para_block(self, stream):
        """
        Method for retrieving paragraph annotations from text,
        and the sentence and word annotations within each paragraph.

        @type stream: SeekableUnicodeStreamReader
        @param stream: file stream from StreamBackedCorpusView
            in corpus/reader/util.py
        @return: list of paragraphs, each of which is a list of sentences,
            each of which is a list of words,
            from a block of the file stream
        @rtype: C{list} of (C{list} of (C{list} of C{str}))
        """
        file_ = self._get_basename(stream.name) + '-logical.xml'
        sents_file = self._get_basename(stream.name) + '-s.xml'
        words_file = self._get_basename(stream.name) + '-penn.xml'
        if not file_ == self._cur_paras_file:
            self._cur_paras_file = file_
            self._cur_sents_file = sents_file
            self._cur_words_file = words_file
            offsets = self._get_annotation(file_, 'p')
            sents_offsets = self._get_annotation(sents_file, 's', 'head', 'q')
            words_offsets = self._get_annotation(words_file, 'tok')
            self._cur_paras_offsets = offsets
            self._cur_sents_offsets = sents_offsets
            self._cur_words_offsets = words_offsets
            self._get_disc(stream)
            char_to_byte = self._char_to_byte
            byte_to_char = self._byte_to_char
        else:
            offsets = self._cur_paras_offsets
            sents_offsets = self._cur_sents_offsets
            words_offsets = self._cur_words_offsets
            char_to_byte = self._char_to_byte
            byte_to_char = self._byte_to_char

        # if len(offsets) == 0:
        #     print("No paragraph annotations for file " + file_)
        # TODO: skip the file (advance the file stream) if no tokens are found

        slimit = stream.tell()
        offset_slimit = byte_to_char.get(slimit, slimit)
        offset_elimit = offset_slimit + 500

        # Select the non-empty paragraph annotations that start inside the
        # current 500-character window; stop at the first one that runs
        # past the end of the window.
        subset = []
        for i in offsets:
            if i[0] >= offset_slimit and i[1] <= offset_elimit and i[0] != i[1]:
                subset.append(i)
            if i[0] >= offset_slimit and i[1] > offset_elimit and i[0] != i[1]:
                subset.append(i)
                break

        if len(subset) != 0:
            # Read up to the end of the last paragraph in the window.
            last = subset[-1][1]
            last = char_to_byte.get(last, last)
            read_size = last - slimit
            text = stream.read(read_size)
        else:
            if offset_elimit < self._file_end:
                elimit = char_to_byte.get(offset_elimit, offset_elimit)
                read_size = elimit - slimit
                text = stream.read(read_size)
            else:
                # Past the end of the file: consume the rest of the stream.
                stream.read()

        block = []
        for p in subset:
            para = []
            for s in sents_offsets:
                if s[0] >= p[0] and s[1] <= p[1] and s[0] != s[1]:
                    sent = []
                    for w in words_offsets:
                        if w[0] >= s[0] and w[1] <= s[1] and w[0] != w[1]:
                            start = w[0] - offset_slimit
                            end = w[1] - offset_slimit
                            chunk = text[start:end].encode('utf-8')
                            chunk = self._remove_ws(chunk)
                            sent.append(chunk)
                    para.append(sent)
            # If a paragraph has no internal sentence tokens, disregard it.
            if len(para) != 0:
                block.append(para)
        return block

    def _get_annotation(self, annfile, *labels):
        """
        Parses the given annfile and returns the offsets of all
        annotations whose type is one of 'labels'.

        @type annfile: C{str}
        @param annfile: xml file containing annotation offsets
        @type labels: C{seq} of C{str}
        @param labels: sequence of annotation type labels
        @return: list of annotation offsets
        @rtype: C{list} of C{pairs} of C{int}
        """
        try:
            import graf
        except ImportError:
            print("\ngraf-python is required to parse MASC xml files. "
                  "You can install it with\n\n    pip install graf-python\n\n"
                  "More information is available at "
                  "http://media.cidles.eu/poio/graf-python/\n")
            raise
        parser = graf.GraphParser()
        g = parser.parse(annfile)
        return sorted(pair for node in g.nodes
                      for pair in self._add_annotations(node, *labels))
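
    # For illustration only (hypothetical file name and offsets): a call
    # such as self._get_annotation('110cyl067-s.xml', 's', 'head', 'q')
    # would return a sorted list of character-offset pairs such as
    # [[0, 72], [73, 145], ...], one pair per sentence, heading, or quote
    # region found in that GrAF annotation file.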

    def _add_annotations(self, node, *labels):
        """
        Given a node and annotation labels, this method calls
        _get_offsets for each annotation contained by the node,
        and adds the offsets to the return list if the annotation's
        label is one of 'labels'.

        @type node: C{Node}
        @param node: a node in the GrAF graph
        @type labels: C{seq} of C{str}
        @param labels: sequence of annotation type labels
        @return: the annotation offsets of the matching annotations
            contained by the specified node
        @rtype: C{list} of C{pairs} of C{int}
        """
        node_offsets = []
        for a in node.annotations:
            if a.label in labels:
                pair = self._get_offsets(node)
                if pair is not None:
                    pair.sort()
                    node_offsets.append(pair)
        return node_offsets

    def _get_offsets(self, node):
        """
        @type node: C{Node}
        @param node: a node in the GrAF graph
        @return: the offsets contained by a given node
        @rtype: C{pair} of C{int}, or C{None}
        """
        if len(node.links) == 0 and node.out_edges != []:
            # No direct links: recurse into the nodes reachable via the
            # outgoing edges and combine their offsets.
            offsets = []
            edge_list = node.out_edges
            for edge in reversed(edge_list):
                temp_offsets = self._get_offsets(edge.to_node)
                if temp_offsets is not None:
                    offsets.extend(temp_offsets)
            if len(offsets) == 0:
                return None
            offsets.sort()
            return [offsets[0], offsets[-1]]
        elif len(node.links) != 0:
            # The node is linked to regions directly: gather every anchor
            # and return the smallest and largest.
            offsets = []
            for link in node.links:
                for region in link:
                    for anchor in region.anchors:
                        offsets.append(anchor)
            offsets.sort()
            return [offsets[0], offsets[-1]]
        else:
            return None

    def _remove_ws(self, chunk):
        """
        @return: the text from chunk with newlines removed and runs of
            whitespace collapsed to single spaces
        @rtype: C{str}
        """
        return ' '.join(chunk.replace('\n', '').split())
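

# A minimal usage sketch (assumptions: an unpacked MASC corpus lives at the
# placeholder path '/path/to/masc', the primary texts sit under 'data/', the
# GrAF annotation files ('-penn.xml', '-s.xml', '-nc.xml', ...) sit next to
# each text, and graf-python is installed).
if __name__ == '__main__':
    reader = MascCorpusReader('/path/to/masc', r'(?!\.).*\.txt',
                              encoding='utf-8')
    print(reader.fileids()[:5])    # primary text documents under 'data/'
    print(reader.words()[:20])     # token annotations from '-penn.xml'
    print(reader.sents()[:2])      # sentence annotations from '-s.xml'
    print(reader.nouns()[:10])     # noun chunks from '-nc.xml'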