-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathetymology_text_getter.py
64 lines (47 loc) · 1.75 KB
/
etymology_text_getter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from re import Match, search
from typing import Dict, Optional

from data_structures import EtymologyText, RawEtymology
from extractors.reference_extractor import referenceStarts
class EtymologyTextGetter:
    """Select the paragraphs of a raw etymology that should be parsed.

    The instance carries working state between method calls:
    ``paragraphs`` is the etymology text split on newlines, and ``text``
    is the etymology text with every collected paragraph removed.
    """

    # Key used for paragraphs that start with 'From' or contain a
    # reference marker.
    DEFAULT = 'default'

    # Paragraph prefix: "<word> from" or "The <anything> from", anchored at
    # the start of the paragraph. Python patterns take no "/" delimiters; a
    # leading "/" before "^" would make the first alternative unmatchable.
    _PREFIX_PATTERN = r'^(\w+) from|^The (.*?) from'

    def __init__(self):
        self.paragraphs = []
        self.text = ''

    def get(self, etymology: "RawEtymology") -> "EtymologyText":
        """Build the ``EtymologyText`` for *etymology*.

        Args:
            etymology: Raw etymology; its ``text``, ``lang`` and ``word``
                attributes are read.

        Returns:
            An ``EtymologyText`` whose ``toParse`` holds the collected
            paragraphs and whose ``note`` is the text that remains after
            those paragraphs were removed.
        """
        self.text = etymology.text
        self.paragraphs = self.text.split('\n')
        # Collect first: collecting strips the selected paragraphs from
        # self.text, so ``note`` only carries what was not selected.
        to_parse = self.getContentToParse()
        return EtymologyText(lang=etymology.lang,
                             word=etymology.word,
                             toParse=to_parse,
                             note=self.text)

    def removeParagraph(self, text: str) -> None:
        """Remove every occurrence of *text* from the remaining ``self.text``."""
        self.text = self.text.replace(text, '')

    def getContentToParse(self) -> Dict[str, str]:
        """Collect the paragraphs to parse, keyed by their prefix.

        A paragraph is collected when it starts with ``'From'`` or when
        ``isReference`` accepts it. Each collected paragraph is stored with
        a single leading space and removed from ``self.text``.

        Returns:
            Mapping of prefix key to ``' ' + paragraph``.
        """
        to_parse: Dict[str, str] = {}
        prefix = ''
        for paragraph in self.paragraphs:
            can_parse = False
            if paragraph.startswith('From'):
                prefix = self.DEFAULT
                can_parse = True

            # NOTE(review): a detected prefix only changes the key; it does
            # not make the paragraph collectable by itself. Confirm that a
            # prefix-only paragraph is really meant to stay in the note.
            prefix_match = self.getPrefix(paragraph)
            if prefix_match:
                prefix = prefix_match.group(0)

            if self.isReference(paragraph):
                prefix = self.DEFAULT
                can_parse = True

            if can_parse:
                # NOTE(review): plain assignment means a later paragraph
                # with the same key replaces the earlier one, although both
                # are removed from self.text. Confirm this is intended.
                to_parse[prefix] = ' ' + paragraph
                self.removeParagraph(paragraph)
        return to_parse

    @staticmethod
    def getPrefix(text: str) -> Optional[Match]:
        """Search *text* for a leading "<word> from" / "The ... from" prefix.

        Args:
            text: A single paragraph.

        Returns:
            The match object (``group(0)`` is the whole prefix), or ``None``
            when *text* does not begin with such a prefix.
        """
        return search(EtymologyTextGetter._PREFIX_PATTERN, text)

    @staticmethod
    def isReference(text: str) -> bool:
        """Return True when *text* contains any entry of ``referenceStarts``."""
        return any(start in text for start in referenceStarts)