cleaners.py
""" adapted from https://github.com/keithito/tacotron """
'''
Cleaners are transformations that run over the input text at both training and eval time.
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
1. "english_cleaners" for English text
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
the symbols in symbols.py to match your data).
'''
import re
from string import punctuation
from functools import reduce
from unidecode import unidecode
from .numerical import normalize_numbers, normalize_currency
from .acronyms import AcronymNormalizer
from .datestime import normalize_datestime
from .letters_and_numbers import normalize_letters_and_numbers
from .abbreviations import normalize_abbreviations
# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')
# Regular expression separating words enclosed in curly braces for cleaning
_arpa_re = re.compile(r'{[^}]+}|\S+')
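# e.g. _arpa_re.findall('say {AH0 R P AH0} now') -> ['say', '{AH0 R P AH0}', 'now'],
# i.e. brace-delimited ARPAbet spans are kept as single tokens while plain text
# is split on whitespace.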


def expand_abbreviations(text):
    return normalize_abbreviations(text)


def expand_numbers(text):
    return normalize_numbers(text)


def expand_currency(text):
    return normalize_currency(text)


def expand_datestime(text):
    return normalize_datestime(text)


def expand_letters_and_numbers(text):
    return normalize_letters_and_numbers(text)


def lowercase(text):
    return text.lower()


def collapse_whitespace(text):
    return re.sub(_whitespace_re, ' ', text)


def separate_acronyms(text):
    text = re.sub(r"([0-9]+)([a-zA-Z]+)", r"\1 \2", text)
    text = re.sub(r"([a-zA-Z]+)([0-9]+)", r"\1 \2", text)
    return text
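# e.g. separate_acronyms('ABC123 is 4kg') -> 'ABC 123 is 4 kg': a space is
# inserted at every digit/letter boundary, in both directions.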


def convert_to_ascii(text):
    return unidecode(text)


def dehyphenize_compound_words(text):
    text = re.sub(r'(?<=[a-zA-Z0-9])-(?=[a-zA-Z])', ' ', text)
    return text
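# e.g. dehyphenize_compound_words('state-of-the-art') -> 'state of the art';
# 'A-10' is left alone because the lookahead requires a letter after the hyphen.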


def remove_space_before_punctuation(text):
    return re.sub(r"\s([{}](?:\s|$))".format(punctuation), r'\1', text)
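# e.g. remove_space_before_punctuation('Hello , world !') -> 'Hello, world!'
# Only punctuation followed by whitespace or end-of-string is affected, so
# intra-word marks such as the apostrophe in "don't" are untouched.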


class Cleaner(object):
    def __init__(self, cleaner_names, phonemedict):
        self.cleaner_names = cleaner_names
        self.phonemedict = phonemedict
        self.acronym_normalizer = AcronymNormalizer(self.phonemedict)

    def __call__(self, text):
        for cleaner_name in self.cleaner_names:
            sequence_fns, word_fns = self.get_cleaner_fns(cleaner_name)
            # Sequence-level transforms see the full string at once.
            for fn in sequence_fns:
                text = fn(text)
            # Word-level transforms run token by token; tokens wrapped in
            # curly braces (ARPAbet spans) are passed through unchanged.
            text = [reduce(lambda x, y: y(x), word_fns, split)
                    if split[0] != '{' else split
                    for split in _arpa_re.findall(text)]
            text = ' '.join(text)
        text = remove_space_before_punctuation(text)
        return text

    def get_cleaner_fns(self, cleaner_name):
        if cleaner_name == 'basic_cleaners':
            sequence_fns = [lowercase, collapse_whitespace]
            word_fns = []
        elif cleaner_name == 'english_cleaners':
            sequence_fns = [collapse_whitespace, convert_to_ascii, lowercase]
            word_fns = [expand_numbers, expand_abbreviations]
        elif cleaner_name == 'radtts_cleaners':
            sequence_fns = [collapse_whitespace, expand_currency,
                            expand_datestime, expand_letters_and_numbers]
            word_fns = [expand_numbers, expand_abbreviations]
        elif cleaner_name == 'transliteration_cleaners':
            sequence_fns = [convert_to_ascii, lowercase, collapse_whitespace]
            # No word-level transforms for transliteration; word_fns must
            # still be bound so the return below doesn't raise.
            word_fns = []
        else:
            raise Exception("{} cleaner not supported".format(cleaner_name))
        return sequence_fns, word_fns
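

# Minimal usage sketch. Assumptions: this module lives inside a package (it
# uses relative imports), here hypothetically named tts_text_processing, and
# the empty dict is a hypothetical stand-in for whatever phoneme mapping
# AcronymNormalizer expects in your setup:
#
#     from tts_text_processing.cleaners import Cleaner
#
#     phonemedict = {}  # hypothetical placeholder phoneme dictionary
#     cleaner = Cleaner(['radtts_cleaners'], phonemedict)
#     print(cleaner('The 2nd show costs $5; doors open at 7pm.'))
#
# Each name in cleaner_names is applied in order, so pipelines can be chained,
# e.g. Cleaner(['radtts_cleaners', 'basic_cleaners'], phonemedict).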