Skip to content

Commit

Permalink
Add cases for currencies
Browse files Browse the repository at this point in the history
  • Loading branch information
robinhad committed Aug 16, 2023
1 parent 85f14d6 commit 953871c
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 13 deletions.
5 changes: 4 additions & 1 deletion tests/test_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@
"11100000001 доларів державного боргу.",
"одинадцять мільярдів сто мільйонів один доларів державного боргу.",
),
# this is wrong case, should be "це дев'ятнадцяти-річне вино."
("10000$, 15000 корупціонерів", "десять тисяч доларів , п'ятнадцять тисяч корупціонерів"), # TODO: fix space before comma
("$10000, 15000 корупціонерів", "доларів десять тисяч, п'ятнадцять тисяч корупціонерів"), # fix order
("10000$ у еквіваленті борщових заправок", "десять тисяч доларів у еквіваленті борщових заправок"),
# this is wrong case, should be "це дев'ятнадцятирічне вино."
# Implementing this requires proper parsing of words into the token stream,
# which requires reworking the current approach.
("це 19-річне вино.", "це дев'ятнадцять-річне вино."),
Expand Down
54 changes: 42 additions & 12 deletions ukrainian_tts/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,17 @@ def number_form(number):
}


def replace_currency_with_words(text, currency, num_form):
    """Replace the sign of the selected currency in ``text`` with its word form.

    Only the sign that belongs to ``currency`` is substituted: "$" for
    "USD", "₴" for "UAH" and "€" for "EUR". The replacement string is looked
    up as ``CURRENCY[currency][num_form]``. Any other ``currency`` value
    leaves ``text`` untouched.

    Returns the resulting string.
    """
    sign_by_code = (("USD", "$"), ("UAH", "₴"), ("EUR", "€"))
    for code, sign in sign_by_code:
        if currency == code:
            # The lookup happens only for a matching code, so unknown
            # currencies never touch the CURRENCY table.
            text = text.replace(sign, CURRENCY[currency][num_form])
    return text

def preprocess_text(text):
text = text.lower()
# currencies
Expand Down Expand Up @@ -57,17 +68,33 @@ def preprocess_text(text):
text = re.sub(r"(\d)\s+(\d)", r"\1\2", text)

def detect_num_and_convert(word):
numbers = "0123456789,."
numbers = "0123456789"
splits = ",."
currencies = "$₴€"
result = []
nonlocal num_form
parts = word.split("-") # for handling complex words
for part in parts:
is_number = all(map(lambda x: x in numbers, part))
if is_number:
is_number = all(map(lambda x: x in numbers, part)) or (any(map(lambda x: x in numbers, part)) and any(map(lambda x: x in splits, part)))
is_currency = any(map(lambda x: x in currencies, part)) and any(map(lambda x: x in numbers, part)) # contains both number and currency symbol
if is_number or is_currency:
try:
if is_currency:
cleaned_part = part

for part_currency in currencies:
cleaned_part = cleaned_part.replace(part_currency, f" {part_currency} ").strip() # TODO: replace with regex

part = " ".join([detect_num_and_convert(part_word) for part_word in cleaned_part.split(" ")])

ends_with_dot = part.endswith(".") # ugly
ends_with_comma = part.endswith(",")
if ends_with_comma or ends_with_dot:
part = part[:-1]
part = " ".join([detect_num_and_convert(part_word) for part_word in part.split(" ")]) + ("." if ends_with_dot else ",")

num_form = number_form(part)
print("-" + part + "-" + str(num_form))
result.append(num2words(part, lang="uk", gender=gender))
result.append(num2words(part.strip(), lang="uk", gender=gender))
except:
result.append(part)
else:
Expand All @@ -76,14 +103,8 @@ def detect_num_and_convert(word):

# print([detect_num_and_convert(word) for word in text.split(" ")])
text = " ".join([detect_num_and_convert(word) for word in text.split(" ")])
if currency == "USD":
text = text.replace("$", CURRENCY[currency][num_form])

if currency == "UAH":
text = text.replace("₴", CURRENCY[currency][num_form])

if currency == "EUR":
text = text.replace("€", CURRENCY[currency][num_form])
text = replace_currency_with_words(text, currency, num_form)

# fallback numbers
text = text.replace("1", "один ")
Expand All @@ -101,8 +122,17 @@ def detect_num_and_convert(word):
"qu": "кв",
"ch": "ч",
"sh": "ш",
"шч": "щ", # after previous cases
"ph": "ф",
"kh": "х",
"yo": "йо",
"yu": "ю",
"ya": "я",
"ye": "є",
"yi": "ї",
"zh": "ж",
"ts": "ц",
"th": "т",
"a": "а",
"b": "б",
"c": "ц",
Expand Down

0 comments on commit 953871c

Please sign in to comment.