Add cases for currencies

robinhad · Aug 16, 2023 · 953871c · 953871c
1 parent 85f14d6
commit 953871c
Show file tree

Hide file tree

Showing 2 changed files with 46 additions and 13 deletions.
diff --git a/tests/test_formatter.py b/tests/test_formatter.py
@@ -19,7 +19,10 @@
             "11100000001 доларів державного боргу.",
             "одинадцять мільярдів сто мільйонів один доларів державного боргу.",
         ),
-        # this is wrong case, should be "це дев'ятнадцяти-річне вино."
+        ("10000$, 15000 корупціонерів", "десять тисяч доларів , п'ятнадцять тисяч корупціонерів"), # TODO: fix space before comma
+        ("$10000, 15000 корупціонерів", "доларів десять тисяч, п'ятнадцять тисяч корупціонерів"), # fix order
+        ("10000$ у еквіваленті борщових заправок", "десять тисяч доларів у еквіваленті борщових заправок"),
+        # this is wrong case, should be "це дев'ятнадцятирічне вино."
         # Implementing this requires proper parsing of words into the token stream,
         # which requires reworking the current approach.
         ("це 19-річне вино.", "це дев'ятнадцять-річне вино."),

diff --git a/ukrainian_tts/formatter.py b/ukrainian_tts/formatter.py
@@ -18,6 +18,17 @@ def number_form(number):
 }
 
 
+def replace_currency_with_words(text, currency, num_form):
+    """Swap the symbol of the given currency for its spelled-out word.
+
+    Only the symbol that belongs to ``currency`` is replaced
+    ("USD" -> "$", "UAH" -> "₴", "EUR" -> "€"); the replacement word is
+    looked up as ``CURRENCY[currency][num_form]``. Any other ``currency``
+    value leaves ``text`` unchanged.
+    """
+    symbol_by_code = {"USD": "$", "UAH": "₴", "EUR": "€"}
+    symbol = symbol_by_code.get(currency)
+    if symbol is None:
+        # Unknown or undetected currency: nothing to substitute.
+        return text
+    return text.replace(symbol, CURRENCY[currency][num_form])
+
 def preprocess_text(text):
     text = text.lower()
     # currencies
@@ -57,17 +68,33 @@ def preprocess_text(text):
     text = re.sub(r"(\d)\s+(\d)", r"\1\2", text)
 
     def detect_num_and_convert(word):
-        numbers = "0123456789,."
+        numbers = "0123456789"
+        splits = ",."
+        currencies = "$₴€"
         result = []
         nonlocal num_form
         parts = word.split("-")  # for handling complex words
         for part in parts:
-            is_number = all(map(lambda x: x in numbers, part))
-            if is_number:
+            is_number = all(map(lambda x: x in numbers, part)) or (any(map(lambda x: x in numbers, part)) and any(map(lambda x: x in splits, part)))
+            is_currency = any(map(lambda x: x in currencies, part)) and any(map(lambda x: x in numbers, part)) # contains both number and currency symbol
+            if is_number or is_currency:
                 try:
+                    if is_currency:
+                        cleaned_part = part
+
+                        for part_currency in currencies:
+                            cleaned_part = cleaned_part.replace(part_currency, f" {part_currency} ").strip() # TODO: replace with regex
+
+                        part = " ".join([detect_num_and_convert(part_word) for part_word in cleaned_part.split(" ")])
+
+                    ends_with_dot = part.endswith(".") # ugly
+                    ends_with_comma = part.endswith(",")
+                    if ends_with_comma or ends_with_dot:
+                        part = part[:-1]
+                        part = " ".join([detect_num_and_convert(part_word) for part_word in part.split(" ")]) + ("." if ends_with_dot else ",")
+
                     num_form = number_form(part)
-                    print("-" + part + "-" + str(num_form))
-                    result.append(num2words(part, lang="uk", gender=gender))
+                    result.append(num2words(part.strip(), lang="uk", gender=gender))
                 except:
                     result.append(part)
             else:
@@ -76,14 +103,8 @@ def detect_num_and_convert(word):
 
     # print([detect_num_and_convert(word) for word in text.split(" ")])
     text = " ".join([detect_num_and_convert(word) for word in text.split(" ")])
-    if currency == "USD":
-        text = text.replace("$", CURRENCY[currency][num_form])
 
-    if currency == "UAH":
-        text = text.replace("₴", CURRENCY[currency][num_form])
-
-    if currency == "EUR":
-        text = text.replace("€", CURRENCY[currency][num_form])
+    text = replace_currency_with_words(text, currency, num_form)
 
     # fallback numbers
     text = text.replace("1", "один ")
@@ -101,8 +122,17 @@ def detect_num_and_convert(word):
         "qu": "кв",
         "ch": "ч",
         "sh": "ш",
+        "шч": "щ", # after previous cases
         "ph": "ф",
         "kh": "х",
+        "yo": "йо",
+        "yu": "ю",
+        "ya": "я",
+        "ye": "є",
+        "yi": "ї",
+        "zh": "ж",
+        "ts": "ц",
+        "th": "т",
         "a": "а",
         "b": "б",
         "c": "ц",