# ukrainian_tts/formatter.py

from num2words import num2words
import re


def number_form(number):
    """Pick the Ukrainian plural-form index for a numeral given as a string.

    Args:
        number: Non-empty string whose last characters are the digits of the
            numeral, e.g. ``"21"``.

    Returns:
        ``0`` if the numeral ends in 1 (but not 11), ``1`` if it ends in
        2-4 (but not 12-14), and ``2`` in every other case.

    Raises:
        IndexError: If ``number`` is empty (unchanged from the original
            behaviour).
    """
    last_digit = number[-1]
    # 11-14 (and 111, 212, ...) take the "many" form even though their last
    # digit is 1-4: "одинадцять доларів", not "одинадцять долар".
    if number[-2:-1] == "1" and last_digit in ("1", "2", "3", "4"):
        return 2
    if last_digit == "1":
        return 0
    if last_digit in ("2", "3", "4"):
        return 1
    return 2


# Ukrainian word forms per currency code. Each tuple is indexed by the form
# index that replace_currency_with_words() receives as ``num_form``
# (positions 0, 1 and 2).
CURRENCY = dict(
    USD=("долар", "долари", "доларів"),
    UAH=("гривня", "гривні", "гривень"),
    EUR=("євро", "євро", "євро"),
)


def replace_currency_with_words(text, currency, num_form):
    """Swap a currency symbol in ``text`` for its word from ``CURRENCY``.

    Args:
        text: String that may contain "$", "₴" or "€".
        currency: Currency code; only "USD", "UAH" and "EUR" have an effect.
        num_form: Index into the currency's tuple of word forms.

    Returns:
        ``text`` with every occurrence of the symbol belonging to
        ``currency`` replaced; ``text`` unchanged for any other code.
    """
    code_to_symbol = (("USD", "$"), ("UAH", "₴"), ("EUR", "€"))
    for code, symbol in code_to_symbol:
        if currency == code:
            word = CURRENCY[currency][num_form]
            text = text.replace(symbol, word)
    return text


def find_any_char(text: str, find: str, start: int):
    """Return the lowest index at or after ``start`` of any character of ``find``.

    Args:
        text: String to search in.
        find: Characters to look for; each one is searched independently.
        start: Index at which every search begins.

    Returns:
        The smallest matching index, or ``-1`` when none of the characters
        occurs in ``text`` from ``start`` onwards.
    """
    positions = (text.find(candidate, start) for candidate in find)
    hits = [position for position in positions if position >= 0]
    return min(hits) if hits else -1


# TODO: check whether https://github.com/lang-uk/tokenize-uk can replace this tokenizer.
def simple_tokenizer(text: str):
    """Yield the pieces of ``text`` split on spaces and commas, keeping separators.

    Words and single-character separators (" " or ",") are yielded
    alternately, so joining everything yielded reproduces ``text``. Adjacent
    separators produce an empty-string word between them, and the final word
    is always yielded (possibly empty).
    """
    separators = " ,"
    cursor = 0
    while True:
        cut = find_any_char(text, separators, cursor)
        if cut < 0:
            break
        yield text[cursor:cut]
        yield text[cut]
        cursor = cut + 1

    # Whatever follows the last separator (the whole text if there was none).
    yield text[cursor:]


def preprocess_text(text):
    """Normalize text: spell out numbers and currency, transliterate Latin.

    Steps, in order:

    1. Lowercase the text.
    2. Pick one currency by the first of "$", "₴", "€" found in the text
       (checked in that order) together with the grammatical gender that is
       passed to ``num2words``.
    3. Unify apostrophes, ellipsis, quotes and dashes to ASCII characters.
    4. Remove whitespace between two digits so "111 000" becomes "111000".
    5. Convert numeric tokens to words with ``num2words(..., lang="uk")``.
    6. Replace the detected currency symbol with its word from ``CURRENCY``.
    7. Replace any digits that are still left one by one.
    8. Transliterate Latin letters to Cyrillic.

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    text = text.lower()
    # currencies: only the first symbol found (in this order) is handled
    if "$" in text:
        currency = "USD"
        gender = "masculine"
    elif "₴" in text:
        currency = "UAH"
        gender = "feminine"
    elif "€" in text:
        currency = "EUR"
        gender = "masculine"
    else:
        currency = ""
        gender = "masculine"

    # Form index used for the currency word; updated by
    # detect_num_and_convert(), so the last number processed wins.
    num_form = 0
    # replace apostrophe
    text = text.replace("`", "'")
    text = text.replace("ʼ", "'")
    text = text.replace("…", "...")

    # NOTE(review): the typographic apostrophe "’" is mapped to a double
    # quote here, not to "'" — confirm this is intended.
    symbols = {
        "”": '"',
        "“": '"',
        "’": '"',
        "‘": '"',
        "«": '"',
        "»": '"',
        "–": "-",
        "—": "-",
        "―": "-",
    }
    for symbol, value in symbols.items():
        text = text.replace(symbol, value)
    # numbers: glue digit groups separated by whitespace ("20 000" -> "20000")
    text = re.sub(r"(\d)\s+(\d)", r"\1\2", text)

    def detect_num_and_convert(word):
        """Return ``word`` with its numeric parts replaced by words.

        The token is split on "-" and each part is converted on its own;
        parts that are not numeric, or whose conversion fails, are kept
        unchanged. Updates ``num_form`` of the enclosing function.
        """
        numbers = "0123456789"
        splits = ",."
        currencies = "$₴€"
        result = []
        nonlocal num_form
        parts = word.split("-")  # for handling complex words
        for part in parts:
            has_digit = any(char in numbers for char in part)
            # An empty part also counts as a number here (all() of nothing is
            # True); it then fails in number_form() and is kept as is.
            is_number = all(char in numbers for char in part) or (
                has_digit and any(char in splits for char in part)
            )
            # contains both number and currency symbol
            is_currency = any(char in currencies for char in part) and has_digit
            if is_number or is_currency:
                try:
                    if is_currency:
                        cleaned_part = part

                        # Move a leading symbol behind the amount and pad any
                        # other occurrence with spaces, so that amount and
                        # symbol become separate space-delimited words.
                        for part_currency in currencies:
                            if cleaned_part[0] == part_currency:
                                cleaned_part = cleaned_part[1:] + " " + part_currency
                            else:
                                cleaned_part = cleaned_part.replace(
                                    part_currency, f" {part_currency} "
                                ).strip()  # TODO: replace with regex

                        part = " ".join(
                            [
                                detect_num_and_convert(part_word)
                                for part_word in cleaned_part.split(" ")
                            ]
                        )

                    # Trailing "." or "," is punctuation, not part of the
                    # number: convert what precedes it and re-attach it.
                    ends_with_dot = part.endswith(".")  # ugly
                    ends_with_comma = part.endswith(",")
                    if ends_with_comma or ends_with_dot:
                        part = part[:-1]
                        part = " ".join(
                            [
                                detect_num_and_convert(part_word)
                                for part_word in part.split(" ")
                            ]
                        ) + ("." if ends_with_dot else ",")

                    # NOTE(review): when `part` was rebuilt above it already
                    # holds words, so number_form() only sees its last
                    # character (a currency symbol or punctuation) and yields
                    # 2, and num2words() presumably rejects the text, which
                    # lands in the except branch that keeps `part`. Confirm
                    # whether overwriting num_form in that case is intended
                    # (e.g. for "1$").
                    num_form = number_form(part)
                    result.append(num2words(part.strip(), lang="uk", gender=gender))
                except Exception:
                    # Best effort: anything that cannot be converted is kept
                    # verbatim. Narrowed from a bare ``except`` so that
                    # KeyboardInterrupt/SystemExit are no longer swallowed.
                    result.append(part)
            else:
                result.append(part)
        return "-".join(result)

    text = "".join([detect_num_and_convert(word) for word in simple_tokenizer(text)])
    text = replace_currency_with_words(text, currency, num_form)

    # fallback numbers: digits that survived the conversion above are read
    # one by one
    text = text.replace("1", "один ")
    text = text.replace("2", "два ")
    text = text.replace("3", "три ")
    text = text.replace("4", "чотири ")
    text = text.replace("5", "п'ять ")
    text = text.replace("6", "шість ")
    text = text.replace("7", "сім ")
    text = text.replace("8", "вісім ")
    text = text.replace("9", "дев'ять ")
    text = text.replace("0", "нуль ")
    # speak english alphabet using brute force transliteration
    # Insertion order matters: multi-letter combinations must be replaced
    # before the single letters they are made of.
    english = {
        "qu": "кв",
        "ch": "ч",
        "sh": "ш",
        "шч": "щ",  # after previous cases
        "ph": "ф",
        "kh": "х",
        "yo": "йо",
        "yu": "ю",
        "ya": "я",
        "ye": "є",
        "yi": "ї",
        "zh": "ж",
        "ts": "ц",
        "th": "т",
        "a": "а",
        "b": "б",
        "c": "ц",
        "d": "д",
        "e": "е",
        "f": "ф",
        "g": "ґ",
        "h": "г",
        "i": "і",
        "j": "дж",
        "k": "к",
        "l": "л",
        "m": "м",
        "n": "н",
        "o": "о",
        "p": "п",
        "q": "кв",
        "r": "р",
        "s": "с",
        "t": "т",
        "u": "ю",
        "v": "в",
        "w": "в",
        "x": "кс",
        "y": "і",
        "z": "з",
    }
    for english_char, english_value in english.items():
        # uppercase
        # NOTE(review): text is lowercased on entry, so this upper-case pass
        # looks like a no-op — confirm before removing.
        text = text.replace(english_char.upper(), english_value.upper())
        text = text.replace(english_char, english_value)

    return text