utils/find_corpus.py

#!/usr/bin/env python3
"""
Finds all opus datasets for a language pair and prints them to set config settings.

Usage:
    task find-corpus -- en ca
    task find-corpus -- en fr --importer opus
"""

import argparse
import logging
import re
import sys
from typing import NamedTuple, Optional, TypeVar, Union

import humanize
import requests


class OpusDataset(NamedTuple):
    # The name of this dataset, e.g. "CCAligned"
    corpus: str
    # This is a blank string at the time of this writing.
    documents: str

    # 'moses'
    preprocessing: str
    # The language tag.
    source: str
    # The language tag.
    target: str
    # The URL to the download
    url: str
    # For example "v1"
    version: str

    alignment_pairs: int
    id: int
    # Size in KiB
    size: int
    source_tokens: int
    target_tokens: int

    latest: Union["True", "False"]

    def corpus_key(self) -> str:
        return f"opus_{self.corpus}/{self.version}"

    def website_url(self) -> str:
        return f"https://opus.nlpl.eu/{self.corpus}/{self.source}&{self.target}/{self.version}/{self.corpus}"

    def humanize_size(self) -> str:
        return humanize.naturalsize(self.size * 1024)


def fetch_opus(source: str, target: str) -> list[OpusDataset]:
    # This API is documented: https://opus.nlpl.eu/opusapi/
    url = f"https://opus.nlpl.eu/opusapi/?source={source}&target={target}&preprocessing=moses&version=latest"

    datasets = requests.get(url).json()

    # Convert the response into a typed object that is sorted.
    datasets_typed = [OpusDataset(**corpus_data) for corpus_data in datasets.get("corpora", [])]
    return sorted(datasets_typed, key=lambda x: x.alignment_pairs or 0, reverse=True)


def get_opus(source: str, target: str, download_url: bool):
    print("")
    print("┌──────────────────────────────┐")
    print("│ OPUS - https://opus.nlpl.eu/ │")
    print("└──────────────────────────────┘")

    datasets = fetch_opus(source, target)

    print_table(
        [
            [
                "Dataset",
                "Code",
                "Sentences",
                "Size",
                "URL",
            ],
            *[
                [
                    dataset.corpus,
                    dataset.corpus_key(),
                    dataset.alignment_pairs,
                    dataset.humanize_size(),
                    dataset.url if download_url else dataset.website_url(),
                ]
                for dataset in datasets
                if dataset.alignment_pairs
            ],
        ]
    )

    names = [dataset.corpus_key() for dataset in datasets]
    print_yaml(names, exclude=["OPUS100v", "WMT-News"])


def fetch_sacrebleu(source: str, target: str) -> dict[str, dict[str, any]]:
    import sacrebleu

    return {
        name: entry
        for name, entry in sacrebleu.DATASETS.items()
        if f"{source}-{target}" in entry or f"{target}-{source}" in entry
    }


def get_sacrebleu(source: str, target: str):
    datasets_dict = fetch_sacrebleu(source, target)

    print("")
    print("┌─────────────────────────────────────────────────┐")
    print("│ sacrebleu - https://github.com/mjpost/sacrebleu │")
    print("└─────────────────────────────────────────────────┘")
    print_table(
        [
            ["Dataset", "Description", "URLs"],
            *[
                [
                    #
                    name,
                    dataset["description"],
                    ", ".join(dataset["data"]),
                ]
                for name, dataset in datasets_dict.items()
            ],
        ]
    )
    print_yaml(list(f"sacrebleu_{name}" for name in datasets_dict.keys()))


def get_size(tags: list[str]) -> str:
    size = next(
        filter(
            lambda tag: tag.startswith("size_categories:"),
            tags,
        ),
        None,
    )

    if not size or size == "unknown":
        return ""

    # Lowercase the text since it's not consistent.
    return size.replace("size_categories:", "").lower()


def get_language_count(tags: list[str]):
    count = 0
    for tag in tags:
        if tag.startswith("language:"):
            count = count + 1
    return count


HF_DATASET_SIZES = {
    "": 0,
    "unknown": 0,
    "n<1k": 1,
    "1k<n<10k": 2,
    "10k<100k": 3,
    "10k<n<100k": 3,
    "100k<n<1m": 4,
    "1m<n<10m": 5,
    "10m<n<100m": 6,
    "100m<n<1b": 7,
    "1b<n<10b": 8,
    "10b<n<100b": 9,
    "100b<n<1t": 10,
}


def get_huggingface_monolingual(language: str):
    """
    Returns monolingual datasets ordered by size. Datasets with few downloads are ignored
    as they are probably low quality and not trustworthy.
    """
    from huggingface_hub import DatasetFilter, HfApi

    api = HfApi()

    datasets = list(
        api.list_datasets(
            filter=DatasetFilter(
                #
                language=language,
                multilinguality="monolingual",
            )
        )
    )
    datasets.sort(key=lambda dataset: -dataset.downloads)
    datasets.sort(key=lambda dataset: -HF_DATASET_SIZES.get(get_size(dataset.tags), 0))

    print("")
    print("┌─────────────────────────────────────────────────┐")
    print("│ huggingface monolingual data                    │")
    print("└─────────────────────────────────────────────────┘")
    print_table(
        [
            ["ID", "Size", "Downloads"],
            *[
                [
                    #
                    f"https://huggingface.co/datasets/{dataset.id}",
                    get_size(dataset.tags),
                    dataset.downloads,
                ]
                for dataset in datasets
                if is_useful_dataset(dataset)
            ],
        ]
    )


def get_huggingface_parallel(source: str, target: str):
    """
    Returns parallel datasets ordered by size. Datasets with few downloads are ignored
    as they are probably low quality and not trustworthy.
    """
    from huggingface_hub import DatasetFilter, HfApi

    api = HfApi()

    datasets = list(
        api.list_datasets(
            filter=DatasetFilter(
                #
                language=[source, target],
            )
        )
    )
    datasets.sort(key=lambda dataset: -dataset.downloads)
    datasets.sort(key=lambda dataset: -HF_DATASET_SIZES.get(get_size(dataset.tags), 0))

    print("")
    print(
        "┌────────────────────────────────────────────────────────────────────────────────────────────────────┐"
    )
    print(
        f"│ huggingface parallel data https://huggingface.co/datasets?language=language:{source},language:{target}"
    )
    print(
        "└────────────────────────────────────────────────────────────────────────────────────────────────────┘"
    )
    print_table(
        [
            ["ID", "Size", "Downloads"],
            *[
                [
                    #
                    f"https://huggingface.co/datasets/{dataset.id}",
                    get_size(dataset.tags),
                    dataset.downloads,
                ]
                for dataset in datasets
                if is_useful_dataset(dataset)
            ],
        ]
    )


def is_useful_dataset(dataset: any) -> bool:
    """Determines if a dataset is useful or not."""
    return "task_categories:automatic-speech-recognition" not in dataset.tags


def get_huggingface_any(language: str):
    """
    Returns parallel datasets ordered by size. Datasets with few downloads are ignored
    as they are probably low quality and not trustworthy.
    """
    from huggingface_hub import DatasetFilter, HfApi

    api = HfApi()

    datasets = list(
        api.list_datasets(
            filter=DatasetFilter(
                #
                language=language,
            )
        )
    )

    datasets.sort(key=lambda dataset: -dataset.downloads)
    datasets.sort(key=lambda dataset: -HF_DATASET_SIZES.get(get_size(dataset.tags), 0))

    print("")
    print("┌─────────────────────────────────────────────────────────────────────────────┐")
    print(f"│ huggingface any data https://huggingface.co/datasets?language=language:{language}")
    print("└─────────────────────────────────────────────────────────────────────────────┘")
    print_table(
        [
            ["ID", "Size", "Downloads"],
            *[
                [
                    #
                    f"https://huggingface.co/datasets/{dataset.id}",
                    get_size(dataset.tags),
                    dataset.downloads,
                ]
                for dataset in datasets
                if is_useful_dataset(dataset)
            ],
        ]
    )


def get_remote_file_size(
    url: str, display_not_200: bool = True
) -> tuple[Optional[int], Optional[str]]:
    try:
        response = requests.head(url, timeout=1, allow_redirects=True)

        if response.ok:
            if "Content-Length" in response.headers:
                int_size = int(response.headers.get("Content-Length", 0))
                return int_size, humanize.naturalsize(int_size)
            # Try again using GET.
        else:
            if display_not_200:
                print(f"Failed to retrieve file information for: {url}")
                print(f"Status code: {response.status_code}")
            return None, None

        # Sometimes when the HEAD does not have the Content-Length, the GET response does.
        response = requests.get(url, timeout=1, allow_redirects=True, stream=True)
        int_size = int(response.headers.get("Content-Length", 0))
        response.close()
        return int_size, humanize.naturalsize(int_size)

    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None, None


T = TypeVar("T")

from mtdata.entry import Entry


def fetch_mtdata(source: str, target: str) -> dict[str, Entry]:
    """
    Returns a dict that maps the corpus key to the mtdata entry.
    """
    # mtdata outputs debug logs
    logging.disable(logging.CRITICAL)

    from mtdata.entry import lang_pair
    from mtdata.index import get_entries
    from mtdata.iso import iso3_code

    source_tricode = iso3_code(source, fail_error=True)
    target_tricode = iso3_code(target, fail_error=True)
    entries = sorted(
        get_entries(lang_pair(source_tricode + "-" + target_tricode), None, None, True),
        key=lambda entry: entry.did.group,
    )

    def get_corpus_key(entry):
        return (
            f"mtdata_{entry.did.group}-{entry.did.name}-{entry.did.version}-{entry.did.lang_str}"
        )

    entries = {get_corpus_key(entry): entry for entry in entries}

    excludes = ["opus", "newstest", "unv1"]  # lowercase excludes.

    def is_excluded(corpus_key: str) -> bool:
        for exclude in excludes:
            if exclude in corpus_key.lower():
                return True
        return False

    # Filter out the excluded entries.
    return {
        corpus_key: entry for corpus_key, entry in entries.items() if not is_excluded(corpus_key)
    }


def get_mtdata(source: str, target: str):
    entries = fetch_mtdata(source, target)

    print("")
    print("┌────────────────────────────────────────────────┐")
    print("│ mtdata - https://github.com/thammegowda/mtdata │")
    print("└────────────────────────────────────────────────┘")
    print_table(
        [
            [
                "Dataset",
                "URL",
                # "Size",
            ],
            *[
                [
                    #
                    corpus_key,
                    entry.url,
                    # get_remote_file_size(entry.url),
                ]
                for corpus_key, entry in entries.items()
                # Filter out the excludes
            ],
        ]
    )

    print_yaml(entries.keys())


class MonoDataset(NamedTuple):
    name: str
    url: str
    size: Optional[int]
    display_size: Optional[int]
    lines_num: Optional[int]


def fetch_news_crawl(lang: str) -> list[MonoDataset]:
    base_url = f"https://data.statmt.org/news-crawl/{lang}/"
    response = requests.get(base_url, allow_redirects=True)

    datasets = []
    if response.ok:
        # Example row: (indentation and newlines added)
        # <tr>
        #     <td valign="top"><img src="/icons/compressed.gif" alt="[   ]"></td>
        #     <td><a href="news.2013.en.shuffled.deduped.gz">news.2013.en.shuffled.deduped.gz</a></td>
        #     <td align="right">2019-01-14 10:23  </td>
        #     <td align="right">1.2G</td>
        #     <td>&nbsp;</td>
        # </tr>

        regex = re.compile(
            r"""
            # Match the file name year.
            # >news.2008.en.shuffled.deduped.gz<
            #       ^^^^
            >news.(\d+)\.\w+\.shuffled\.deduped\.gz<
            [^\n]*

            # Match the file size and unit.
            # <td align="right">176M</td>
            #                   ^^^^
            <td\ align="right">
                ([\d\.]+)(\w+)
            </td>
        """,
            re.VERBOSE,
        )

        matches = re.findall(regex, response.text)

        if matches:
            for year, size_number, size_unit in matches:
                if size_unit == "K":
                    multiplier = 1_000
                elif size_unit == "M":
                    multiplier = 1_000_000
                elif size_unit == "G":
                    multiplier = 1_000_000_000

                name = f"news-crawl_news.{year}"
                url = f"https://data.statmt.org/news-crawl/{lang}/news.{year}.{lang}.shuffled.deduped.gz"
                size = int(float(size_number) * multiplier)

                datasets.append(MonoDataset(name, url, size, f"{size_number}{size_unit}", None))
        else:
            print("The regex could not find newscrawl datasets for", lang)
    else:
        print("No newscrawl data was available for", lang)
    return datasets


def get_news_crawl(source: str, target: str):
    for lang in (source, target):
        datasets = fetch_news_crawl(lang)

        print("")
        print("┌─────────────────────────────────────────────────────────────────────┐")
        print(f"│ news-crawl ({lang}) - https://github.com/data.statmt.org/news-crawl     │")
        print("└─────────────────────────────────────────────────────────────────────┘")
        print_table(
            [
                [
                    "Dataset",
                    "URL",
                    "Size",
                ],
                *[[name, url, display_size] for name, url, _, display_size, _ in datasets],
            ]
        )

        print_yaml([name for name, _, _, _, _ in datasets])


def fetch_hplt(lang: str, prefixes=("08", "09")) -> list[MonoDataset]:
    all_datasets = []
    for threshold in prefixes:
        for i in range(5):
            shard_id = i + 1
            base_url = f"https://storage.googleapis.com/releng-translations-dev/data/mono-hplt/{threshold}/hplt_filtered_{lang}_{shard_id}.count.txt"
            response = requests.get(base_url, allow_redirects=True)

            if response.ok:
                lines_number = int(response.content)
                url = f"https://storage.googleapis.com/releng-translations-dev/data/mono-hplt/{threshold}/hplt_filtered_{lang}_{shard_id}.txt.zst"
                dataset = MonoDataset(f"url_{url}", url, None, None, lines_number)
                all_datasets.append(dataset)

    return all_datasets


def get_hplt_mono(source: str, target: str):
    for lang in (source, target):
        datasets = fetch_hplt(lang)

        print("")
        print("┌─────────────────────────────────────────────────────────────────────┐")
        print(f"│ hplt mono ({lang}) - https://hplt-project.org/datasets/v1.2         │")
        print("└─────────────────────────────────────────────────────────────────────┘")
        print_table(
            [
                [
                    "Dataset",
                    "Number of lines",
                ],
                *[[name, lines] for name, _, _, _, lines in datasets],
            ]
        )

        print_yaml([name for name, _, _, _, _ in datasets])


def fetch_nllb_mono(
    lang: str,
) -> list[MonoDataset]:
    info_url = f"https://storage.googleapis.com/releng-translations-dev/data/mono-nllb/nllb-mono-{lang}.info.json"
    url = f"https://storage.googleapis.com/releng-translations-dev/data/mono-nllb/nllb-mono-{lang}.txt.zst"
    response = requests.get(info_url)

    if response.ok:
        info = response.json()
        sentences = info["sentences_kept"]
        assert sentences
        # There is only one file, but it's easier to return an array for the print_table call.
        return [MonoDataset(f"url_{url}", url, None, None, sentences)]

    return []


def get_nllb_mono(source: str, target: str):
    for lang in (source, target):
        datasets = fetch_nllb_mono(lang)

        print("")
        print("┌─────────────────────────────────────────────────────────────────────┐")
        print(f"│ nllp mono ({lang}) - https://opus.nlpl.eu/NLLB/corpus/version/NLLB  │")
        print("└─────────────────────────────────────────────────────────────────────┘")
        print_table(
            [
                [
                    "Dataset",
                    "Size",
                ],
                *[[name, display_size] for name, _, display_size, _, _ in datasets],
            ]
        )

        print_yaml([name for name, _, _, _, _ in datasets])


def print_yaml(names: list[str], exclude: list[str] = []):
    cleaned = set()
    for name in names:
        filter = False
        for ex in exclude:
            if ex.lower() in name.lower():
                filter = True
                break
        if not filter:
            cleaned.add(name)

    print("\nYAML:")
    if len(cleaned) == 0:
        print("(no datasets)\n")
    else:
        print("\n".join(sorted([f"    - {name}" for name in cleaned])))


def print_table(table: list[list[any]]):
    """
    Nicely print a table, the first row is the header
    """

    # Compute the column lengths.
    transposed_table = list(map(list, zip(*table)))
    column_lengths = [max(len(str(x)) for x in column) for column in transposed_table]

    print("")
    for index, row in enumerate(table):
        # Print the row.
        for datum, max_len in zip(row, column_lengths):
            print(str(datum).ljust(max_len), end=" ")
        print("")

        # Print a separator between the header and the rest of the table.
        if index == 0:
            for length in column_lengths:
                print("".ljust(length, "─"), end=" ")
            print("")

    if len(table) == 1:
        print("(no datasets)")


def main(args: Optional[list[str]] = None) -> None:
    importers = [
        "opus",
        "sacrebleu",
        "mtdata",
        "huggingface_mono",
        "huggingface_parallel",
        "huggingface_any",
        "news-crawl",
        "hplt-mono",
    ]
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawTextHelpFormatter,  # Preserves whitespace in the help text.
    )
    parser.add_argument("source", type=str, nargs="?", help="Source language code")
    parser.add_argument("target", type=str, nargs="?", help="Target language code")
    parser.add_argument(
        "--importer",
        type=str,
        help=f"The importer to use: {', '.join(importers)}",
    )
    parser.add_argument(
        "--download_url",
        action="store_true",
        default=False,
        help="Show the download url if available.",
    )

    args = parser.parse_args(args)

    if not args.source or not args.target:
        parser.print_help()
        sys.exit(1)

    if args.importer and args.importer not in importers:
        print(f'"{args.importer}" is not a valid importer.')
        sys.exit(1)

    if args.importer == "opus" or not args.importer:
        get_opus(args.source, args.target, args.download_url)

    if args.importer == "sacrebleu" or not args.importer:
        get_sacrebleu(args.source, args.target)

    if args.importer == "mtdata" or not args.importer:
        get_mtdata(args.source, args.target)

    if args.importer == "huggingface_mono" or not args.importer:
        get_huggingface_monolingual(args.target if args.source == "en" else args.source)

    if args.importer == "huggingface_parallel" or not args.importer:
        get_huggingface_parallel(args.source, args.target)

    if args.importer == "huggingface_any" or not args.importer:
        get_huggingface_any(args.target if args.source == "en" else args.source)

    if args.importer == "news-crawl" or not args.importer:
        get_news_crawl(args.source, args.target)

    if args.importer == "hplt-mono" or not args.importer:
        get_hplt_mono(args.source, args.target)

    if args.importer == "nllb-mono" or not args.importer:
        get_nllb_mono(args.source, args.target)


if __name__ == "__main__":
    main()