# preprocessors/bigdata.py

# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import json
import os
from collections import defaultdict
from tqdm import tqdm


def get_uids_and_wav_paths(cfg, dataset, dataset_type):
    """Load the utterance ids and wav paths of one merged "bigdata" split.

    The split file is looked up at
    ``<cfg.OUTPUT_PATH>/preprocess/<cfg.PREPROCESS_VERSION>_version/bigdata/<cfg.BIGDATA_VERSION>/<split>.json``
    where ``<split>`` is the part of ``dataset_type`` after its last underscore
    (the whole string when it contains no underscore).

    Args:
        cfg: Config object exposing ``OUTPUT_PATH``, ``PREPROCESS_VERSION`` and
            ``BIGDATA_VERSION``.
        dataset: Dataset name; must be ``"bigdata"``.
        dataset_type: Split identifier whose last ``_``-separated token names
            the JSON file to read.

    Returns:
        A ``(uids, wav_paths)`` tuple of lists holding the ``"Uid"`` and
        ``"Path"`` value of every entry, in file order.
    """
    assert dataset == "bigdata"

    split_name = dataset_type.rsplit("_", 1)[-1]
    version_dir = "preprocess/{}_version".format(cfg.PREPROCESS_VERSION)
    bigdata_dir = "bigdata/{}".format(cfg.BIGDATA_VERSION)
    split_file = os.path.join(
        cfg.OUTPUT_PATH, version_dir, bigdata_dir, "{}.json".format(split_name)
    )

    with open(split_file, "r") as fp:
        records = json.load(fp)

    def column(key):
        # Pull one field out of every utterance record, preserving order.
        return [record[key] for record in records]

    return column("Uid"), column("Path")


def take_duration(utt):
    """Sort key: return the ``"Duration"`` value of an utterance record."""
    duration = utt["Duration"]
    return duration


def _merge_split(utterances, merged, singer_names, utt2singer, total_duration):
    """Fold one dataset split into the merged collections.

    Appends every utterance to ``merged``, records its ``<Dataset>_<Singer>``
    name in ``singer_names``, writes its ``utt2spk`` row to ``utt2singer`` and
    returns ``total_duration`` increased by the utterances' durations.
    """
    for utt in utterances:
        merged.append(utt)
        total_duration += utt["Duration"]
        singer_name = "{}_{}".format(utt["Dataset"], utt["Singer"])
        singer_names.add(singer_name)
        utt2singer.write(
            "{}_{}\t{}\n".format(utt["Dataset"], utt["Uid"], singer_name)
        )
    return total_duration


def main(output_path, cfg):
    """Merge the preprocessed splits of several datasets into one "bigdata" set.

    For every name in ``cfg.dataset`` the files ``<output_path>/<name>/train.json``
    and ``<output_path>/<name>/test.json`` are loaded and concatenated. The
    results are written to ``<output_path>/<sorted names joined by "_">/``:

    * ``train.json`` / ``test.json``: merged utterances sorted by duration,
    * ``cfg.preprocess.spk2id``: mapping ``<Dataset>_<Singer>`` -> integer id,
    * ``cfg.preprocess.utt2spk``: one ``<Dataset>_<Uid>\\t<Dataset>_<Singer>``
      line per utterance,
    * ``meta_info.json``: sizes, hours and per-singer training minutes.

    Args:
        output_path: Directory holding the per-dataset folders; the merged
            folder is created inside it.
        cfg: Config object exposing ``dataset`` (list of dataset names),
            ``preprocess.spk2id`` and ``preprocess.utt2spk`` (file names).

    Note:
        ``cfg.dataset`` is sorted in place, as before, because code outside
        this function may rely on the sorted order.
    """
    datasets = cfg.dataset

    print("-" * 10)
    print("Preparing samples for bigdata...")
    print("Including: \n{}\n".format("\n".join(datasets)))

    # Sorting makes the merged folder name independent of the configured order.
    datasets.sort()
    bigdata_version = "_".join(datasets)

    save_dir = os.path.join(output_path, bigdata_version)
    os.makedirs(save_dir, exist_ok=True)

    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, cfg.preprocess.spk2id)
    utt2singer_file = os.path.join(save_dir, cfg.preprocess.utt2spk)

    train = []
    test = []

    train_total_duration = 0
    test_total_duration = 0

    # Singer unique names
    singer_names = set()

    # NOTE(review): output files keep the platform default encoding (unlike the
    # utf-8 reads below) because get_uids_and_wav_paths reads them back the
    # same way.
    #
    # The utt2spk file is rewritten from scratch ("w") like every other output
    # of this function; appending would duplicate all rows on each rerun. The
    # context manager also guarantees the handle is closed if loading fails.
    with open(utt2singer_file, "w") as utt2singer:
        for dataset in datasets:
            dataset_path = os.path.join(output_path, dataset)
            train_json = os.path.join(dataset_path, "train.json")
            test_json = os.path.join(dataset_path, "test.json")

            with open(train_json, "r", encoding="utf-8") as f:
                train_utterances = json.load(f)

            with open(test_json, "r", encoding="utf-8") as f:
                test_utterances = json.load(f)

            # Only the (usually much larger) train split shows a progress bar.
            train_total_duration = _merge_split(
                tqdm(train_utterances),
                train,
                singer_names,
                utt2singer,
                train_total_duration,
            )
            test_total_duration = _merge_split(
                test_utterances,
                test,
                singer_names,
                utt2singer,
                test_total_duration,
            )

    train.sort(key=take_duration)
    test.sort(key=take_duration)
    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Singer Look Up Table: ids follow the alphabetical order of the names.
    singer_lut = {name: i for i, name in enumerate(sorted(singer_names))}
    print("#Singers: {}\n".format(len(singer_lut)))

    # Save
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)

    # Save meta info
    meta_info = {
        "datasets": datasets,
        "train": {"size": len(train), "hours": round(train_total_duration / 3600, 4)},
        "test": {"size": len(test), "hours": round(test_total_duration / 3600, 4)},
        "singers": {"size": len(singer_lut)},
    }

    # Training minutes per singer, listed from most to least data.
    minutes_by_singer = defaultdict(float)
    for utt in train:
        singer_name = "{}_{}".format(utt["Dataset"], utt["Singer"])
        minutes_by_singer[singer_name] += utt["Duration"] / 60
    ranked = sorted(minutes_by_singer.items(), key=lambda x: x[1], reverse=True)
    singer2mins = {name: round(minutes, 2) for name, minutes in ranked}
    meta_info["singers"]["training_minutes"] = singer2mins

    with open(os.path.join(save_dir, "meta_info.json"), "w") as f:
        json.dump(meta_info, f, indent=4, ensure_ascii=False)

    for singer, minutes in singer2mins.items():
        print("Singer {}: {} mins".format(singer, minutes))
    print("-" * 10, "\n")