Add preprocessing scripts for the librilight datasets (open-mmlab#107)
* Add preprocessor for librilight dataset
Showing 6 changed files with 700 additions and 1 deletion.
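The script below drives the whole pipeline through a `cfg` object. As a rough sketch of what that config has to provide, here are the fields `main()` reads, collected into a `types.SimpleNamespace` for illustration (the container and all concrete values are assumptions, not the project's actual configuration mechanism):

from types import SimpleNamespace

# Illustrative config only: the field names mirror the attributes read in main(),
# but the SimpleNamespace container and the concrete values are assumptions.
cfg = SimpleNamespace(
    n_cpus=8,                                  # CPU workers for segmentation / MFA
    n_gpus=1,                                  # GPUs for Whisper transcription
    cut_length=10,                             # target utterance length in seconds
    max_length=20,                             # max utterance length in seconds
    mfa_config_path="/path/to/mfa_config.yaml",        # MFA config file
    mfa_dict_path="/path/to/mfa_dictionary.dict",      # MFA pronunciation dictionary
    mfa_model_path="/path/to/mfa_acoustic_model.zip",  # MFA acoustic model
    whisper_model_id="openai/whisper-medium",  # Whisper model used for transcription
)

main("/path/to/output", "/path/to/dataset/librilight", cfg)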
@@ -0,0 +1,329 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import json
from tqdm import tqdm
import os
import torchaudio
import torch

from utils.mfa_prepare import (
    process_wav_files,
    get_wav_files,
    filter_wav_files_by_length,
)
from utils.cut_by_vad import cut_segments
from utils.whisper_transcription import asr_main
from utils.util import has_existed

import subprocess
import random
from collections import defaultdict
from glob import glob
import shutil


def librilight_statistics(data_dir):
    """Get statistics for librilight dataset"""
    distribution2speakers2utts = defaultdict(lambda: defaultdict(list))
    distribution_infos = glob(data_dir + "/*")
    for distribution_info in distribution_infos:
        distribution = distribution_info.split("/")[-1]
        print(distribution)
        speaker_infos = glob(distribution_info + "/*")
        if len(speaker_infos) == 0:
            continue
        for speaker_info in speaker_infos:
            speaker = speaker_info.split("/")[-1]
            utts = glob(speaker_info + "/*.wav")
            for utt in utts:
                uid = utt.split("/")[-1].split(".")[0]
                distribution2speakers2utts[distribution][speaker].append(uid)
    return distribution2speakers2utts


def get_speakers_from_directory(directory):
    return [
        d for d in os.listdir(directory) if os.path.isdir(os.path.join(directory, d))
    ]


def split_dataset_by_speaker(base_dir, train_ratio=0.8, dev_ratio=0.1):
    train_dir = os.path.join(base_dir, "train")
    dev_dir = os.path.join(base_dir, "dev")
    eval_dir = os.path.join(base_dir, "eval")

    # Check if dataset is already split
    if has_existed(train_dir) or has_existed(dev_dir) or has_existed(eval_dir):
        print("Dataset already split. Calculating speakers...")
        train_speakers = get_speakers_from_directory(train_dir)
        dev_speakers = get_speakers_from_directory(dev_dir)
        eval_speakers = get_speakers_from_directory(eval_dir)
        all_speakers = train_speakers + dev_speakers + eval_speakers
        unique_speakers = list(set(all_speakers))
        unique_speakers.sort()
        return unique_speakers

    # List all directories in the base directory
    all_speakers = [
        d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))
    ]
    random.shuffle(all_speakers)

    # Calculate split sizes
    total_speakers = len(all_speakers)
    train_size = int(total_speakers * train_ratio)
    dev_size = int(total_speakers * dev_ratio)
    eval_size = total_speakers - train_size - dev_size
    print("Total speakers:", total_speakers)
    print("Train speakers:", train_size)
    print("Dev speakers:", dev_size)
    print("Eval speakers:", eval_size)

    # Split directories
    train_speakers = all_speakers[:train_size]
    dev_speakers = all_speakers[train_size : train_size + dev_size]
    eval_speakers = all_speakers[train_size + dev_size :]

    # Function to move directories
    def move_speakers(speakers, target_dir):
        # Make sure the target split directory exists before moving speakers into it
        os.makedirs(target_dir, exist_ok=True)
        for speaker in speakers:
            shutil.move(
                os.path.join(base_dir, speaker), os.path.join(target_dir, speaker)
            )

    # Move directories
    print("Moving directories...")
    print("Moving Train speakers...")
    move_speakers(train_speakers, train_dir)
    print("Moving Dev speakers...")
    move_speakers(dev_speakers, dev_dir)
    print("Moving Eval speakers...")
    move_speakers(eval_speakers, eval_dir)

    unique_speakers = list(set(all_speakers))
    unique_speakers.sort()
    return unique_speakers


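# Each entry that save_meta_data() below writes to train.json / dev.json /
# eval.json has the following shape (field values here are placeholders, not
# output from a real run):
#     {
#         "Dataset": "librilight",
#         "Singer": "<speaker id>",
#         "Uid": "<distribution>#<speaker id>#<utterance id>",
#         "Path": "<processed_dir>/<distribution>/<speaker id>/<utterance id>.wav",
#         "Text": "<whisper transcription>",
#         "Duration": <duration in seconds>,
#         "index": <running index within the split>
#     }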
def save_meta_data(save_dir, processed_dir, distribution2speakers2utts, speakers):
    """Save metadata for librilight dataset"""
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    valid_output_file = os.path.join(save_dir, "dev.json")
    test_output_file = os.path.join(save_dir, "eval.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    if has_existed(train_output_file):
        print("Metadata already exists. Skipping...")
        return
    # Open utt2singer only after the early-return check so an existing file is not truncated
    utt2singer = open(utt2singer_file, "w")

    train = []
    test = []
    valid = []

    train_index_count = 0
    test_index_count = 0
    valid_index_count = 0

    train_total_duration = 0
    test_total_duration = 0
    valid_total_duration = 0

    # Save metadata
    for distribution, speakers2utts in tqdm(distribution2speakers2utts.items()):
        for speaker, utts in tqdm(speakers2utts.items()):
            for chosen_uid in utts:
                res = {
                    "Dataset": "librilight",
                    "Singer": speaker,
                    "Uid": "{}#{}#{}".format(distribution, speaker, chosen_uid),
                }
                res["Path"] = "{}/{}/{}.wav".format(distribution, speaker, chosen_uid)
                res["Path"] = os.path.join(processed_dir, res["Path"])
                assert os.path.exists(res["Path"])

                text_file_path = os.path.join(
                    processed_dir,
                    distribution,
                    speaker,
                    chosen_uid + ".txt",
                )
                with open(text_file_path, "r") as f:
                    lines = f.readlines()
                    assert len(lines) == 1
                    text = lines[0].strip()
                    res["Text"] = text

                waveform, sample_rate = torchaudio.load(res["Path"])
                duration = waveform.size(-1) / sample_rate
                res["Duration"] = duration

                if "train" in distribution:
                    res["index"] = train_index_count
                    train_total_duration += duration
                    train.append(res)
                    train_index_count += 1
                elif "dev" in distribution:
                    res["index"] = valid_index_count
                    valid_total_duration += duration
                    valid.append(res)
                    valid_index_count += 1
                elif "eval" in distribution:
                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1
                utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))
    print("Done!")
    print(
        "Utterance count: train = {}, dev = {}, eval = {}".format(
            len(train), len(valid), len(test)
        )
    )
    print(
        "Train duration = {} hours, Dev duration = {} hours, Eval duration = {} hours".format(
            train_total_duration / 3600,
            valid_total_duration / 3600,
            test_total_duration / 3600,
        )
    )
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)
    with open(valid_output_file, "w") as f:
        json.dump(valid, f, indent=4, ensure_ascii=False)
    utt2singer.close()
    singer_lut = {name: i for i, name in enumerate(speakers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
    print("Metadata saved to", save_dir)


def main(output_path, dataset_path, cfg):
    """Preprocess librilight dataset"""
    n_cpus = cfg.n_cpus  # number of cpus to use for preprocessing
    n_gpus = cfg.n_gpus  # number of gpus to use for transcription
    cut_length = cfg.cut_length  # target length of utterance in seconds
    max_length = cfg.max_length  # max length of utterance in seconds

    # MFA files
    mfa_config_path = cfg.mfa_config_path  # path to mfa config file
    mfa_dict_path = cfg.mfa_dict_path  # path to mfa dict file
    mfa_model_path = cfg.mfa_model_path  # path to mfa model file

    # check if mfa files exist
    if (
        not os.path.exists(mfa_dict_path)
        or not os.path.exists(mfa_model_path)
        or not os.path.exists(mfa_config_path)
    ):
        raise Exception("MFA files not found.")

    # Whisper model id
    model_id = cfg.whisper_model_id  # id of whisper model to use for transcription

    subsets = [
        d
        for d in os.listdir(dataset_path)
        if (
            os.path.isdir(os.path.join(dataset_path, d))
            and d in ["tiny", "small", "medium", "large"]
        )
    ]
    print("Found subsets:", subsets)

    if len(subsets) == 0:
        print("No subsets found. Exiting...")
        return

    # Preprocess each subset
    for subset in subsets:
        # Construct paths based on the base path
        print("Pre-processing Libri-light subset:", subset)
        raw_dir = f"{dataset_path}/{subset}"
        save_dir = f"{output_path}/{subset}"
        processed_dir = f"{dataset_path}/processed/{subset}"
        os.makedirs(processed_dir, exist_ok=True)
        os.makedirs(save_dir, exist_ok=True)

        # Step 1: Segmentation
        print("-" * 10)
        print("Step 1: Segmentation")
        print("Cutting audio files...")

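        # cut_by_vad.cut_segments splits each raw recording into chunks of roughly
        # cut_length seconds (VAD-based segmentation, per the module name) and
        # writes the results under processed_dir.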
        cut_segments(raw_dir, processed_dir, cut_length, n_cpus)

        # Steps 2 & 3: Filter and Preprocess
        print("-" * 10)
        print("Step 2 & 3: Filter and Preprocess")
        print("Filtering and preprocessing audio files...")

        wav_files = get_wav_files(processed_dir)
        filtered_wav_files = filter_wav_files_by_length(wav_files, max_length)
        process_wav_files(filtered_wav_files, processed_dir, n_cpus)

        # Step 4 & 5: Transcription & Text-preprocess
        print("-" * 10)
        print("Step 4 & 5: Transcription & Text-preprocess")
        print("Transcribing audio files...")

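        # Transcription runs on up to n_gpus CUDA devices (capped below by the number
        # actually available) with the Whisper model selected by cfg.whisper_model_id;
        # see utils.whisper_transcription.asr_main.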
        n_gpus = min(n_gpus, torch.cuda.device_count())
        asr_main(processed_dir, n_gpus, model_id)

        # Step 6: MFA Align
        print("-" * 10)
        print("Step 6: MFA Align")
        print("Aligning audio files...")

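        # mfa align positional arguments: corpus directory, pronunciation dictionary,
        # acoustic model, output directory.  processed_dir serves as both the corpus
        # and the output directory, so TextGrids are written next to the wav/txt files.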
        command = [
            "mfa",
            "align",
            "-v",
            "-j",
            str(n_cpus),
            "-c",
            mfa_config_path,
            processed_dir,
            mfa_dict_path,
            mfa_model_path,
            processed_dir,
            "--output_format",
            "long_textgrid",
            "--clean",
            "--overwrite",
        ]
        subprocess.run(command, text=True)

        # Step 7: train/dev/eval split
        print("-" * 10)
        print("Step 7: train/dev/eval split")
        print("Splitting dataset by speaker...")

        speakers = split_dataset_by_speaker(processed_dir)

        # Step 8: Statistics
        print("-" * 10)
        print("Step 8: Statistics")
        print("Calculating statistics...")

        distribution2speakers2utts = librilight_statistics(processed_dir)

        # Step 9: Save metadata
        print("-" * 10)
        print("Step 9: Save metadata")
        print("Preparing Metadata for Librilight...")

        save_meta_data(save_dir, processed_dir, distribution2speakers2utts, speakers)
        print("Preprocessing subset", subset, "done!")
        print("-" * 10)


if __name__ == "__main__":
    dataset_path = "/path/to/dataset/librilight"
    output_path = "/path/to/output"
    # NOTE: main() also expects a cfg object carrying the fields it reads
    # (n_cpus, n_gpus, cut_length, max_length, the MFA paths, and
    # whisper_model_id); supply one when running this script directly.
    main(output_path, dataset_path)