Merge branch 'pr/Edresson/731-rebased' into dev
erogol committed Jan 1, 2022
2 parents 8d2bb28 + 33711af commit d37cfe4
Showing 55 changed files with 2,623 additions and 321 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -128,6 +128,8 @@ core
recipes/WIP/*
recipes/ljspeech/LJSpeech-1.1/*
recipes/vctk/VCTK/*
+recipes/**/*.npy
+recipes/**/*.json
VCTK-Corpus-removed-silence/*

# ignore training logs
@@ -161,4 +163,5 @@ speakers.json
internal/*
*_pitch.npy
*_phoneme.npy
-wandb
+wandb
+depot/*
12 changes: 12 additions & 0 deletions TTS/.models.json
@@ -1,5 +1,17 @@
{
    "tts_models": {
+        "multilingual":{
+            "multi-dataset":{
+                "your_tts":{
+                    "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.5.0_models/tts_models--multilingual--multi-dataset--your_tts.zip",
+                    "default_vocoder": null,
+                    "commit": "e9a1953e",
+                    "license": "CC BY-NC-ND 4.0",
+                    "contact": "[email protected]"
+                }
+            }
+        },
        "en": {
            "ek1": {
                "tacotron2": {
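The nesting of this catalog entry maps directly onto the model name users pass on the command line, as the release zip name ("tts_models--multilingual--multi-dataset--your_tts") also reflects. A small illustrative sketch of that mapping; the traversal below is for orientation only, not the library's own resolution code:

# Illustration: how the catalog nesting yields the public model name
# "tts_models/multilingual/multi-dataset/your_tts".
import json

with open("TTS/.models.json") as f:
    catalog = json.load(f)

entry = catalog["tts_models"]["multilingual"]["multi-dataset"]["your_tts"]
model_name = "/".join(["tts_models", "multilingual", "multi-dataset", "your_tts"])
print(model_name, "->", entry["github_rls_url"])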
15 changes: 10 additions & 5 deletions TTS/bin/extract_tts_spectrograms.py
@@ -12,7 +12,7 @@
from TTS.config import load_config
from TTS.tts.datasets import TTSDataset, load_tts_samples
from TTS.tts.models import setup_model
-from TTS.tts.utils.speakers import get_speaker_manager
+from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import count_parameters

@@ -37,8 +37,8 @@ def setup_loader(ap, r, verbose=False):
        enable_eos_bos=c.enable_eos_bos_chars,
        use_noise_augment=False,
        verbose=verbose,
-        speaker_id_mapping=speaker_manager.speaker_ids,
-        d_vector_mapping=speaker_manager.d_vectors if c.use_speaker_embedding and c.use_d_vector_file else None,
+        speaker_id_mapping=speaker_manager.speaker_ids if c.use_speaker_embedding else None,
+        d_vector_mapping=speaker_manager.d_vectors if c.use_d_vector_file else None,
    )

    if c.use_phonemes and c.compute_input_seq_cache:
@@ -234,8 +234,13 @@ def main(args):  # pylint: disable=redefined-outer-name
    # use eval and training partitions
    meta_data = meta_data_train + meta_data_eval

-    # parse speakers
-    speaker_manager = get_speaker_manager(c, args, meta_data_train)
+    # init speaker manager
+    if c.use_speaker_embedding:
+        speaker_manager = SpeakerManager(data_items=meta_data)
+    elif c.use_d_vector_file:
+        speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
+    else:
+        speaker_manager = None

    # setup model
    model = setup_model(c)
62 changes: 62 additions & 0 deletions TTS/bin/find_unique_phonemes.py
@@ -0,0 +1,62 @@
"""Find all the unique characters in a dataset"""
import argparse
import multiprocessing
from argparse import RawTextHelpFormatter

from tqdm.contrib.concurrent import process_map

from TTS.config import load_config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.text import text2phone


def compute_phonemes(item):
try:
text = item[0]
language = item[-1]
ph = text2phone(text, language, use_espeak_phonemes=c.use_espeak_phonemes).split("|")
except:
return []
return list(set(ph))


def main():
# pylint: disable=W0601
global c
# pylint: disable=bad-option-value
parser = argparse.ArgumentParser(
description="""Find all the unique characters or phonemes in a dataset.\n\n"""
"""
Example runs:
python TTS/bin/find_unique_chars.py --config_path config.json
""",
formatter_class=RawTextHelpFormatter,
)
parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
args = parser.parse_args()

c = load_config(args.config_path)

# load all datasets
train_items, eval_items = load_tts_samples(c.datasets, eval_split=True)
items = train_items + eval_items
print("Num items:", len(items))

phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
phones = []
for ph in phonemes:
phones.extend(ph)
phones = set(phones)
lower_phones = filter(lambda c: c.islower(), phones)
phones_force_lower = [c.lower() for c in phones]
phones_force_lower = set(phones_force_lower)

print(f" > Number of unique phonemes: {len(phones)}")
print(f" > Unique phonemes: {''.join(sorted(phones))}")
print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")


if __name__ == "__main__":
main()
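For reference, a minimal usage sketch of the text2phone call this script parallelizes. The input sentence, language code, and flag value below are illustrative, not taken from any dataset config; text2phone returns a "|"-separated phoneme string, which is why the script splits on that character before deduplicating.

# Illustrative only: "en" and use_espeak_phonemes=True are assumed values.
from TTS.tts.utils.text import text2phone

ph = text2phone("Hello world.", "en", use_espeak_phonemes=True).split("|")
print(f"{len(set(ph))} unique phonemes in the sample sentence")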
89 changes: 89 additions & 0 deletions TTS/bin/remove_silence_using_vad.py
@@ -0,0 +1,89 @@
import argparse
import glob
import multiprocessing
import os
import pathlib

from tqdm.contrib.concurrent import process_map

from TTS.utils.vad import get_vad_speech_segments, read_wave, write_wave


def remove_silence(filepath):
    output_path = filepath.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
    # skip if the output file already exists
    if os.path.exists(output_path) and not args.force:
        return

    # create the full directory structure of the output file
    pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    # load wave
    audio, sample_rate = read_wave(filepath)

    # get speech segments
    segments = get_vad_speech_segments(audio, sample_rate, aggressiveness=args.aggressiveness)

    segments = list(segments)
    num_segments = len(segments)
    flag = False
    # create the output wave
    if num_segments != 0:
        # walk the segments in reverse, folding all but the first into
        # `concat_segment`, then prepend the first segment and save
        for i, segment in reversed(list(enumerate(segments))):
            if i >= 1:
                if not flag:
                    concat_segment = segment
                    flag = True
                else:
                    concat_segment = segment + concat_segment
            else:
                if flag:
                    segment = segment + concat_segment
                write_wave(output_path, segment, sample_rate)
                return
    else:
        print("> Just copying the file to:", output_path)
        # if no speech was detected, write the file unchanged
        write_wave(output_path, audio, sample_rate)
        return


def preprocess_audios():
    files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
    print("> Number of files: ", len(files))
    if not args.force:
        print("> Ignoring files that already exist in the output directory.")

    if files:
        # process the files in parallel, one worker per CPU core
        num_workers = multiprocessing.cpu_count()
        process_map(remove_silence, files, max_workers=num_workers, chunksize=15)
    else:
        print("> No files found!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="python remove_silence.py -i=VCTK-Corpus-bk/ -o=../VCTK-Corpus-removed-silence -g=wav48/*/*.wav -a=2"
    )
    parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir")
    parser.add_argument(
        "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output dataset dir"
    )
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Force overwriting of existing files")
    parser.add_argument(
        "-g",
        "--glob",
        type=str,
        default="**/*.wav",
        help="Glob pattern, relative to input_dir, for selecting wavs. Ex: wav48/*/*.wav",
    )
    parser.add_argument(
        "-a",
        "--aggressiveness",
        type=int,
        default=2,
        help="VAD aggressiveness mode, an integer between 0 and 3: 0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.",
    )
    args = parser.parse_args()
    preprocess_audios()
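The helpers imported above (read_wave, write_wave, get_vad_speech_segments) live in TTS/utils/vad.py, added elsewhere in this commit and not shown here. For orientation only, a hypothetical sketch of the frame-level classification such helpers typically build on a webrtcvad-style backend; it illustrates what the aggressiveness parameter controls and is not the repository's implementation:

# Hypothetical sketch based on the public webrtcvad API, not TTS/utils/vad.py.
# webrtcvad expects 16-bit mono PCM at 8/16/32/48 kHz in 10/20/30 ms frames.
import webrtcvad


def speech_flags(pcm_bytes, sample_rate, aggressiveness=2, frame_ms=30):
    """Yield (byte_offset, is_speech) for consecutive fixed-size frames."""
    vad = webrtcvad.Vad(aggressiveness)  # 0 = least aggressive, 3 = most
    frame_size = int(sample_rate * frame_ms / 1000) * 2  # 2 bytes per sample
    for offset in range(0, len(pcm_bytes) - frame_size + 1, frame_size):
        yield offset, vad.is_speech(pcm_bytes[offset:offset + frame_size], sample_rate)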
30 changes: 28 additions & 2 deletions TTS/bin/synthesize.py
@@ -152,12 +152,19 @@ def main():

    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
+    parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
    parser.add_argument(
        "--speaker_idx",
        type=str,
        help="Target speaker ID for a multi-speaker TTS model.",
        default=None,
    )
+    parser.add_argument(
+        "--language_idx",
+        type=str,
+        help="Target language ID for a multi-lingual TTS model.",
+        default=None,
+    )
    parser.add_argument(
        "--speaker_wav",
        nargs="+",
@@ -173,6 +180,14 @@
        const=True,
        default=False,
    )
+    parser.add_argument(
+        "--list_language_idxs",
+        help="List available language ids for the defined multi-lingual model.",
+        type=str2bool,
+        nargs="?",
+        const=True,
+        default=False,
+    )
    # aux args
    parser.add_argument(
        "--save_spectogram",
@@ -184,7 +199,7 @@
    args = parser.parse_args()

    # print the usage help if no text and no listing flag is set
-    if args.text is None and not args.list_models and not args.list_speaker_idxs:
+    if args.text is None and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs:
        parser.parse_args(["-h"])

    # load model manager
@@ -194,6 +209,7 @@
    model_path = None
    config_path = None
    speakers_file_path = None
+    language_ids_file_path = None
    vocoder_path = None
    vocoder_config_path = None
    encoder_path = None
@@ -217,6 +233,7 @@
        model_path = args.model_path
        config_path = args.config_path
        speakers_file_path = args.speakers_file_path
+        language_ids_file_path = args.language_ids_file_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
@@ -231,6 +248,7 @@
        model_path,
        config_path,
        speakers_file_path,
+        language_ids_file_path,
        vocoder_path,
        vocoder_config_path,
        encoder_path,
@@ -246,6 +264,14 @@
        print(synthesizer.tts_model.speaker_manager.speaker_ids)
        return

+    # query language ids of a multi-lingual model.
+    if args.list_language_idxs:
+        print(
+            " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model.)"
+        )
+        print(synthesizer.tts_model.language_manager.language_id_mapping)
+        return
+
    # check the arguments against a multi-speaker model.
    if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
        print(
@@ -258,7 +284,7 @@
print(" > Text: {}".format(args.text))

# kick it
wav = synthesizer.tts(args.text, args.speaker_idx, args.speaker_wav, args.gst_style)
wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav)

# save the results
print(" > Saving output to {}".format(args.out_path))
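For context, a sketch of the equivalent Python-API call with the new multi-lingual arguments. The constructor argument order mirrors the positional call shown above, the tts() signature matches the updated line, and everything else is an assumption: the paths are placeholders, and save_wav is assumed to be available on Synthesizer.

# Sketch only: placeholder paths; argument order taken from the diff above.
from TTS.utils.synthesizer import Synthesizer

synthesizer = Synthesizer(
    "model.pth.tar",      # model_path (placeholder)
    "config.json",        # config_path (placeholder)
    "speakers.json",      # speakers_file_path (placeholder)
    "language_ids.json",  # language_ids_file_path (placeholder)
)
# tts() signature as updated above: (text, speaker_idx, language_idx, speaker_wav)
wav = synthesizer.tts("Hello world.", "p225", "en", None)
synthesizer.save_wav(wav, "output.wav")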
4 changes: 2 additions & 2 deletions TTS/bin/train_encoder.py
@@ -11,7 +11,7 @@

from TTS.speaker_encoder.dataset import SpeakerEncoderDataset
from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
-from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model
+from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_speaker_encoder_model
from TTS.speaker_encoder.utils.training import init_training
from TTS.speaker_encoder.utils.visual import plot_embeddings
from TTS.tts.datasets import load_tts_samples
@@ -151,7 +151,7 @@ def main(args):  # pylint: disable=redefined-outer-name
    global meta_data_eval

    ap = AudioProcessor(**c.audio)
-    model = setup_model(c)
+    model = setup_speaker_encoder_model(c)

    optimizer = RAdam(model.parameters(), lr=c.lr)

28 changes: 23 additions & 5 deletions TTS/bin/train_tts.py
@@ -1,9 +1,10 @@
import os

-from TTS.config import load_config, register_config
+from TTS.config import check_config_and_model_args, get_from_config_or_model_args, load_config, register_config
from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models import setup_model
+from TTS.tts.utils.languages import LanguageManager
from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor

@@ -45,15 +46,32 @@ def main():
    ap = AudioProcessor(**config.audio)

    # init speaker manager
-    if config.use_speaker_embedding:
+    if check_config_and_model_args(config, "use_speaker_embedding", True):
        speaker_manager = SpeakerManager(data_items=train_samples + eval_samples)
-    elif config.use_d_vector_file:
-        speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file)
+        if hasattr(config, "model_args"):
+            config.model_args.num_speakers = speaker_manager.num_speakers
+        else:
+            config.num_speakers = speaker_manager.num_speakers
+    elif check_config_and_model_args(config, "use_d_vector_file", True):
+        speaker_manager = SpeakerManager(d_vectors_file_path=get_from_config_or_model_args(config, "d_vector_file"))
+        if hasattr(config, "model_args"):
+            config.model_args.num_speakers = speaker_manager.num_speakers
+        else:
+            config.num_speakers = speaker_manager.num_speakers
    else:
        speaker_manager = None

+    if hasattr(config, "use_language_embedding") and config.use_language_embedding:
+        language_manager = LanguageManager(config=config)
+        if hasattr(config, "model_args"):
+            config.model_args.num_languages = language_manager.num_languages
+        else:
+            config.num_languages = language_manager.num_languages
+    else:
+        language_manager = None
+
    # init the model from config
-    model = setup_model(config, speaker_manager)
+    model = setup_model(config, speaker_manager, language_manager)

    # init the trainer and 🚀
    trainer = Trainer(
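The two config helpers let this script support both flat (old-style) configs and configs that nest settings under model_args (new-style). A hypothetical sketch of their assumed lookup order, for illustration only; the real implementations live in TTS.config:

# Hypothetical sketch: prefer config.model_args, fall back to the top level.
def get_from_config_or_model_args(config, arg):
    if hasattr(config, "model_args") and hasattr(config.model_args, arg):
        return getattr(config.model_args, arg)
    return getattr(config, arg, None)


def check_config_and_model_args(config, arg, value):
    # True when the resolved setting equals `value`, e.g. use_speaker_embedding == True
    return get_from_config_or_model_args(config, arg) == value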
