Skip to content

Commit

Permalink
Tortoise TTS inference (coqui-ai#2547)
Browse files Browse the repository at this point in the history
* initial commit

* Tortoise inference

* revert path change

* style fix

* remove accidental remove

* style fixes

* style fixes

* removed unwanted assets and deps

* remove changes

* remove cvvp

* style fix black

* added tortoise config and updated config and args, refactoring the code

* added tortoise to api

* Pull mel_norm from url

* Use TTS cleaners

* Let download model files

* add ability to pass tortoise presets through coqui api

* fix tests

* fix style and tests

* fix tts commandline for tortoise

* Add config.json to tortoise

* Use kwargs

* Use regular model api for loading tortoise

* Add load from dir to synthesizer

* Fix Tortoise floats

* Use model_dir when there are multiple urls

* Use `synthesize` when exists

* lint fixes and resolve preset bug

* resolve a download bug and update model link

* fix json

* do tortoise inference from voice dir

* fix

* fix test

* fix speaker id and remove assets

* update inference_tests.yml

* replace inference_test.yml

* fix extra dir as None

* fix tests

* remove space

* Reformat docstring

* Add docs

* Update docs

* lint fixes

---------

Co-authored-by: Eren Gölge <[email protected]>
Co-authored-by: Eren Gölge <[email protected]>
  • Loading branch information
3 people authored May 15, 2023
1 parent 0b6b957 commit a3d5801
Show file tree
Hide file tree
Showing 31 changed files with 8,298 additions and 35 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/inference_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,4 @@ jobs:
- name: Unit tests
run: make inference_tests
env:
COQUI_STUDIO_TOKEN: ${{ secrets.COQUI_STUDIO_TOKEN }}
COQUI_STUDIO_TOKEN: ${{ secrets.COQUI_STUDIO_TOKEN }}
20 changes: 20 additions & 0 deletions TTS/.models.json
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,26 @@
"license": "apache 2.0",
"contact": "[email protected]"
}

},
"multi-dataset":{
"tortoise-v2":{
"description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
"github_rls_url": ["https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
],
"commit": "c1875f6",
"default_vocoder": null,
"author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi",
"license": "apache 2.0"
}
},
"jenny": {
"jenny":{
Expand Down
30 changes: 21 additions & 9 deletions TTS/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,10 +342,14 @@ def list_models():

def download_model_by_name(self, model_name: str):
model_path, config_path, model_item = self.manager.download_model(model_name)
if isinstance(model_item["github_rls_url"], list):
# return model directory if there are multiple files
# we assume that the model knows how to load itself
return None, None, None, None, model_path
if model_item.get("default_vocoder") is None:
return model_path, config_path, None, None
return model_path, config_path, None, None, None
vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
return model_path, config_path, vocoder_path, vocoder_config_path
return model_path, config_path, vocoder_path, vocoder_config_path, None

def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
"""Load one of the voice conversion models by name.
Expand All @@ -355,7 +359,7 @@ def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
self.model_name = model_name
model_path, config_path, _, _ = self.download_model_by_name(model_name)
model_path, config_path, _, _, _ = self.download_model_by_name(model_name)
self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)

def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
Expand All @@ -374,7 +378,9 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
if "coqui_studio" in model_name:
self.csapi = CS_API()
else:
model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name)
model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
model_name
)

# init synthesizer
# None values are fetch from the model
Expand All @@ -387,6 +393,7 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
vocoder_config=vocoder_config_path,
encoder_checkpoint=None,
encoder_config=None,
model_dir=model_dir,
use_cuda=gpu,
)

Expand Down Expand Up @@ -422,6 +429,7 @@ def _check_arguments(
speaker_wav: str = None,
emotion: str = None,
speed: float = None,
**kwargs,
) -> None:
"""Check if the arguments are valid for the model."""
if not self.is_coqui_studio:
Expand All @@ -430,7 +438,7 @@ def _check_arguments(
raise ValueError("Model is multi-speaker but no `speaker` is provided.")
if self.is_multi_lingual and language is None:
raise ValueError("Model is multi-lingual but no `language` is provided.")
if not self.is_multi_speaker and speaker is not None:
if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
raise ValueError("Model is not multi-speaker but `speaker` is provided.")
if not self.is_multi_lingual and language is not None:
raise ValueError("Model is not multi-lingual but `language` is provided.")
Expand Down Expand Up @@ -499,6 +507,7 @@ def tts(
speaker_wav: str = None,
emotion: str = None,
speed: float = None,
**kwargs,
):
"""Convert text to speech.
Expand All @@ -520,12 +529,13 @@ def tts(
Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
Defaults to None.
"""
self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed)
self._check_arguments(
speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
)
if self.csapi is not None:
return self.tts_coqui_studio(
text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed
)

wav = self.synthesizer.tts(
text=text,
speaker_name=speaker,
Expand All @@ -535,6 +545,7 @@ def tts(
style_wav=None,
style_text=None,
reference_speaker_name=None,
**kwargs,
)
return wav

Expand All @@ -547,6 +558,7 @@ def tts_to_file(
emotion: str = "Neutral",
speed: float = 1.0,
file_path: str = "output.wav",
**kwargs,
):
"""Convert text to speech.
Expand All @@ -569,13 +581,13 @@ def tts_to_file(
file_path (str, optional):
Output file path. Defaults to "output.wav".
"""
self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav)
self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)

if self.csapi is not None:
return self.tts_coqui_studio(
text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed, file_path=file_path
)
wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav)
wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
self.synthesizer.save_wav(wav=wav, path=file_path)
return file_path

Expand Down
20 changes: 19 additions & 1 deletion TTS/bin/synthesize.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,13 @@ def main():
help="Target audio file to convert in the voice of the source_wav",
)

parser.add_argument(
"--voice_dir",
type=str,
default=None,
help="Voice dir for tortoise model",
)

args = parser.parse_args()

# print the description if either text or list_models is not set
Expand Down Expand Up @@ -306,6 +313,7 @@ def main():
encoder_config_path = None
vc_path = None
vc_config_path = None
model_dir = None

# CASE1 #list : list pre-trained TTS models
if args.list_models:
Expand Down Expand Up @@ -335,7 +343,6 @@ def main():
# CASE4: load pre-trained model paths
if args.model_name is not None and not args.model_path:
model_path, config_path, model_item = manager.download_model(args.model_name)

# tts model
if model_item["model_type"] == "tts_models":
tts_path = model_path
Expand All @@ -348,6 +355,13 @@ def main():
vc_path = model_path
vc_config_path = config_path

# tts model with multiple files to be loaded from the directory path
if isinstance(model_item["github_rls_url"], list):
model_dir = model_path
tts_path = None
tts_config_path = None
args.vocoder_name = None

# load vocoder
if args.vocoder_name is not None and not args.vocoder_path:
vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
Expand Down Expand Up @@ -379,6 +393,8 @@ def main():
encoder_config_path,
vc_path,
vc_config_path,
model_dir,
args.voice_dir,
args.use_cuda,
)

Expand Down Expand Up @@ -427,6 +443,8 @@ def main():
source_wav=args.source_wav,
target_wav=args.target_wav,
)
elif model_dir is not None:
wav = synthesizer.tts(args.text, speaker_name=args.speaker_idx)

# save the results
print(" > Saving output to {}".format(args.out_path))
Expand Down
87 changes: 87 additions & 0 deletions TTS/tts/configs/tortoise_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from dataclasses import dataclass, field

from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.tortoise import TortoiseArgs, TortoiseAudioConfig


@dataclass
class TortoiseConfig(BaseTTSConfig):
    """Defines parameters for Tortoise TTS model.
    Args:
        model (str):
            Model name. Do not change unless you know what you are doing.
        model_args (TortoiseArgs):
            Model architecture arguments. Defaults to `TortoiseArgs()`.
        audio (TortoiseAudioConfig):
            Audio processing configuration. Defaults to `TortoiseAudioConfig()`.
        model_dir (str):
            Path to the folder that has all the Tortoise models. Defaults to None.
        temperature (float):
            Temperature for the autoregressive model inference. Larger values makes predictions more creative sacrificing stability. Defaults to `0.2`.
        length_penalty (float):
            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length,
            which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative),
            length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences.
        repetition_penalty (float):
            The parameter for repetition penalty. 1.0 means no penalty. Defaults to `2.0`.
        top_p (float):
            If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
            Defaults to `0.8`.
        cond_free_k (float):
            Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
            As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
            Formula is: output=cond_present_output*(cond_free_k+1)-cond_absent_output*cond_free_k. Defaults to `2.0`.
        diffusion_temperature (float):
            Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
            are the "mean" prediction of the diffusion network and will sound bland and smeared.
            Defaults to `1.0`.
        num_autoregressive_samples (int):
            Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
            As Tortoise is a probabilistic model, more samples means a higher probability of creating something "great".
            Defaults to `16`.
        diffusion_iterations (int):
            Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine
            the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better,
            however. Defaults to `30`.
        sampler (str):
            Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`.
    Note:
        Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
    Example:
        >>> from TTS.tts.configs.tortoise_config import TortoiseConfig
        >>> config = TortoiseConfig()
    """

    model: str = "tortoise"
    # model specific params
    model_args: TortoiseArgs = field(default_factory=TortoiseArgs)
    # NOTE: use a default_factory (not a shared class-level instance) so every
    # config object gets its own TortoiseAudioConfig — a shared default would be
    # silently mutated across all TortoiseConfig instances.
    audio: TortoiseAudioConfig = field(default_factory=TortoiseAudioConfig)
    model_dir: str = None

    # settings
    temperature: float = 0.2
    length_penalty: float = 1.0
    repetition_penalty: float = 2.0
    top_p: float = 0.8
    cond_free_k: float = 2.0
    diffusion_temperature: float = 1.0

    # inference params
    num_autoregressive_samples: int = 16
    diffusion_iterations: int = 30
    sampler: str = "ddim"
Loading

0 comments on commit a3d5801

Please sign in to comment.