forked from coqui-ai/TTS
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Tortoise TTS inference (coqui-ai#2547)
* initial commit * Tortoise inference * revert path change * style fix * remove accidental remove * style fixes * style fixes * removed unwanted assests and deps * remove changes * remove cvvp * style fix black * added tortoise config and updated config and args, refactoring the code * added tortoise to api * Pull mel_norm from url * Use TTS cleaners * Let download model files * add ability to pass tortoise presets through coqui api * fix tests * fix style and tests * fix tts commandline for tortoise * Add config.json to tortoise * Use kwargs * Use regular model api for loading tortoise * Add load from dir to synthesizer * Fix Tortoise floats * Use model_dir when there are multiple urls * Use `synthesize` when exists * lint fixes and resolve preset bug * resolve a download bug and update model link * fix json * do tortoise inference from voice dir * fix * fix test * fix speaker id and remove assests * update inference_tests.yml * replace inference_test.yml * fix extra dir as None * fix tests * remove space * Reformat docstring * Add docs * Update docs * lint fixes --------- Co-authored-by: Eren Gölge <[email protected]> Co-authored-by: Eren Gölge <[email protected]>
- Loading branch information
1 parent
0b6b957
commit a3d5801
Showing
31 changed files
with
8,298 additions
and
35 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -220,6 +220,26 @@ | |
"license": "apache 2.0", | ||
"contact": "[email protected]" | ||
} | ||
|
||
}, | ||
"multi-dataset":{ | ||
"tortoise-v2":{ | ||
"description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts", | ||
"github_rls_url": ["https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth", | ||
"https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth", | ||
"https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth", | ||
"https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth", | ||
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth", | ||
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth", | ||
"https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth", | ||
"https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth", | ||
"https://coqui.gateway.scarf.sh/v0.14.1_models/config.json" | ||
], | ||
"commit": "c1875f6", | ||
"default_vocoder": null, | ||
"author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi", | ||
"license": "apache 2.0" | ||
} | ||
}, | ||
"jenny": { | ||
"jenny":{ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
from dataclasses import dataclass, field | ||
|
||
from TTS.tts.configs.shared_configs import BaseTTSConfig | ||
from TTS.tts.models.tortoise import TortoiseArgs, TortoiseAudioConfig | ||
|
||
|
||
@dataclass | ||
class TortoiseConfig(BaseTTSConfig): | ||
"""Defines parameters for Tortoise TTS model. | ||
Args: | ||
model (str): | ||
Model name. Do not change unless you know what you are doing. | ||
model_args (TortoiseArgs): | ||
Model architecture arguments. Defaults to `TortoiseArgs()`. | ||
audio (TortoiseAudioConfig): | ||
Audio processing configuration. Defaults to `TortoiseAudioConfig()`. | ||
model_dir (str): | ||
Path to the folder that has all the Tortoise models. Defaults to None. | ||
temperature (float): | ||
Temperature for the autoregressive model inference. Larger values makes predictions more creative sacrificing stability. Defaults to `0.2`. | ||
length_penalty (float): | ||
Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length, | ||
which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative), | ||
length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences. | ||
reperation_penalty (float): | ||
The parameter for repetition penalty. 1.0 means no penalty. Defaults to `2.0`. | ||
top_p (float): | ||
If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. | ||
Defaults to `0.8`. | ||
cond_free_k (float): | ||
Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf]. | ||
As cond_free_k increases, the output becomes dominated by the conditioning-free signal. | ||
Formula is: output=cond_present_output*(cond_free_k+1)-cond_absenct_output*cond_free_k. Defaults to `2.0`. | ||
diffusion_temperature (float): | ||
Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0 | ||
are the "mean" prediction of the diffusion network and will sound bland and smeared. | ||
Defaults to `1.0`. | ||
num_autoregressive_samples (int): | ||
Number of samples taken from the autoregressive model, all of which are filtered using CLVP. | ||
As Tortoise is a probabilistic model, more samples means a higher probability of creating something "great". | ||
Defaults to `16`. | ||
diffusion_iterations (int): | ||
Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine | ||
the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better, | ||
however. Defaults to `30`. | ||
sampler (str): | ||
Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`. | ||
Note: | ||
Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters. | ||
Example: | ||
>>> from TTS.tts.configs.tortoise_config import TortoiseConfig | ||
>>> config = TortoiseConfig() | ||
""" | ||
|
||
model: str = "tortoise" | ||
# model specific params | ||
model_args: TortoiseArgs = field(default_factory=TortoiseArgs) | ||
audio: TortoiseAudioConfig = TortoiseAudioConfig() | ||
model_dir: str = None | ||
|
||
# settings | ||
temperature: float = 0.2 | ||
length_penalty: float = 1.0 | ||
repetition_penalty: float = 2.0 | ||
top_p: float = 0.8 | ||
cond_free_k: float = 2.0 | ||
diffusion_temperature: float = 1.0 | ||
|
||
# inference params | ||
num_autoregressive_samples: int = 16 | ||
diffusion_iterations: int = 30 | ||
sampler: str = "ddim" |
Oops, something went wrong.