Skip to content

Commit

Permalink
Tortoise TTS inference (coqui-ai#2547)
Browse files Browse the repository at this point in the history
* initial commit

* Tortoise inference

* revert path change

* style fix

* remove accidental remove

* style fixes

* style fixes

* removed unwanted assets and deps

* remove changes

* remove cvvp

* style fix black

* added tortoise config and updated config and args, refactoring the code

* added tortoise to api

* Pull mel_norm from url

* Use TTS cleaners

* Let download model files

* add ability to pass tortoise presets through coqui api

* fix tests

* fix style and tests

* fix tts commandline for tortoise

* Add config.json to tortoise

* Use kwargs

* Use regular model api for loading tortoise

* Add load from dir to synthesizer

* Fix Tortoise floats

* Use model_dir when there are multiple urls

* Use `synthesize` when exists

* lint fixes and resolve preset bug

* resolve a download bug and update model link

* fix json

* do tortoise inference from voice dir

* fix

* fix test

* fix speaker id and remove assets

* update inference_tests.yml

* replace inference_test.yml

* fix extra dir as None

* fix tests

* remove space

* Reformat docstring

* Add docs

* Update docs

* lint fixes

---------

Co-authored-by: Eren Gölge <[email protected]>
Co-authored-by: Eren Gölge <[email protected]>
  • Loading branch information
3 people authored May 15, 2023
1 parent 0b6b957 commit a3d5801
Show file tree
Hide file tree
Showing 31 changed files with 8,298 additions and 35 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/inference_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,4 @@ jobs:
- name: Unit tests
run: make inference_tests
env:
COQUI_STUDIO_TOKEN: ${{ secrets.COQUI_STUDIO_TOKEN }}
COQUI_STUDIO_TOKEN: ${{ secrets.COQUI_STUDIO_TOKEN }}
20 changes: 20 additions & 0 deletions TTS/.models.json
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,26 @@
"license": "apache 2.0",
"contact": "[email protected]"
}

},
"multi-dataset":{
"tortoise-v2":{
"description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
"github_rls_url": ["https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
],
"commit": "c1875f6",
"default_vocoder": null,
"author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi",
"license": "apache 2.0"
}
},
"jenny": {
"jenny":{
Expand Down
30 changes: 21 additions & 9 deletions TTS/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,10 +342,14 @@ def list_models():

def download_model_by_name(self, model_name: str):
model_path, config_path, model_item = self.manager.download_model(model_name)
if isinstance(model_item["github_rls_url"], list):
# return model directory if there are multiple files
# we assume that the model knows how to load itself
return None, None, None, None, model_path
if model_item.get("default_vocoder") is None:
return model_path, config_path, None, None
return model_path, config_path, None, None, None
vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
return model_path, config_path, vocoder_path, vocoder_config_path
return model_path, config_path, vocoder_path, vocoder_config_path, None

def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
"""Load one of the voice conversion models by name.
Expand All @@ -355,7 +359,7 @@ def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
self.model_name = model_name
model_path, config_path, _, _ = self.download_model_by_name(model_name)
model_path, config_path, _, _, _ = self.download_model_by_name(model_name)
self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)

def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
Expand All @@ -374,7 +378,9 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
if "coqui_studio" in model_name:
self.csapi = CS_API()
else:
model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name)
model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
model_name
)

# init synthesizer
# None values are fetch from the model
Expand All @@ -387,6 +393,7 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
vocoder_config=vocoder_config_path,
encoder_checkpoint=None,
encoder_config=None,
model_dir=model_dir,
use_cuda=gpu,
)

Expand Down Expand Up @@ -422,6 +429,7 @@ def _check_arguments(
speaker_wav: str = None,
emotion: str = None,
speed: float = None,
**kwargs,
) -> None:
"""Check if the arguments are valid for the model."""
if not self.is_coqui_studio:
Expand All @@ -430,7 +438,7 @@ def _check_arguments(
raise ValueError("Model is multi-speaker but no `speaker` is provided.")
if self.is_multi_lingual and language is None:
raise ValueError("Model is multi-lingual but no `language` is provided.")
if not self.is_multi_speaker and speaker is not None:
if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
raise ValueError("Model is not multi-speaker but `speaker` is provided.")
if not self.is_multi_lingual and language is not None:
raise ValueError("Model is not multi-lingual but `language` is provided.")
Expand Down Expand Up @@ -499,6 +507,7 @@ def tts(
speaker_wav: str = None,
emotion: str = None,
speed: float = None,
**kwargs,
):
"""Convert text to speech.
Expand All @@ -520,12 +529,13 @@ def tts(
Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
Defaults to None.
"""
self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed)
self._check_arguments(
speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
)
if self.csapi is not None:
return self.tts_coqui_studio(
text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed
)

wav = self.synthesizer.tts(
text=text,
speaker_name=speaker,
Expand All @@ -535,6 +545,7 @@ def tts(
style_wav=None,
style_text=None,
reference_speaker_name=None,
**kwargs,
)
return wav

Expand All @@ -547,6 +558,7 @@ def tts_to_file(
emotion: str = "Neutral",
speed: float = 1.0,
file_path: str = "output.wav",
**kwargs,
):
"""Convert text to speech.
Expand All @@ -569,13 +581,13 @@ def tts_to_file(
file_path (str, optional):
Output file path. Defaults to "output.wav".
"""
self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav)
self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)

if self.csapi is not None:
return self.tts_coqui_studio(
text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed, file_path=file_path
)
wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav)
wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
self.synthesizer.save_wav(wav=wav, path=file_path)
return file_path

Expand Down
20 changes: 19 additions & 1 deletion TTS/bin/synthesize.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,13 @@ def main():
help="Target audio file to convert in the voice of the source_wav",
)

parser.add_argument(
"--voice_dir",
type=str,
default=None,
help="Voice dir for tortoise model",
)

args = parser.parse_args()

# print the description if either text or list_models is not set
Expand Down Expand Up @@ -306,6 +313,7 @@ def main():
encoder_config_path = None
vc_path = None
vc_config_path = None
model_dir = None

# CASE1 #list : list pre-trained TTS models
if args.list_models:
Expand Down Expand Up @@ -335,7 +343,6 @@ def main():
# CASE4: load pre-trained model paths
if args.model_name is not None and not args.model_path:
model_path, config_path, model_item = manager.download_model(args.model_name)

# tts model
if model_item["model_type"] == "tts_models":
tts_path = model_path
Expand All @@ -348,6 +355,13 @@ def main():
vc_path = model_path
vc_config_path = config_path

# tts model with multiple files to be loaded from the directory path
if isinstance(model_item["github_rls_url"], list):
model_dir = model_path
tts_path = None
tts_config_path = None
args.vocoder_name = None

# load vocoder
if args.vocoder_name is not None and not args.vocoder_path:
vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
Expand Down Expand Up @@ -379,6 +393,8 @@ def main():
encoder_config_path,
vc_path,
vc_config_path,
model_dir,
args.voice_dir,
args.use_cuda,
)

Expand Down Expand Up @@ -427,6 +443,8 @@ def main():
source_wav=args.source_wav,
target_wav=args.target_wav,
)
elif model_dir is not None:
wav = synthesizer.tts(args.text, speaker_name=args.speaker_idx)

# save the results
print(" > Saving output to {}".format(args.out_path))
Expand Down
87 changes: 87 additions & 0 deletions TTS/tts/configs/tortoise_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from dataclasses import dataclass, field

from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.tortoise import TortoiseArgs, TortoiseAudioConfig


@dataclass
class TortoiseConfig(BaseTTSConfig):
    """Defines parameters for Tortoise TTS model.
    Args:
        model (str):
            Model name. Do not change unless you know what you are doing.
        model_args (TortoiseArgs):
            Model architecture arguments. Defaults to `TortoiseArgs()`.
        audio (TortoiseAudioConfig):
            Audio processing configuration. Defaults to `TortoiseAudioConfig()`.
        model_dir (str):
            Path to the folder that has all the Tortoise models. Defaults to None.
        temperature (float):
            Temperature for the autoregressive model inference. Larger values makes predictions more creative sacrificing stability. Defaults to `0.2`.
        length_penalty (float):
            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length,
            which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative),
            length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences.
        repetition_penalty (float):
            The parameter for repetition penalty. 1.0 means no penalty. Defaults to `2.0`.
        top_p (float):
            If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
            Defaults to `0.8`.
        cond_free_k (float):
            Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
            As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
            Formula is: output=cond_present_output*(cond_free_k+1)-cond_absent_output*cond_free_k. Defaults to `2.0`.
        diffusion_temperature (float):
            Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
            are the "mean" prediction of the diffusion network and will sound bland and smeared.
            Defaults to `1.0`.
        num_autoregressive_samples (int):
            Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
            As Tortoise is a probabilistic model, more samples means a higher probability of creating something "great".
            Defaults to `16`.
        diffusion_iterations (int):
            Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine
            the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better,
            however. Defaults to `30`.
        sampler (str):
            Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`.
    Note:
        Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
    Example:
        >>> from TTS.tts.configs.tortoise_config import TortoiseConfig
        >>> config = TortoiseConfig()
    """

    model: str = "tortoise"
    # model specific params
    model_args: TortoiseArgs = field(default_factory=TortoiseArgs)
    # NOTE: use a default_factory (not a shared class-level instance) so every
    # config object gets its own TortoiseAudioConfig — a shared default would be
    # silently mutated across all TortoiseConfig instances.
    audio: TortoiseAudioConfig = field(default_factory=TortoiseAudioConfig)
    model_dir: str = None

    # settings
    temperature: float = 0.2
    length_penalty: float = 1.0
    repetition_penalty: float = 2.0
    top_p: float = 0.8
    cond_free_k: float = 2.0
    diffusion_temperature: float = 1.0

    # inference params
    num_autoregressive_samples: int = 16
    diffusion_iterations: int = 30
    sampler: str = "ddim"
Loading

0 comments on commit a3d5801

Please sign in to comment.