Make text splitting optional
erogol committed Nov 27, 2023
1 parent 3b8894a commit b75e90b
Showing 3 changed files with 51 additions and 7 deletions.
48 changes: 44 additions & 4 deletions TTS/api.py
@@ -326,6 +326,7 @@ def tts(
speaker_wav: str = None,
emotion: str = None,
speed: float = None,
split_sentences: bool = True,
**kwargs,
):
"""Convert text to speech.
@@ -346,6 +347,12 @@ def tts(
speed (float, optional):
Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
Defaults to None.
split_sentences (bool, optional):
Split the text into sentences, synthesize each sentence separately, and concatenate the audio.
Setting it to False uses more VRAM and may hit model-specific text-length or VRAM limits. Only
applicable to the 🐸TTS models. Defaults to True.
kwargs (dict, optional):
Additional arguments for the model.
"""
self._check_arguments(
speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
@@ -363,6 +370,7 @@ def tts(
style_wav=None,
style_text=None,
reference_speaker_name=None,
split_sentences=split_sentences,
**kwargs,
)
return wav
@@ -377,6 +385,7 @@ def tts_to_file(
speed: float = 1.0,
pipe_out=None,
file_path: str = "output.wav",
split_sentences: bool = True,
**kwargs,
):
"""Convert text to speech.
@@ -401,6 +410,10 @@ def tts_to_file(
Flag to stdout the generated TTS wav file for shell pipe.
file_path (str, optional):
Output file path. Defaults to "output.wav".
split_sentences (bool, optional):
Split the text into sentences, synthesize each sentence separately, and concatenate the audio.
Setting it to False uses more VRAM and may hit model-specific text-length or VRAM limits. Only
applicable to the 🐸TTS models. Defaults to True.
kwargs (dict, optional):
Additional arguments for the model.
"""
@@ -416,7 +429,14 @@ def tts_to_file(
file_path=file_path,
pipe_out=pipe_out,
)
wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
wav = self.tts(
text=text,
speaker=speaker,
language=language,
speaker_wav=speaker_wav,
split_sentences=split_sentences,
**kwargs,
)
self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
return file_path

@@ -456,7 +476,14 @@ def voice_conversion_to_file(
save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
return file_path

def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None, speaker: str = None):
def tts_with_vc(
self,
text: str,
language: str = None,
speaker_wav: str = None,
speaker: str = None,
split_sentences: bool = True,
):
"""Convert text to speech with voice conversion.
It combines TTS with voice conversion to approximate voice cloning.
@@ -476,10 +503,16 @@ def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None,
speaker (str, optional):
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
split_sentences (bool, optional):
Split the text into sentences, synthesize each sentence separately, and concatenate the audio.
Setting it to False uses more VRAM and may hit model-specific text-length or VRAM limits. Only
applicable to the 🐸TTS models. Defaults to True.
"""
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
# Lazy code... save it to a temp file to resample it while reading it for VC
self.tts_to_file(text=text, speaker=speaker, language=language, file_path=fp.name)
self.tts_to_file(
text=text, speaker=speaker, language=language, file_path=fp.name, split_sentences=split_sentences
)
if self.voice_converter is None:
self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
@@ -492,6 +525,7 @@ def tts_with_vc_to_file(
speaker_wav: str = None,
file_path: str = "output.wav",
speaker: str = None,
split_sentences: bool = True,
):
"""Convert text to speech with voice conversion and save to file.
@@ -511,6 +545,12 @@ def tts_with_vc_to_file(
speaker (str, optional):
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
split_sentences (bool, optional):
Split the text into sentences, synthesize each sentence separately, and concatenate the audio.
Setting it to False uses more VRAM and may hit model-specific text-length or VRAM limits. Only
applicable to the 🐸TTS models. Defaults to True.
"""
wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav, speaker=speaker)
wav = self.tts_with_vc(
text=text, language=language, speaker_wav=speaker_wav, speaker=speaker, split_sentences=split_sentences
)
save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
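For context, here is a minimal usage sketch of the new option through the high-level API. It assumes a working 🐸TTS install; the model name, reference clip, and output paths below are illustrative placeholders, not part of this commit.

from TTS.api import TTS

# Load a multilingual model; XTTS v2 is used here purely as an example.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")

text = "Hello there. This is a second sentence."

# Default behaviour (split_sentences=True): the text is split into sentences,
# each sentence is synthesized on its own, and the audio is concatenated.
tts.tts_to_file(text=text, language="en", speaker_wav="reference.wav",
                file_path="split.wav")

# New in this commit: keep the text in one piece. This can preserve prosody
# across sentence boundaries, but uses more VRAM and may hit model-specific
# text-length limits.
tts.tts_to_file(text=text, language="en", speaker_wav="reference.wav",
                file_path="unsplit.wav", split_sentences=False)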
1 change: 0 additions & 1 deletion TTS/utils/manage.py
@@ -317,7 +317,6 @@ def _set_model_item(self, model_name):
f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/hash.md5",
],
}
print(model_item)
else:
# get model from models.json
model_type, lang, dataset, model = model_name.split("/")
9 changes: 7 additions & 2 deletions TTS/utils/synthesizer.py
@@ -264,6 +264,7 @@ def tts(
style_text=None,
reference_wav=None,
reference_speaker_name=None,
split_sentences: bool = True,
**kwargs,
) -> List[int]:
"""🐸 TTS magic. Run all the models and generate speech.
@@ -277,6 +278,8 @@ def tts(
style_text ([type], optional): transcription of style_wav for Capacitron. Defaults to None.
reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
reference_speaker_name ([type], optional): speaker id of reference waveform. Defaults to None.
split_sentences (bool, optional): split the input text into sentences. Defaults to True.
**kwargs: additional arguments to pass to the TTS model.
Returns:
List[int]: the synthesized waveform samples.
"""
@@ -289,8 +292,10 @@
)

if text:
sens = self.split_into_sentences(text)
print(" > Text splitted to sentences.")
sens = [text]
if split_sentences:
print(" > Text splitted to sentences.")
sens = self.split_into_sentences(text)
print(sens)

# handle multi-speaker
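The last hunk replaces the unconditional call to split_into_sentences with a guarded one: the input is kept as a single chunk unless split_sentences is set. Below is a standalone sketch of that branching (simplified, not the actual Synthesizer class; splitter stands in for split_into_sentences):

from typing import Callable, List

def select_chunks(text: str, split_sentences: bool,
                  splitter: Callable[[str], List[str]]) -> List[str]:
    # Return the chunks of text the synthesizer will process one by one.
    sens = [text]              # default: the whole text as a single chunk
    if split_sentences:
        sens = splitter(text)  # e.g. Synthesizer.split_into_sentences
    return sens

# With splitting disabled, the model sees the full text in one pass:
# select_chunks("Hi there. How are you?", False, my_splitter) -> ["Hi there. How are you?"]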
