From 48e1e66b1206fda436f8cbb3cc851b618c4b936d Mon Sep 17 00:00:00 2001 From: juan Date: Wed, 15 Jan 2025 19:07:11 -0500 Subject: [PATCH 1/3] added style and style degree --- .../livekit/plugins/azure/tts.py | 71 +++++++++++++++---- 1 file changed, 56 insertions(+), 15 deletions(-) diff --git a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py index ac77ecfbf..3f2acb242 100644 --- a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py +++ b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py @@ -103,6 +103,26 @@ def __post_init__(self): self.validate() +@dataclass +class StyleConfig: + """ + Style configuration for Azure TTS neural voices. + + Args: + style: Speaking style for neural voices. Examples: "cheerful", "sad", "angry", etc. + degree: Intensity of the style, from 0.1 to 2.0. + """ + style: str + degree: float | None = None + + def validate(self) -> None: + if self.degree is not None and not 0.1 <= self.degree <= 2.0: + raise ValueError("Style degree must be between 0.1 and 2.0") + + def __post_init__(self): + self.validate() + + @dataclass class _TTSOptions: sample_rate: int @@ -121,6 +141,7 @@ class _TTSOptions: # See https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#adjust-prosody prosody: ProsodyConfig | None = None speech_endpoint: str | None = None + style: StyleConfig | None = None class TTS(tts.TTS): @@ -136,6 +157,7 @@ def __init__( speech_host: str | None = None, speech_auth_token: str | None = None, endpoint_id: str | None = None, + style: StyleConfig | None = None, ) -> None: """ Create a new instance of Azure TTS. @@ -176,6 +198,9 @@ def __init__( if prosody: prosody.validate() + if style: + style.validate() + self._opts = _TTSOptions( sample_rate=sample_rate, speech_key=speech_key, @@ -186,6 +211,7 @@ def __init__( endpoint_id=endpoint_id, language=language, prosody=prosody, + style=style, ) def update_options( @@ -194,10 +220,12 @@ def update_options( voice: str | None = None, language: str | None = None, prosody: ProsodyConfig | None = None, + style: StyleConfig | None = None, ) -> None: self._opts.voice = voice or self._opts.voice self._opts.language = language or self._opts.language self._opts.prosody = prosody or self._opts.prosody + self._opts.style = style or self._opts.style def synthesize( self, @@ -234,22 +262,35 @@ async def _run(self): ) def _synthesize() -> speechsdk.SpeechSynthesisResult: - if self._opts.prosody: + if self._opts.prosody or self._opts.style: ssml = f'' - prosody_ssml = f'' + + # Add style if specified + if self._opts.style: + style_degree = f' styledegree="{self._opts.style.degree}"' if self._opts.style.degree else '' + ssml += f'' + + # Add prosody if specified + if self._opts.prosody: + ssml += ' Date: Wed, 15 Jan 2025 19:12:57 -0500 Subject: [PATCH 2/3] fixed mstts --- .../livekit-plugins-azure/livekit/plugins/azure/tts.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py index 3f2acb242..c34f4d0fd 100644 --- a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py +++ b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py @@ -263,7 +263,12 @@ async def _run(self): def _synthesize() -> speechsdk.SpeechSynthesisResult: if self._opts.prosody or self._opts.style: - ssml = f'' + ssml = ( + '' + ) ssml += f'' # Add style if specified From ef2896d84e008a80156d1f28b5f2b4212a06f9e7 Mon Sep 17 00:00:00 2001 From: David Zhao Date: Mon, 20 Jan 2025 02:14:30 -0600 Subject: [PATCH 3/3] fix formatting --- .../livekit/plugins/azure/tts.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py index c34f4d0fd..46ce53219 100644 --- a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py +++ b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py @@ -112,6 +112,7 @@ class StyleConfig: style: Speaking style for neural voices. Examples: "cheerful", "sad", "angry", etc. degree: Intensity of the style, from 0.1 to 2.0. """ + style: str degree: float | None = None @@ -270,15 +271,19 @@ def _synthesize() -> speechsdk.SpeechSynthesisResult: f'xml:lang="{self._opts.language or "en-US"}">' ) ssml += f'' - + # Add style if specified if self._opts.style: - style_degree = f' styledegree="{self._opts.style.degree}"' if self._opts.style.degree else '' + style_degree = ( + f' styledegree="{self._opts.style.degree}"' + if self._opts.style.degree + else "" + ) ssml += f'' - + # Add prosody if specified if self._opts.prosody: - ssml += ' speechsdk.SpeechSynthesisResult: ssml += "" else: ssml += self._input_text - + # Close style tag if it was opened if self._opts.style: ssml += "" - + ssml += "" return synthesizer.speak_ssml_async(ssml).get() # type: ignore