From 48e1e66b1206fda436f8cbb3cc851b618c4b936d Mon Sep 17 00:00:00 2001
From: juan <jmugicagonz@hotmail.com>
Date: Wed, 15 Jan 2025 19:07:11 -0500
Subject: [PATCH 1/3] azure tts: add speaking style and style degree via mstts:express-as

---
 .../livekit/plugins/azure/tts.py              | 71 +++++++++++++++----
 1 file changed, 56 insertions(+), 15 deletions(-)
diff --git a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py
index ac77ecfbf..3f2acb242 100644
--- a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py
+++ b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py
@@ -103,6 +103,26 @@ def __post_init__(self):
         self.validate()
 
 
+@dataclass
+class StyleConfig:
+    """
+    Style configuration for Azure TTS neural voices.
+
+    Args:
+        style: Speaking style for neural voices. Examples: "cheerful", "sad", "angry", etc.
+        degree: Intensity of the style, from 0.1 to 2.0.
+    """
+    style: str
+    degree: float | None = None
+
+    def validate(self) -> None:
+        if self.degree is not None and not 0.1 <= self.degree <= 2.0:
+            raise ValueError("Style degree must be between 0.1 and 2.0")
+
+    def __post_init__(self):
+        self.validate()
+
+
 @dataclass
 class _TTSOptions:
     sample_rate: int
@@ -121,6 +141,7 @@ class _TTSOptions:
     # See https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#adjust-prosody
     prosody: ProsodyConfig | None = None
     speech_endpoint: str | None = None
+    style: StyleConfig | None = None
 
 
 class TTS(tts.TTS):
@@ -136,6 +157,7 @@ def __init__(
         speech_host: str | None = None,
         speech_auth_token: str | None = None,
         endpoint_id: str | None = None,
+        style: StyleConfig | None = None,
     ) -> None:
         """
         Create a new instance of Azure TTS.
@@ -176,6 +198,9 @@ def __init__(
         if prosody:
             prosody.validate()
 
+        if style:
+            style.validate()
+
         self._opts = _TTSOptions(
             sample_rate=sample_rate,
             speech_key=speech_key,
@@ -186,6 +211,7 @@ def __init__(
             endpoint_id=endpoint_id,
             language=language,
             prosody=prosody,
+            style=style,
         )
 
     def update_options(
@@ -194,10 +220,12 @@ def update_options(
         voice: str | None = None,
         language: str | None = None,
         prosody: ProsodyConfig | None = None,
+        style: StyleConfig | None = None,
     ) -> None:
         self._opts.voice = voice or self._opts.voice
         self._opts.language = language or self._opts.language
         self._opts.prosody = prosody or self._opts.prosody
+        self._opts.style = style or self._opts.style
 
     def synthesize(
         self,
@@ -234,22 +262,35 @@ async def _run(self):
         )
 
         def _synthesize() -> speechsdk.SpeechSynthesisResult:
-            if self._opts.prosody:
+            if self._opts.prosody or self._opts.style:
                 ssml = f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{self._opts.language or "en-US"}">'
-                prosody_ssml = f'<voice name="{self._opts.voice}"><prosody'
-                if self._opts.prosody.rate:
-                    prosody_ssml += f' rate="{self._opts.prosody.rate}"'
-
-                if self._opts.prosody.volume:
-                    prosody_ssml += f' volume="{self._opts.prosody.volume}"'
-
-                if self._opts.prosody.pitch:
-                    prosody_ssml += f' pitch="{self._opts.prosody.pitch}"'
-
-                prosody_ssml += ">"
-                ssml += prosody_ssml
-                ssml += self._input_text
-                ssml += "</prosody></voice></speak>"
+                ssml += f'<voice name="{self._opts.voice}">'
+                
+                # Add style if specified
+                if self._opts.style:
+                    style_degree = f' styledegree="{self._opts.style.degree}"' if self._opts.style.degree else ''
+                    ssml += f'<mstts:express-as style="{self._opts.style.style}"{style_degree}>'
+                
+                # Add prosody if specified
+                if self._opts.prosody:
+                    ssml += '<prosody'
+                    if self._opts.prosody.rate:
+                        ssml += f' rate="{self._opts.prosody.rate}"'
+                    if self._opts.prosody.volume:
+                        ssml += f' volume="{self._opts.prosody.volume}"'
+                    if self._opts.prosody.pitch:
+                        ssml += f' pitch="{self._opts.prosody.pitch}"'
+                    ssml += ">"
+                    ssml += self._input_text
+                    ssml += "</prosody>"
+                else:
+                    ssml += self._input_text
+                
+                # Close style tag if it was opened
+                if self._opts.style:
+                    ssml += "</mstts:express-as>"
+                    
+                ssml += "</voice></speak>"
                 return synthesizer.speak_ssml_async(ssml).get()  # type: ignore
 
             return synthesizer.speak_text_async(self.input_text).get()  # type: ignore

From cf4e48b4468119d4195e16ec4140ab7a232b0052 Mon Sep 17 00:00:00 2001
From: juan <jmugicagonz@hotmail.com>
Date: Wed, 15 Jan 2025 19:12:57 -0500
Subject: [PATCH 2/3] azure tts: declare the mstts XML namespace on the SSML speak element

---
 .../livekit-plugins-azure/livekit/plugins/azure/tts.py     | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py
index 3f2acb242..c34f4d0fd 100644
--- a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py
+++ b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py
@@ -263,7 +263,12 @@ async def _run(self):
 
         def _synthesize() -> speechsdk.SpeechSynthesisResult:
             if self._opts.prosody or self._opts.style:
-                ssml = f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{self._opts.language or "en-US"}">'
+                ssml = (
+                    '<speak version="1.0" '
+                    'xmlns="http://www.w3.org/2001/10/synthesis" '
+                    'xmlns:mstts="http://www.w3.org/2001/mstts" '
+                    f'xml:lang="{self._opts.language or "en-US"}">'
+                )
                 ssml += f'<voice name="{self._opts.voice}">'
                 
                 # Add style if specified

From ef2896d84e008a80156d1f28b5f2b4212a06f9e7 Mon Sep 17 00:00:00 2001
From: David Zhao <dz@livekit.io>
Date: Mon, 20 Jan 2025 02:14:30 -0600
Subject: [PATCH 3/3] fix formatting

---
 .../livekit/plugins/azure/tts.py                | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py
index c34f4d0fd..46ce53219 100644
--- a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py
+++ b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py
@@ -112,6 +112,7 @@ class StyleConfig:
         style: Speaking style for neural voices. Examples: "cheerful", "sad", "angry", etc.
         degree: Intensity of the style, from 0.1 to 2.0.
     """
+
     style: str
     degree: float | None = None
 
@@ -270,15 +271,19 @@ def _synthesize() -> speechsdk.SpeechSynthesisResult:
                     f'xml:lang="{self._opts.language or "en-US"}">'
                 )
                 ssml += f'<voice name="{self._opts.voice}">'
-                
+
                 # Add style if specified
                 if self._opts.style:
-                    style_degree = f' styledegree="{self._opts.style.degree}"' if self._opts.style.degree else ''
+                    style_degree = (
+                        f' styledegree="{self._opts.style.degree}"'
+                        if self._opts.style.degree
+                        else ""
+                    )
                     ssml += f'<mstts:express-as style="{self._opts.style.style}"{style_degree}>'
-                
+
                 # Add prosody if specified
                 if self._opts.prosody:
-                    ssml += '<prosody'
+                    ssml += "<prosody"
                     if self._opts.prosody.rate:
                         ssml += f' rate="{self._opts.prosody.rate}"'
                     if self._opts.prosody.volume:
@@ -290,11 +295,11 @@ def _synthesize() -> speechsdk.SpeechSynthesisResult:
                     ssml += "</prosody>"
                 else:
                     ssml += self._input_text
-                
+
                 # Close style tag if it was opened
                 if self._opts.style:
                     ssml += "</mstts:express-as>"
-                    
+
                 ssml += "</voice></speak>"
                 return synthesizer.speak_ssml_async(ssml).get()  # type: ignore