multiple improvements, see notes
- updated xtts
- bf16_allowed renamed to allow_bf16 to conform with other naming
- musicgen endpoint has moved to /txt2wav/musicgen (see the example request below)
- musicgen is now powered by the official audiocraft repo
- fixed musicgen format=mp3 output
- improved silence detection in voice conversations
JohnnyStreet committed Sep 30, 2024
1 parent 4c19944 commit 2b19c18
Showing 18 changed files with 510 additions and 433 deletions.
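
Regarding the relocated musicgen endpoint noted above, here is a minimal request sketch. Only the /txt2wav/musicgen path and the format=mp3 fix come from this commit's notes; the host, port, and the "prompt" parameter name are assumptions, since the request schema is not part of this diff.

import requests

# Hypothetical call to the relocated endpoint. Only the /txt2wav/musicgen path
# and the format=mp3 fix are stated in the commit notes; the host/port and the
# "prompt" parameter name are assumptions for illustration.
resp = requests.get(
    "http://localhost:5000/txt2wav/musicgen",
    params={"prompt": "upbeat synthwave with heavy drums", "format": "mp3"},
    timeout=300,
)
resp.raise_for_status()
with open("musicgen_output.mp3", "wb") as f:
    f.write(resp.content)
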
3 changes: 3 additions & 0 deletions .gitmodules
@@ -67,3 +67,6 @@
 [submodule "submodules/piano_transcription_inference"]
 	path = submodules/piano_transcription_inference
 	url = https://github.com/monofy-org/piano_transcription_inference
+[submodule "submodules/stable_audio_tools"]
+	path = submodules/stable_audio_tools
+	url = https://github.com/monofy-org/stable-audio-tools
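
Not part of the diff itself, but an existing checkout needs the new submodule fetched once; a sketch of doing so from a Python setup script (running git submodule update --init from the repository root is equivalent).

import subprocess

# Fetch the newly added stable_audio_tools submodule after pulling this commit.
subprocess.run(
    ["git", "submodule", "update", "--init", "submodules/stable_audio_tools"],
    check=True,
)
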
8 changes: 3 additions & 5 deletions modules/plugins.py
@@ -53,7 +53,7 @@ def load_plugins():
     from plugins.experimental.img_upres import ImgUpresPlugin
     from plugins.img2txt_moondream import Img2TxtMoondreamPlugin
     from plugins.img2txt_llava import Img2TxtLlavaPlugin
-    from plugins.musicgen import MusicGenPlugin
+    from plugins.txt2wav_musicgen import Txt2WavMusicGenPlugin
     from plugins.exllamav2 import ExllamaV2Plugin
     from plugins.experimental.causal_lm import CausalLMPlugin
     from plugins.txt2model_shap_e import Txt2ModelShapEPlugin
@@ -126,7 +126,7 @@ def load_plugins():
     register_plugin(Img2TxtMoondreamPlugin, quiet)
     register_plugin(RembgPlugin, quiet)
     register_plugin(ImgUpresPlugin, quiet)
-    register_plugin(MusicGenPlugin, quiet)
+    register_plugin(Txt2WavMusicGenPlugin, quiet)
    register_plugin(ExllamaV2Plugin, quiet)
     register_plugin(CausalLMPlugin, quiet)
     register_plugin(Txt2ModelShapEPlugin, quiet)
@@ -339,9 +339,7 @@ def unload_plugin(plugin: type[PluginBase]):
         return
 
     if plugin.instance is None:
-        return
-
-
+        return
 
     if hasattr(plugin.instance, "offload"):
         logging.info(f"Offloading plugin: {plugin.name}")
2 changes: 1 addition & 1 deletion plugins/img2vid_xt.py
@@ -53,7 +53,7 @@ def __init__(self):
 
         super().__init__()
 
-        self.dtype = autodetect_dtype(bf16_allowed=False)
+        self.dtype = autodetect_dtype(allow_bf16=False)
 
         try:
             animatelcm_weights_path = huggingface_hub.hf_hub_download(
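
The rename only touches this call site. For context, a dtype autodetection helper of roughly this shape is what such a flag typically gates; this is a sketch of the pattern, not the project's autodetect_dtype implementation.

import torch

def autodetect_dtype_sketch(allow_bf16: bool = True) -> torch.dtype:
    # Prefer bfloat16 when the GPU supports it and the caller allows it,
    # otherwise fall back to float16 on CUDA and float32 on CPU.
    if torch.cuda.is_available():
        if allow_bf16 and torch.cuda.is_bf16_supported():
            return torch.bfloat16
        return torch.float16
    return torch.float32

A caller that cannot run in bfloat16 would then pass allow_bf16=False, mirroring the renamed keyword above.
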
201 changes: 0 additions & 201 deletions plugins/musicgen.py

This file was deleted.

10 changes: 5 additions & 5 deletions plugins/tts.py
@@ -53,7 +53,7 @@ def __init__(self):
         model_path = cached_snapshot(TTS_MODEL)
 
         config = XttsConfig()
-        config.load_json(os.path.join(model_path, "config.json"))
+        config.load_json(os.path.join(model_path, "config.json"))
 
         model: Xtts = Xtts.init_from_config(
             config, device=self.device, torch_dtype=self.dtype
@@ -84,9 +84,7 @@ def load_voice(self, voice: str):
         speaker_wav = os.path.join(TTS_VOICES_PATH, f"{voice}.wav")
 
         if not os.path.exists(speaker_wav):
-            raise HTTPException(
-                status_code=400, detail=f"Voice {voice} not found"
-            )
+            raise HTTPException(status_code=400, detail=f"Voice {voice} not found")
 
         if speaker_wav != self.current_speaker_wav:
             logging.info(f"Loading voice: {voice}")
@@ -152,7 +150,7 @@ async def generate_speech_streaming(self, req: TTSRequest):
         for sentence in sentences:
             text_buffer += sentence
             i += 1
-            if len(text_buffer) > 100 or i > 2:
+            if len(text_buffer) > 80 or (len(text_buffer) > 30 and i > 2):
                 if sentence_groups and len(text_buffer + sentence_groups[-1]) < 150:
                     # handle cases where we could have fit the sentence in the last group
                     sentence_groups[-1] += text_buffer
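
The new condition flushes a group once the buffer passes 80 characters, or once it passes 30 characters after more than two sentences, so the first audio chunk is produced sooner. Below is a standalone sketch of that grouping logic: the loop body mirrors the diff, while the flush and reset handling outside the visible lines is reconstructed for illustration.

def group_sentences(sentences: list[str]) -> list[str]:
    # Batch sentences into chunks sized for low-latency TTS streaming.
    sentence_groups: list[str] = []
    text_buffer = ""
    i = 0
    for sentence in sentences:
        text_buffer += sentence
        i += 1
        if len(text_buffer) > 80 or (len(text_buffer) > 30 and i > 2):
            if sentence_groups and len(text_buffer + sentence_groups[-1]) < 150:
                # handle cases where we could have fit the sentence in the last group
                sentence_groups[-1] += text_buffer
            else:
                sentence_groups.append(text_buffer)
            text_buffer = ""
            i = 0
    if text_buffer:
        sentence_groups.append(text_buffer)
    return sentence_groups
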
@@ -194,6 +192,8 @@ async def generate_speech_streaming(self, req: TTSRequest):
             if self.interrupt:
                 break
 
+            self.busy = True
+
             chunks.append(chunk)
 
             if len(chunks) < self.prebuffer_chunks:
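
This hunk sets self.busy on every chunk, presumably so the plugin stays marked as occupied while audio is still being produced, and relies on the existing prebuffer_chunks check to hold back output until a few chunks have accumulated. A sketch of that prebuffering pattern in isolation, with illustrative names rather than the plugin's exact code:

async def prebuffered_stream(audio_chunks, prebuffer_chunks: int = 3):
    # Hold back the first few chunks so playback does not start and then
    # immediately stall while the model is still generating.
    buffered = []
    async for chunk in audio_chunks:
        buffered.append(chunk)
        if len(buffered) < prebuffer_chunks:
            continue  # keep filling the prebuffer
        while buffered:
            yield buffered.pop(0)
    while buffered:  # source ended before the prebuffer filled
        yield buffered.pop(0)
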