multiple improvements, see notes
- updated xtts
- bf16_allowed renamed to allow_bf16 to conform with other naming
- musicgen endpoint has moved to /txt2wav/musicgen (see the example request below)
- musicgen is now powered by the official audiocraft repo
- fixed musicgen format=mp3 output
- improved silence detection in voice conversations
JohnnyStreet committed Sep 30, 2024
1 parent 4c19944 commit 2b19c18
Showing 18 changed files with 510 additions and 433 deletions.
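
Regarding the relocated musicgen endpoint noted above, here is a minimal request sketch. Only the /txt2wav/musicgen path and the format=mp3 fix come from this commit's notes; the host, port, and the "prompt" parameter name are assumptions, since the request schema is not part of this diff.

import requests

# Hypothetical call to the relocated endpoint. Only the /txt2wav/musicgen path
# and the format=mp3 fix are stated in the commit notes; the host/port and the
# "prompt" parameter name are assumptions for illustration.
resp = requests.get(
    "http://localhost:5000/txt2wav/musicgen",
    params={"prompt": "upbeat synthwave with heavy drums", "format": "mp3"},
    timeout=300,
)
resp.raise_for_status()
with open("musicgen_output.mp3", "wb") as f:
    f.write(resp.content)
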
3 changes: 3 additions & 0 deletions .gitmodules
@@ -67,3 +67,6 @@
 [submodule "submodules/piano_transcription_inference"]
 	path = submodules/piano_transcription_inference
 	url = https://github.com/monofy-org/piano_transcription_inference
+[submodule "submodules/stable_audio_tools"]
+	path = submodules/stable_audio_tools
+	url = https://github.com/monofy-org/stable-audio-tools
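
Not part of the diff itself, but an existing checkout needs the new submodule fetched once; a sketch of doing so from a Python setup script (running git submodule update --init from the repository root is equivalent).

import subprocess

# Fetch the newly added stable_audio_tools submodule after pulling this commit.
subprocess.run(
    ["git", "submodule", "update", "--init", "submodules/stable_audio_tools"],
    check=True,
)
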
8 changes: 3 additions & 5 deletions modules/plugins.py
@@ -53,7 +53,7 @@ def load_plugins():
     from plugins.experimental.img_upres import ImgUpresPlugin
     from plugins.img2txt_moondream import Img2TxtMoondreamPlugin
     from plugins.img2txt_llava import Img2TxtLlavaPlugin
-    from plugins.musicgen import MusicGenPlugin
+    from plugins.txt2wav_musicgen import Txt2WavMusicGenPlugin
     from plugins.exllamav2 import ExllamaV2Plugin
     from plugins.experimental.causal_lm import CausalLMPlugin
     from plugins.txt2model_shap_e import Txt2ModelShapEPlugin
@@ -126,7 +126,7 @@ def load_plugins():
     register_plugin(Img2TxtMoondreamPlugin, quiet)
     register_plugin(RembgPlugin, quiet)
     register_plugin(ImgUpresPlugin, quiet)
-    register_plugin(MusicGenPlugin, quiet)
+    register_plugin(Txt2WavMusicGenPlugin, quiet)
    register_plugin(ExllamaV2Plugin, quiet)
     register_plugin(CausalLMPlugin, quiet)
     register_plugin(Txt2ModelShapEPlugin, quiet)
@@ -339,9 +339,7 @@ def unload_plugin(plugin: type[PluginBase]):
         return
 
     if plugin.instance is None:
-        return
-
-
+        return
 
     if hasattr(plugin.instance, "offload"):
         logging.info(f"Offloading plugin: {plugin.name}")
2 changes: 1 addition & 1 deletion plugins/img2vid_xt.py
@@ -53,7 +53,7 @@ def __init__(self):
 
         super().__init__()
 
-        self.dtype = autodetect_dtype(bf16_allowed=False)
+        self.dtype = autodetect_dtype(allow_bf16=False)
 
         try:
             animatelcm_weights_path = huggingface_hub.hf_hub_download(
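
The rename only touches this call site. For context, a dtype autodetection helper of roughly this shape is what such a flag typically gates; this is a sketch of the pattern, not the project's autodetect_dtype implementation.

import torch

def autodetect_dtype_sketch(allow_bf16: bool = True) -> torch.dtype:
    # Prefer bfloat16 when the GPU supports it and the caller allows it,
    # otherwise fall back to float16 on CUDA and float32 on CPU.
    if torch.cuda.is_available():
        if allow_bf16 and torch.cuda.is_bf16_supported():
            return torch.bfloat16
        return torch.float16
    return torch.float32

A caller that cannot run in bfloat16 would then pass allow_bf16=False, mirroring the renamed keyword above.
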
201 changes: 0 additions & 201 deletions plugins/musicgen.py

This file was deleted.

10 changes: 5 additions & 5 deletions plugins/tts.py
@@ -53,7 +53,7 @@ def __init__(self):
         model_path = cached_snapshot(TTS_MODEL)
 
         config = XttsConfig()
-        config.load_json(os.path.join(model_path, "config.json"))
+        config.load_json(os.path.join(model_path, "config.json"))
 
         model: Xtts = Xtts.init_from_config(
             config, device=self.device, torch_dtype=self.dtype
@@ -84,9 +84,7 @@ def load_voice(self, voice: str):
         speaker_wav = os.path.join(TTS_VOICES_PATH, f"{voice}.wav")
 
         if not os.path.exists(speaker_wav):
-            raise HTTPException(
-                status_code=400, detail=f"Voice {voice} not found"
-            )
+            raise HTTPException(status_code=400, detail=f"Voice {voice} not found")
 
         if speaker_wav != self.current_speaker_wav:
             logging.info(f"Loading voice: {voice}")
@@ -152,7 +150,7 @@ async def generate_speech_streaming(self, req: TTSRequest):
         for sentence in sentences:
             text_buffer += sentence
             i += 1
-            if len(text_buffer) > 100 or i > 2:
+            if len(text_buffer) > 80 or (len(text_buffer) > 30 and i > 2):
                 if sentence_groups and len(text_buffer + sentence_groups[-1]) < 150:
                     # handle cases where we could have fit the sentence in the last group
                     sentence_groups[-1] += text_buffer
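
The new condition flushes a group once the buffer passes 80 characters, or once it passes 30 characters after more than two sentences, so the first audio chunk is produced sooner. Below is a standalone sketch of that grouping logic: the loop body mirrors the diff, while the flush and reset handling outside the visible lines is reconstructed for illustration.

def group_sentences(sentences: list[str]) -> list[str]:
    # Batch sentences into chunks sized for low-latency TTS streaming.
    sentence_groups: list[str] = []
    text_buffer = ""
    i = 0
    for sentence in sentences:
        text_buffer += sentence
        i += 1
        if len(text_buffer) > 80 or (len(text_buffer) > 30 and i > 2):
            if sentence_groups and len(text_buffer + sentence_groups[-1]) < 150:
                # handle cases where we could have fit the sentence in the last group
                sentence_groups[-1] += text_buffer
            else:
                sentence_groups.append(text_buffer)
            text_buffer = ""
            i = 0
    if text_buffer:
        sentence_groups.append(text_buffer)
    return sentence_groups
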
@@ -194,6 +192,8 @@ async def generate_speech_streaming(self, req: TTSRequest):
             if self.interrupt:
                 break
 
+            self.busy = True
+
             chunks.append(chunk)
 
             if len(chunks) < self.prebuffer_chunks:
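
This hunk sets self.busy on every chunk, presumably so the plugin stays marked as occupied while audio is still being produced, and relies on the existing prebuffer_chunks check to hold back output until a few chunks have accumulated. A sketch of that prebuffering pattern in isolation, with illustrative names rather than the plugin's exact code:

async def prebuffered_stream(audio_chunks, prebuffer_chunks: int = 3):
    # Hold back the first few chunks so playback does not start and then
    # immediately stall while the model is still generating.
    buffered = []
    async for chunk in audio_chunks:
        buffered.append(chunk)
        if len(buffered) < prebuffer_chunks:
            continue  # keep filling the prebuffer
        while buffered:
            yield buffered.pop(0)
    while buffered:  # source ended before the prebuffer filled
        yield buffered.pop(0)
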