-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathtts.py
106 lines (79 loc) · 2.83 KB
/
tts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
try:
from ..llm import *
from ..utils.db import *
from .tts_providers.openai import tts_openai
from .tts_providers.microsoft_local import tts_microsoft_local
except ImportError:
from llm import *
from utils.db import *
from audio.tts_providers.openai import tts_openai
from audio.tts_providers.microsoft_local import tts_microsoft_local
import os
import hashlib
import random
import threading
def is_local_tts_available():
    """Report whether the local TTS backend may be used.

    No dependency probe is performed here, so this always returns True.
    The previous ``try``/bare-``except`` wrapper guarded a body that could
    not raise, which made its ``return False`` branch unreachable.

    Returns:
        bool: Always ``True``.
    """
    # NOTE(review): if local TTS needs an optional extra, the check for it
    # (e.g. an import attempt) belongs here - confirm which module to probe.
    return True
def is_openai_tts_available():
    """Tell whether OpenAI text-to-speech can be used with the current settings.

    Returns:
        bool: True only when the provider recorded in ``llm_settings`` for the
        currently selected model is ``"openai"`` and the stored API key
        differs from the literal ``"CHANGE_ME"`` (presumably the unset
        placeholder - confirm against the settings module).
    """
    selected_model = load_model_settings()
    provider = llm_settings[selected_model]["provider"]
    # Short-circuit keeps the original order: the API key is only loaded
    # when the provider is OpenAI.
    return provider == "openai" and load_api_key() != "CHANGE_ME"
# Voice names that text_to_speech may choose from when alternating voices.
supported_openai_speakers = ["fable"]


def random_model(exclude):
    """Pick a random speaker other than ``exclude``.

    Args:
        exclude: Speaker name that should not be returned if any other
            speaker is available.

    Returns:
        A name from ``supported_openai_speakers`` different from ``exclude``.
        When no other speaker exists (for example the list has a single
        entry), ``exclude`` itself is returned instead of raising.
    """
    candidates = [name for name in supported_openai_speakers if name != exclude]
    if not candidates:
        # random.choice raises IndexError on an empty sequence; with only one
        # configured speaker there is nothing to alternate with, so reuse it.
        return exclude
    return random.choice(candidates)
def generate_speech_chunk(text_chunk, index, voice, results):
    """Synthesize one text chunk to an MP3 file and record its path.

    The output file name is the SHA-256 hex digest of ``text_chunk``, placed
    under ``artifacts_dir`` (a name expected to come from the module's star
    imports). If that file already exists it is reused without calling any
    provider.

    Args:
        text_chunk: Text to synthesize.
        index: Slot in ``results`` that receives the file path.
        voice: Voice name forwarded to the OpenAI provider; not used by the
            local provider.
        results: Mutable, index-addressable container shared with the caller.

    Returns:
        None. On success ``results[index]`` is set to the MP3 path. The slot
        is left untouched when no audio was requested from any provider
        (unknown TTS setting, or local TTS unavailable), so callers never
        receive a path for which no generation was attempted.
    """
    sha = hashlib.sha256(text_chunk.encode()).hexdigest()
    location = os.path.join(artifacts_dir, f"{sha}.mp3")

    # Cache hit: identical text was already synthesized.
    if os.path.exists(location):
        results[index] = location
        return

    tts_setting = load_tts_model_settings()
    if tts_setting == "openai":
        tts_openai(voice, text_chunk, location)
    elif tts_setting == "microsoft_local":
        if not is_local_tts_available():
            print("Please install gpt-computer-agent[local_tts] to use local TTS")
            return
        tts_microsoft_local(text_chunk, location)
    else:
        # Unrecognized provider setting: nothing was generated.
        return

    results[index] = location
def split_text_to_sentences(text, max_chunk_size=300):
    """Split text on periods and pack the sentences into size-bounded chunks.

    Sentences are obtained with ``text.split(".")``, stripped, and re-joined
    as ``"<sentence>. "``. Consecutive sentences are packed into one chunk as
    long as the chunk stays within ``max_chunk_size`` characters.

    Empty fragments (from a trailing period, consecutive periods, or empty
    input) are skipped, so no stray ``"."`` and no empty chunks are produced.

    Args:
        text: Input text.
        max_chunk_size: Maximum chunk length in characters. A single sentence
            longer than this is not broken up; it is emitted as its own
            chunk and may exceed the limit.

    Returns:
        list[str]: Non-empty chunks in their original order; ``[]`` when the
        text contains no sentence content.
    """
    chunks = []
    current_chunk = ""

    for sentence in text.split("."):
        sentence = sentence.strip()
        if not sentence:
            # Artifact of split(".") - e.g. the piece after a final period.
            continue

        # current_chunk already ends with a space, so +1 accounts for the
        # period that will follow this sentence in the stripped chunk.
        if len(current_chunk) + len(sentence) + 1 <= max_chunk_size:
            current_chunk += sentence + ". "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + ". "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
def text_to_speech(text):
    """Convert text to speech, one MP3 file per chunk, generated in parallel.

    The text is split with ``split_text_to_sentences`` and each chunk is
    handed to ``generate_speech_chunk`` on its own thread. Even-numbered
    chunks use a randomly chosen initial voice; odd-numbered chunks use a
    different random voice when one is configured, otherwise the same voice.

    Args:
        text: Text to synthesize.

    Returns:
        list[str]: Paths of the generated (or cached) MP3 files in chunk
        order. Chunks whose worker left its result slot as ``None`` are
        omitted.
    """
    text_chunks = split_text_to_sentences(text)
    results = [None] * len(text_chunks)

    initial_voice = random.choice(supported_openai_speakers)
    # Only ask for an alternate voice if one exists; with a single configured
    # speaker there is nothing else to pick from.
    has_alternate = any(
        speaker != initial_voice for speaker in supported_openai_speakers
    )

    threads = []
    for i, chunk in enumerate(text_chunks):
        if i % 2 == 0 or not has_alternate:
            voice = initial_voice
        else:
            voice = random_model(initial_voice)  # Alternate voices

        thread = threading.Thread(
            target=generate_speech_chunk, args=(chunk, i, voice, results)
        )
        threads.append(thread)
        thread.start()

    # Wait for every chunk before collecting paths.
    for thread in threads:
        thread.join()

    return [path for path in results if path is not None]