feat: support multiple language voice
madawei2699 committed Mar 22, 2023
1 parent d96ea0e commit 391ef88
Showing 3 changed files with 30 additions and 10 deletions.
10 changes: 10 additions & 0 deletions README.md
@@ -40,6 +40,7 @@ For now it is in development, but you can try it out by joining this [channel](http
- [ ] Consider fine-tuning the chunk size of the index node and the prompt to save cost
  - If the chunk size is too big, the index node will be too large and the cost will be high.
- [x] Bot can read historical messages from the same thread, thus providing context to chatGPT
- [x] [Changing the number of output tokens](https://github.com/jerryjliu/llama_index/issues/778#issuecomment-1478303173)
- Index fine-tuning
  - [x] Use the [GPTListIndex](https://github.com/jerryjliu/llama_index/issues/753#issuecomment-1472387421) to summarize multiple URLs
  - [ ] Use the `GPTTreeIndex` with `summarize` mode to summarize a single web page
@@ -69,6 +70,15 @@ For now it is in development, but you can try it out by joining this [channel](http
- may use GPT4
- [x] Support voice reading ~~with self-hosting [whisper](https://github.com/aarnphm/whispercpp)~~
  - (whisper -> chatGPT -> Azure text-to-speech) for spoken language practice 💥 (a minimal pipeline sketch follows this diff)
- Supported languages
  - Chinese
  - English
    - 🇺🇸
    - 🇬🇧
    - 🇦🇺
    - 🇮🇳
  - Japanese
  - German
- [ ] Integrate with Azure OpenAI Service
- [ ] User access limit
  - Limit the number of requests to the bot per user per day to save cost
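The voice feature chains three services: whisper transcribes the user's speech, chatGPT answers, and Azure text-to-speech reads the answer back. Below is a minimal sketch of that flow, not the project's exact code; the file names, prompt, and voice are placeholder assumptions, and it requires the `openai` and `azure-cognitiveservices-speech` packages with valid keys.

```python
import os
import openai
from azure.cognitiveservices.speech import (
    SpeechConfig, SpeechSynthesizer, SpeechSynthesisOutputFormat)
from azure.cognitiveservices.speech.audio import AudioOutputConfig

openai.api_key = os.environ["OPENAI_API_KEY"]

# 1) Speech -> text with OpenAI whisper
with open("question.mp3", "rb") as audio_file:  # placeholder input recording
    question = openai.Audio.transcribe("whisper-1", audio_file)["text"]

# 2) Text -> answer with chatGPT
answer = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": question}],
)["choices"][0]["message"]["content"]

# 3) Answer -> speech with Azure text-to-speech
speech_config = SpeechConfig(subscription=os.environ["SPEECH_KEY"],
                             region=os.environ["SPEECH_REGION"])
speech_config.set_speech_synthesis_output_format(
    SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3)
speech_config.speech_synthesis_voice_name = "en-US-JennyNeural"  # placeholder voice
synthesizer = SpeechSynthesizer(
    speech_config=speech_config,
    audio_config=AudioOutputConfig(filename="answer.mp3"))
synthesizer.speak_text_async(answer).get()
```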
28 changes: 19 additions & 9 deletions app/gpt.py
@@ -2,14 +2,15 @@
import os
import logging
import hashlib
import random
import uuid
import openai
from langdetect import detect
from llama_index import GPTSimpleVectorIndex, LLMPredictor, RssReader, SimpleDirectoryReader
from llama_index.prompts.prompts import QuestionAnswerPrompt
from llama_index.readers.schema.base import Document
from langchain.chat_models import ChatOpenAI
from azure.cognitiveservices.speech import SpeechConfig, SpeechSynthesizer, ResultReason, CancellationReason, SpeechSynthesisOutputFormat, AudioDataStream
from azure.cognitiveservices.speech import SpeechConfig, SpeechSynthesizer, ResultReason, CancellationReason, SpeechSynthesisOutputFormat
from azure.cognitiveservices.speech.audio import AudioOutputConfig

from app.fetch_web_post import get_urls, scrape_website, scrape_website_by_phantomjscloud
@@ -20,7 +21,7 @@
openai.api_key = OPENAI_API_KEY

llm_predictor = LLMPredictor(llm=ChatOpenAI(
    temperature=0.2, model_name="gpt-3.5-turbo"))
    temperature=0.2, model_name="gpt-3.5-turbo", max_tokens=4097))

index_cache_web_dir = '/tmp/myGPTReader/cache_web/'
index_cache_voice_dir = '/tmp/myGPTReader/voice/'
@@ -150,19 +151,28 @@ def get_text_from_whisper(voice_file_path):
def remove_prompt_from_text(text):
    return text.replace('AI:', '').strip()

def convert_to_ssml(text):
lang_code_voice_map = {
    'zh': ['zh-CN-XiaoxiaoNeural', 'zh-CN-XiaohanNeural', 'zh-CN-YunxiNeural', 'zh-CN-YunyangNeural'],
    'en': ['en-US-JennyNeural', 'en-US-RogerNeural', 'en-IN-NeerjaNeural', 'en-IN-PrabhatNeural', 'en-AU-AnnetteNeural', 'en-AU-CarlyNeural', 'en-GB-AbbiNeural', 'en-GB-AlfieNeural'],
    'ja': ['ja-JP-AoiNeural', 'ja-JP-DaichiNeural'],
    'de': ['de-DE-AmalaNeural', 'de-DE-BerndNeural'],
}

def convert_to_ssml(text, voice_name=None):
    lang_code = detect(text)
    text = remove_prompt_from_text(text)
    if voice_name is None:
        try:
            voice_name = random.choice(lang_code_voice_map[lang_code.split('-')[0]])
        except KeyError:
            voice_name = random.choice(lang_code_voice_map['en'])
    ssml = '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="zh-CN">'
    if lang_code == 'zh-cn':
        ssml += f'<voice name="zh-CN-XiaoxiaoNeural">{text}</voice>'
    else:
        ssml += f'<voice name="en-US-JennyNeural">{text}</voice>'
    ssml += f'<voice name="{voice_name}">{text}</voice>'
    ssml += '</speak>'

    return ssml

def get_voice_file_from_text(text):
def get_voice_file_from_text(text, voice_name=None):
    speech_config = SpeechConfig(subscription=SPEECH_KEY, region=SPEECH_REGION)
    speech_config.set_speech_synthesis_output_format(
        SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3)
@@ -171,7 +181,7 @@ def get_voice_file_from_text(text):
    file_config = AudioOutputConfig(filename=file_name)
    synthesizer = SpeechSynthesizer(
        speech_config=speech_config, audio_config=file_config)
    ssml = convert_to_ssml(text)
    ssml = convert_to_ssml(text, voice_name)
    result = synthesizer.speak_ssml_async(ssml).get()
    if result.reason == ResultReason.SynthesizingAudioCompleted:
        logging.info("Speech synthesized for text [{}], and the audio was saved to [{}]".format(
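With this change, callers of `get_voice_file_from_text` can either let the bot pick a random Azure neural voice for the language detected by `langdetect`, or pass an explicit `voice_name`. A small usage sketch, assuming the module path `app.gpt` and that the Azure `SPEECH_KEY`/`SPEECH_REGION` environment variables are set:

```python
from app.gpt import get_voice_file_from_text

# Language is auto-detected; a matching voice from lang_code_voice_map
# is picked at random (here one of the ja-JP neural voices).
japanese_mp3 = get_voice_file_from_text("こんにちは。今日は日本語を練習しましょう。")

# Or pin a specific voice instead of letting the bot choose.
english_mp3 = get_voice_file_from_text("Let's practice English today.",
                                       voice_name="en-GB-AbbiNeural")
print(japanese_mp3, english_mp3)
```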
2 changes: 1 addition & 1 deletion app/server.py
@@ -192,8 +192,8 @@ def handle_mentions(event, say, logger):
            say(f'<@{user}>, {gpt_response}', thread_ts=thread_ts)
        else:
            voice_file_path = get_voice_file_from_text(gpt_response)
            logger.info(f'=====> Voice file path is {voice_file_path}')
            slack_app.client.files_upload_v2(file=voice_file_path, channel=channel, thread_ts=thread_ts)
            say(f'<@{user}>', thread_ts=thread_ts)
    except concurrent.futures.TimeoutError:
        future.cancel()
        err_msg = 'Task timedout(5m) and was canceled.'
