🎉 Adds realistic voice

nexus3-66 · Dec 16, 2023 · 5056c07 · 5056c07
1 parent c7915e9
commit 5056c07
Show file tree

Hide file tree

Showing 5 changed files with 15 additions and 12 deletions.
diff --git a/main.py b/main.py
@@ -9,6 +9,7 @@
 from gtts import gTTS
 import pygame
 from pygame import mixer
+import elevenlabs
 
 from record import SpeechToText
 
@@ -17,10 +18,11 @@
 
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
+elevenlabs.set_api_key(os.getenv("ELEVENLABS_API_KEY"))
 RECORDING_PATH = "wavs/recording.wav"
 
 gpt_client = openai.Client(api_key=OPENAI_API_KEY)
-context = "You are Sam, Alex's helpful secretary. Your answers should be limited to 1-2 short sentences."
+context = "You are Jarvis, Alex's helpful and witty assistant. Your answers should be limited to 1-2 short sentences."
 
 mixer.init()
 
@@ -69,26 +71,31 @@ async def transcribe(
 if __name__ == "__main__":
     while True:
         # Record audio
+        print("Listening...", end="")
         SpeechToText()
         # Transcribe audio
+        print("Transcribing...")
         deepgram = Deepgram(DEEPGRAM_API_KEY)
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
         words = loop.run_until_complete(transcribe(RECORDING_PATH))
         string_words = " ".join(
             word_dict.get("word") for word_dict in words if "word" in word_dict
         )
-        print(f"USER: {string_words}")
         # Get response from GPT-3
-        context += f"\nAlex: {string_words}\nSam: "
+        print("Generating response...")
+        context += f"\nAlex: {string_words}\nJarvis: "
         response = request_gpt(context)
-        print(f"AI: {response}")
         context += response
         # Convert response to audio
-        tts = gTTS(response)
-        tts.save("wavs/response.wav")
+        print("Converting to audio...")
+        audio = elevenlabs.generate(
+            text=response, voice="Adam", model="eleven_monolingual_v1"
+        )
+        elevenlabs.save(audio, "wavs/response.wav")
         # Play response
+        print("Speaking...")
         sound = mixer.Sound("wavs/response.wav")
         sound.play()
         pygame.time.wait(int(sound.get_length() * 1000))
-        print("LOG: Response played")
+        print(f"\n --- USER: {string_words}\n --- JARVIS: {response}\n")
diff --git a/record.py b/record.py
@@ -42,7 +42,6 @@ def SpeechToText() -> None:
         frames_per_buffer=960,
     )
     audio_source.start_stream()
-    print("LOG: Listening...")
 
     def buffer_to_wav(buffer: bytes) -> bytes:
         """Wraps a buffer of raw audio data in a WAV"""
@@ -77,8 +76,6 @@ def buffer_to_wav(buffer: bytes) -> bytes:
                     )
                     wav_bytes = buffer_to_wav(audio_data)
                     wav_path.write_bytes(wav_bytes)
-                    print(wav_path)
-                    print("file saved")
                     break
                 elif wav_sink:
                     # Write to WAV file
@@ -90,7 +87,6 @@ def buffer_to_wav(buffer: bytes) -> bytes:
     finally:
         try:
             audio_source.close_stream()
-            print("LOG: Recording Saved")
         except Exception:
             pass
 

diff --git a/requirements.txt b/requirements.txt
@@ -3,5 +3,5 @@ openai==1.4.0
 deepgram-sdk==2.12.0
 pyaudio==0.2.14
 rhasspy-silence==0.4.0
-gTTS==2.4.0
+elevenlabs==0.2.27
 pygame==2.5.2
diff --git a/wavs/recording.wav b/wavs/recording.wav
diff --git a/wavs/response.wav b/wavs/response.wav