forked from AlexandreSajus/JARVIS
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
130 lines (109 loc) · 3.77 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""Main file for the Jarvis project"""
import os
from os import PathLike
from time import time
import asyncio
from typing import Union
from dotenv import load_dotenv
import openai
from deepgram import Deepgram
import pygame
from pygame import mixer
import elevenlabs
from record import speech_to_text
# Load API keys.
# load_dotenv() is called before the os.getenv() lookups below so that values
# from a local .env file (if present) are visible in the environment.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
# NOTE(review): os.getenv returns None when a variable is unset; none of the
# three keys is validated here, so a missing key only surfaces on first use.
elevenlabs.set_api_key(os.getenv("ELEVENLABS_API_KEY"))
# Initialize the API clients used by request_gpt() and transcribe().
gpt_client = openai.Client(api_key=OPENAI_API_KEY)
deepgram = Deepgram(DEEPGRAM_API_KEY)
# mixer is a pygame module for playing audio; it must be initialised before
# mixer.Sound objects are created in the main loop.
mixer.init()
# Change the context if you want to change Jarvis' personality.
# The main loop appends every user utterance and every reply to this string,
# so it doubles as the running conversation history sent to the model.
context = "You are Jarvis, Alex's human assistant. You are witty and full of personality. Your answers should be limited to 1-2 short sentences."
# NOTE(review): `conversation` is not referenced anywhere else in the visible
# part of this file — confirm whether another module relies on it.
conversation = {"Conversation": []}
# Path of the audio file handed to transcribe() in the main loop
# (presumably written by record.speech_to_text — confirm in record.py).
RECORDING_PATH = "audio/recording.wav"
def request_gpt(prompt: str) -> str:
    """
    Send a prompt to the OpenAI chat completions API and return the reply text.
    Args:
        - prompt: The text to send to the API as a single user message.
    Returns:
        The content of the first choice of the response, or an empty string
        when the API returns a message without text content.
    """
    response = gpt_client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="gpt-3.5-turbo",
    )
    # message.content is optional in the OpenAI SDK; normalise None to "" so
    # the annotated return type holds and callers can safely concatenate it.
    return response.choices[0].message.content or ""
async def transcribe(
    file_name: Union[Union[str, bytes, PathLike[str], PathLike[bytes]], int]
):
    """
    Transcribe a WAV audio file through the Deepgram prerecorded endpoint.
    Args:
        - file_name: The file to open (in binary mode) and send to Deepgram.
    Returns:
        The "words" entry of the first alternative of the first channel in
        the Deepgram response.
    """
    with open(file_name, "rb") as wav_file:
        # The open handle itself is passed as the buffer, so the request is
        # awaited while the file is still open.
        payload = {"buffer": wav_file, "mimetype": "audio/wav"}
        result = await deepgram.transcription.prerecorded(payload)
    first_channel = result["results"]["channels"][0]
    first_alternative = first_channel["alternatives"][0]
    return first_alternative["words"]
def log(log: str):
    """
    Echo a status message to stdout and store it in status.txt.

    status.txt is opened in write mode, so it only ever holds the most
    recent message.
    """
    print(log)
    status_file = open("status.txt", "w")
    with status_file:
        status_file.write(log)
if __name__ == "__main__":
    # Main assistant loop: record -> transcribe -> ask GPT -> synthesise -> play.
    # Runs until the process is interrupted.
    while True:
        # Record audio
        log("Listening...")
        speech_to_text()
        log("Done listening")

        # Transcribe audio
        current_time = time()
        # asyncio.run() creates a fresh event loop for the coroutine and closes
        # it afterwards, so no event loop is leaked per iteration.
        words = asyncio.run(transcribe(RECORDING_PATH))
        # Keep only entries that actually carry a "word" key.
        string_words = " ".join(
            word_dict["word"] for word_dict in words if "word" in word_dict
        )
        # Add the user's utterance as a new line to conv.txt
        with open("conv.txt", "a") as f:
            f.write(f"{string_words}\n")
        transcription_time = time() - current_time
        log(f"Finished transcribing in {transcription_time:.2f} seconds.")

        # Get response from GPT
        current_time = time()
        # `context` is the running transcript: the whole history is re-sent on
        # every turn, and the reply is appended so the next turn can see it.
        context += f"\nAlex: {string_words}\nJarvis: "
        response = request_gpt(context)
        context += response
        gpt_time = time() - current_time
        log(f"Finished generating response in {gpt_time:.2f} seconds.")

        # Convert response to audio
        current_time = time()
        audio = elevenlabs.generate(
            text=response, voice="Adam", model="eleven_monolingual_v1"
        )
        elevenlabs.save(audio, "audio/response.wav")
        audio_time = time() - current_time
        log(f"Finished generating audio in {audio_time:.2f} seconds.")

        # Play response
        log("Speaking...")
        sound = mixer.Sound("audio/response.wav")
        # Add response as a new line to conv.txt
        with open("conv.txt", "a") as f:
            f.write(f"{response}\n")
        sound.play()
        # Sound.play() returns immediately; block for the clip's duration so
        # the next recording does not start while Jarvis is still speaking.
        pygame.time.wait(int(sound.get_length() * 1000))
        print(f"\n --- USER: {string_words}\n --- JARVIS: {response}\n")