main.py (forked from HRNPH/AIwaifu)
print('Initializing... Dependencies')
from conversation import character_msg_constructor
from vtube_studio import Char_control
import romajitable # temporarily use this, since a Machine Translation model would blow up our RAM
import pyaudio
import soundfile as sf
import scipy.io.wavfile as wavfile
import requests
import random
import os
import logging
logging.getLogger("requests").setLevel(logging.WARNING) # only log important events from requests
logging.getLogger("urllib3").setLevel(logging.WARNING) # only log important events from urllib3
talk = character_msg_constructor("Lilia", None) # initialize the character message constructor
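# Minimal guard (assumption: the repo may not ship an empty ./audio_cache folder).
# The vocal pipeline below writes ./audio_cache/dialog_cache.wav, so create the
# directory up front; this is a no-op if it already exists.
os.makedirs('./audio_cache', exist_ok=True)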
# ----------- Waifu Vocal Pipeline -----------------------
from AIVoifu.tts import tts
from AIVoifu.voice_conversion import vc_inference as vc

class tts_pipeline:
    def __init__(self) -> None:
        print('Loading Waifu Vocal Pipeline...')
        self.cache_root = './audio_cache'
        self.model = tts.OpenJtalk()
        self.vc_model = vc.vits_vc_inference(load_model=True)
        print('Loaded Waifu Vocal Pipeline')

    def tts(self, text, voice_conversion=True, save_path=None):
        if not save_path:
            save_path = f'{self.cache_root}/dialog_cache.wav'
        self.model.tts(text, save_path)
        if voice_conversion:
            self.vc_model.convert(save_path, 22050, from_file=True, save_path=save_path)
        return save_path

vocal_pipeline = tts_pipeline()
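# Usage sketch (hypothetical text and path, not part of the chat loop below):
#   vocal_pipeline.tts('こんにちは', voice_conversion=True, save_path='./audio_cache/test.wav')
# tts() writes the synthesized (and optionally voice-converted) audio to save_path and returns that path.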
# initialize Vtube Studio Waifu Controller
print('Initializing... Vtube Studio')
waifu = Char_control(port=8001, plugin_name='MyBitchIsAI', plugin_developer='HRNPH')
print('Initialized')

# chat api
def chat(msg, reset=False):
    command = 'chat'
    if reset:
        command = 'reset'
    params = {
        'command': f'{command}',
        'data': msg,
    }
    try:
        r = requests.get('http://localhost:8267/waifuapi', params=params)
    except requests.exceptions.ConnectionError as e:
        print('--------- Exception Occurred ---------')
        print('Could not reach the chat API server.')
        print('If the server is running on a different device, specify its IP address and port in the request URL above')
        print('Example: http://192.168.1.112:8267 (or leave it as localhost)')
        print(f'*Line {e.__traceback__.tb_lineno}: {e}')
        print('-------------------------------------')
        exit()
    return r.text
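# Response format assumption (inferred from how the reply is parsed below): the server
# returns plain text of the form "<english answer><split_token><japanese answer>", so a
# caller can do, e.g.:
#   english, japanese = chat('hello').split('<split_token>')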
split_counter = 0
history = ''
while True:
    con = str(input("You: "))
    if con.lower() == 'exit':
        print('Stopping...')
        break # exit prototype

    if con.lower() == 'reset':
        print('Resetting...')
        print(chat('None', reset=True))
        continue # reset story, skip to next loop

    # ----------- Create Response --------------------------
    answer = chat(con) # send message to api
    answer = answer.split('<split_token>')
    answer, japanese_answer = answer[0], answer[1]
    answer = answer.replace('Lilia:', '') # remove name from answer (str.replace returns a new string, so assign it back)
    # ------------------------------------------------------
    print(f'{answer}')
    if answer.strip().endswith(f'{talk.name}:') or answer.strip() == '':
        continue # skip audio processing if the answer is just the name (no talking)
    # ----------- Waifu Create Talking Audio -----------------------
    ## generate the speaking voice: (translate to Japanese, then TTS) or (Japanglish TTS) [uses less memory, but not as enjoyable]
    ## we'll use Japanglish TTS for prototyping and Japanese TTS for production
    # cleaned_text = talk.clean_emotion_action_text_for_speech(answer) # delete *described actions* from the text, keeping only the "speaking" part
    # translated = romajitable.to_kana(cleaned_text).hiragana # translate to Japanese
    # with Japanglish TTS we don't need to clean the text, since the server already did it before translating
    translated = japanese_answer
    # _, (sr, audio) = tts(translated, 0)
    # random_name = '_cache' #random.randint(0, 1000)
    # wavfile.write(f'./audio_cache/dialog{random_name}.wav', sr, audio)
    vocal_pipeline.tts(translated, save_path=f'./audio_cache/dialog_cache.wav')
    # --------------------------------------------------
    # ----------- Waifu Talking -----------------------
    # play audio directly from cache
    p = pyaudio.PyAudio()
    data, samplerate = sf.read('./audio_cache/dialog_cache.wav', dtype='float32')
    stream = p.open(format=pyaudio.paFloat32,
                    channels=1,
                    rate=samplerate,
                    output=True)
    stream.write(data.tobytes())
    stream.stop_stream()
    stream.close()
    p.terminate() # release PortAudio; without this, each loop iteration leaks a PyAudio instance
    # --------------------------------------------------
    # ----------- Waifu Expressing ----------------------- (emotion expressed)
    emotion = talk.emotion_analyze(answer) # get emotion from waifu answer (last line)
    print(f'Emotion Log: {emotion}')
    emotion_to_express = None
    if 'joy' in emotion:
        emotion_to_express = 'happy'
    elif 'anger' in emotion:
        emotion_to_express = 'angry'
    print(f'Emotion to express: {emotion_to_express}')
    if emotion_to_express: ## express emotion
        waifu.express(emotion_to_express) # express emotion in Vtube Studio
    # --------------------------------------------------