Skip to content

Commit

Permalink
若干杂项,界面优化 (RVC-Boss#1387)
Browse files Browse the repository at this point in the history
推理界面布局优化,删去文本切分小框
python webui.py or inference_webui.py  <version(optional)>   <language(optional)>
支持启动时选择模型版本,填在第一个即可,若缺失使用缺省值
支持启动时选择i18n语言,填在最后一个即可,若缺失使用缺省值
计时优化
  • Loading branch information
XXXXRT666 authored Aug 3, 2024
1 parent 6ec3a66 commit 50a63c5
Show file tree
Hide file tree
Showing 21 changed files with 117 additions and 87 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,6 @@ logs
reference
GPT_weights
SoVITS_weights
TEMP
TEMP
gweight.txt
sweight.txt
122 changes: 72 additions & 50 deletions GPT_SoVITS/inference_webui.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,14 @@
logging.getLogger("asyncio").setLevel(logging.ERROR)
logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
import LangSegment,os, re
import LangSegment, os, re, sys
import pdb
import torch

version=os.environ.get("version","v1")
language=os.environ.get("language","auto")
version="v2"if sys.argv[0]=="v2" else version
language=sys.argv[-1] if sys.argv[-1]!='v2' and sys.argv[-1]!='v1' else language
pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth"if version=="v1"else"GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth"
pretrained_gpt_name="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"if version=="v1"else "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt"

Expand Down Expand Up @@ -72,7 +75,10 @@
from tools.my_utils import load_audio
from tools.i18n.i18n import I18nAuto

i18n = I18nAuto()
if language != 'auto':
i18n = I18nAuto(language=language)
else:
i18n = I18nAuto()

# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。

Expand Down Expand Up @@ -338,6 +344,7 @@ def merge_short_text_in_array(texts, threshold):
cache= {}
def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free = False,speed=1,if_freeze=False):
global cache
t = []
if prompt_text is None or len(prompt_text) == 0:
ref_free = True
t0 = ttime()
Expand Down Expand Up @@ -379,6 +386,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
prompt = prompt_semantic.unsqueeze(0).to(device)

t1 = ttime()
t.append(t1-t0)

if (how_to_cut == i18n("凑四句一切")):
text = cut1(text)
Expand Down Expand Up @@ -449,7 +457,11 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
audio_opt.append(audio)
audio_opt.append(zero_wav)
t4 = ttime()
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
t.extend([t2 - t1,t3 - t2, t4 - t3])
t1 = ttime()
print("%.3f\t%.3f\t%.3f\t%.3f" %
(t[0], sum(t[1::3]), sum(t[2::3]), sum(t[3::3]))
)
yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(
np.int16
)
Expand Down Expand Up @@ -594,79 +606,89 @@ def get_weights_names():

SoVITS_names, GPT_names = get_weights_names()

def html_center(text, label='p'):
return f"""<div style="text-align: center; margin: 100; padding: 50;">
<{label} style="margin: 0; padding: 0;">{text}</{label}>
</div>"""

def html_left(text, label='p'):
return f"""<div style="text-align: left; margin: 0; padding: 0;">
<{label} style="margin: 0; padding: 0;">{text}</{label}>
</div>"""

with gr.Blocks(title="GPT-SoVITS WebUI") as app:
gr.Markdown(
value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.")
)
with gr.Group():
gr.Markdown(value=i18n("模型切换"))
gr.Markdown(html_center(i18n("模型切换"),'h3'))
with gr.Row():
GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path, interactive=True)
SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path, interactive=True)
refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path, interactive=True,scale=13)
SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path, interactive=True,scale=13)
refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary",scale=13)
refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown], [])
GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
gr.Markdown(value=i18n("*请上传并填写参考信息"))
gr.Markdown(html_center(i18n("*请上传并填写参考信息"),'h3'))
with gr.Row():
inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath")
with gr.Column():
inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath",scale=13)
with gr.Column(scale=13):
ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True)
gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开开启后无视填写的参考文本。"))
prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="")
gr.Markdown(html_left(i18n("使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。")))
prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=3, max_lines=3)
prompt_language = gr.Dropdown(
label=i18n("参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文")
label=i18n("参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文"),scale=14
)
gr.Markdown(value=i18n("*请填写需要合成的目标文本和语种模式"))
gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"),'h3'))
with gr.Row():
with gr.Column():
text = gr.Textbox(label=i18n("需要合成的文本"), value="")
with gr.Column(scale=13):
text = gr.Textbox(label=i18n("需要合成的文本"), value="", lines=26, max_lines=26)
with gr.Column(scale=7):
text_language = gr.Dropdown(
label=i18n("需要合成的语种"), choices=list(dict_language.keys()), value=i18n("中文")
)
how_to_cut = gr.Radio(
label=i18n("怎么切"),
choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ],
value=i18n("凑四句一切"),
interactive=True,
)
with gr.Column():
gr.Markdown(value=i18n("gpt采样参数(无参考文本时不要太低。不懂就用默认):"))
top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=10,interactive=True)
top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True)
temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True)
with gr.Column():
gr.Markdown(value=i18n("语速调整,高为更快"))
if_freeze=gr.Checkbox(label=i18n("是否直接对上次合成结果调整语速。防止随机性。"), value=False, interactive=True, show_label=True)
speed = gr.Slider(minimum=0.6,maximum=1.65,step=0.05,label=i18n("语速"),value=1,interactive=True)
label=i18n("需要合成的语种"), choices=list(dict_language.keys()), value=i18n("中文"), scale=1
)
how_to_cut = gr.Dropdown(
label=i18n("怎么切"),
choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ],
value=i18n("凑四句一切"),
interactive=True, scale=1
)
gr.Markdown(value=html_center(i18n("语速调整,高为更快")))
if_freeze=gr.Checkbox(label=i18n("是否直接对上次合成结果调整语速。防止随机性。"), value=False, interactive=True,show_label=True, scale=1)
speed = gr.Slider(minimum=0.6,maximum=1.65,step=0.05,label=i18n("语速"),value=1,interactive=True, scale=1)
gr.Markdown(html_center(i18n("GPT采样参数(无参考文本时不要太低。不懂就用默认):")))
top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=10,interactive=True, scale=1)
top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True, scale=1)
temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True, scale=1)
# with gr.Column():
# gr.Markdown(value=i18n("手工调整音素。当音素框不为空时使用手工音素输入推理,无视目标文本框。"))
# phoneme=gr.Textbox(label=i18n("音素框"), value="")
# get_phoneme_button = gr.Button(i18n("目标文本转音素"), variant="primary")
inference_button = gr.Button(i18n("合成语音"), variant="primary")
output = gr.Audio(label=i18n("输出的语音"))
with gr.Row():
inference_button = gr.Button(i18n("合成语音"), variant="primary", size='lg', scale=25)
output = gr.Audio(label=i18n("输出的语音"),scale=14)

inference_button.click(
get_tts_wav,
[inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free,speed,if_freeze],
[output],
)

gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
with gr.Row():
text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="")
button1 = gr.Button(i18n("凑四句一切"), variant="primary")
button2 = gr.Button(i18n("凑50字一切"), variant="primary")
button3 = gr.Button(i18n("按中文句号。切"), variant="primary")
button4 = gr.Button(i18n("按英文句号.切"), variant="primary")
button5 = gr.Button(i18n("按标点符号切"), variant="primary")
text_opt = gr.Textbox(label=i18n("切分后文本"), value="")
button1.click(cut1, [text_inp], [text_opt])
button2.click(cut2, [text_inp], [text_opt])
button3.click(cut3, [text_inp], [text_opt])
button4.click(cut4, [text_inp], [text_opt])
button5.click(cut5, [text_inp], [text_opt])
gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。"))
# gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
# with gr.Row():
# text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="")
# button1 = gr.Button(i18n("凑四句一切"), variant="primary")
# button2 = gr.Button(i18n("凑50字一切"), variant="primary")
# button3 = gr.Button(i18n("按中文句号。切"), variant="primary")
# button4 = gr.Button(i18n("按英文句号.切"), variant="primary")
# button5 = gr.Button(i18n("按标点符号切"), variant="primary")
# text_opt = gr.Textbox(label=i18n("切分后文本"), value="")
# button1.click(cut1, [text_inp], [text_opt])
# button2.click(cut2, [text_inp], [text_opt])
# button3.click(cut3, [text_inp], [text_opt])
# button4.click(cut4, [text_inp], [text_opt])
# button5.click(cut5, [text_inp], [text_opt])
# gr.Markdown(html_center(i18n("后续将支持转音素、手工修改音素、语音合成分步执行。")))

if __name__ == '__main__':
app.queue(concurrency_count=511, max_size=1022).launch(
Expand Down
2 changes: 1 addition & 1 deletion go-webui-v2.bat
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
runtime\python.exe webui.py v2
runtime\python.exe webui.py v2 zh_CN
pause
2 changes: 1 addition & 1 deletion go-webui-v2.ps1
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
$ErrorActionPreference = "SilentlyContinue"
chcp 65001
& "$PSScriptRoot\runtime\python.exe" "$PSScriptRoot\webui.py v2"
& "$PSScriptRoot\runtime\python.exe" "$PSScriptRoot\webui.py v2 zh_CN"
pause
2 changes: 1 addition & 1 deletion go-webui.bat
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
runtime\python.exe webui.py
runtime\python.exe webui.py zh_CN
pause
2 changes: 1 addition & 1 deletion go-webui.ps1
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
$ErrorActionPreference = "SilentlyContinue"
chcp 65001
& "$PSScriptRoot\runtime\python.exe" "$PSScriptRoot\webui.py"
& "$PSScriptRoot\runtime\python.exe" "$PSScriptRoot\webui.py zh_CN"
pause
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ pytorch-lightning
gradio==3.38.0
gradio_client==0.8.1
ffmpeg-python
onnxruntime-gpu
onnxruntime; sys_platform == 'darwin'
onnxruntime-gpu; sys_platform != 'darwin'
tqdm
funasr==1.0.27
cn2an
Expand Down
6 changes: 3 additions & 3 deletions tools/i18n/locale/en_US.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"UVR5已开启": "UVR5 opened ",
"UVR5进程输出信息": "UVR5 process output log",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proportion of normalized audio merged into dataset",
"gpt采样参数(无参考文本时不要太低。不懂就用默认):": "GPT sampling parameters (not too low when there's no reference text. Use default if unsure):",
"GPT采样参数(无参考文本时不要太低。不懂就用默认):": "GPT sampling parameters (not too low when there's no reference text. Use default if unsure):",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: FO hop size, the smaller the value, the higher the accuracy)",
"max:归一化后最大值多少": "Loudness multiplier after normalized",
"max_sil_kept:切完后静音最多留多长": "Maximum length for silence to be kept",
Expand All @@ -70,7 +70,7 @@
"人声伴奏分离批量处理, 使用UVR5模型。": "Batch processing for vocal and instrumental separation, using the UVR5 model.",
"人声提取激进程度": "Vocal extraction aggressiveness",
"伴奏人声分离&去混响&去回声": "Vocals/Accompaniment Separation & Reverberation Removal",
"使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开开启后无视填写的参考文本。": "When using the no-reference text mode, it is recommended to use a fine-tuned GPT. If the reference audio is unclear and you don't know what to write, you can enable this feature, which will ignore the reference text you've entered.",
"使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。": "When using the no-reference text mode, it is recommended to use a fine-tuned GPT. If the reference audio is unclear and you don't know what to write, you can enable this feature, which will ignore the reference text you've entered.",
"保存频率save_every_epoch": "Save frequency (save_every_epoch):",
"凑50字一切": "Slice per 50 characters",
"凑四句一切": "Slice once every 4 sentences",
Expand Down Expand Up @@ -122,7 +122,7 @@
"日英混合": "Japanese-English Mixed",
"是否仅保存最新的ckpt文件以节省硬盘空间": "Save only the latest '.ckpt' file to save disk space:",
"是否在每次保存时间点将最终小模型保存至weights文件夹": "Save a small final model to the 'weights' folder at each save point:",
"是否开启TTS推理WebUI": "Open TTS inference WEBUI",
"是否开启TTS推理WebUI": "Open TTS inference WebUI",
"是否开启UVR5-WebUI": "Open UVR5-WebUI",
"是否开启dpo训练选项(实验性)": "Enable DPO training (experimental feature)",
"是否开启打标WebUI": "Open labelling WebUI",
Expand Down
4 changes: 2 additions & 2 deletions tools/i18n/locale/es_ES.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"UVR5已开启": "UVR5 está habilitado",
"UVR5进程输出信息": "Información de salida del proceso UVR5",
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proporción de mezcla de audio normalizado que entra",
"gpt采样参数(无参考文本时不要太低。不懂就用默认):": "Parámetros de muestreo de GPT (no demasiado bajos cuando no hay texto de referencia. Use los valores por defecto si no está seguro):",
"GPT采样参数(无参考文本时不要太低。不懂就用默认):": "Parámetros de muestreo de GPT (no demasiado bajos cuando no hay texto de referencia. Use los valores por defecto si no está seguro):",
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: cómo calcular la curva de volumen, cuanto más pequeño, mayor precisión pero mayor carga computacional (mayor precisión no significa mejor rendimiento)",
"max:归一化后最大值多少": "max: valor máximo después de la normalización",
"max_sil_kept:切完后静音最多留多长": "max_sil_kept: duración máxima del silencio después del corte",
Expand All @@ -70,7 +70,7 @@
"人声伴奏分离批量处理, 使用UVR5模型。": "Procesamiento por lotes de separación de voz y acompañamiento utilizando el modelo UVR5",
"人声提取激进程度": "Nivel de agresividad en la extracción de voz",
"伴奏人声分离&去混响&去回声": "Separación de acompañamiento y voz principal y eliminación de reverberación y eco",
"使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开开启后无视填写的参考文本。": "Se recomienda usar un GPT ajustado en modo sin texto de referencia; habilítelo si no puede entender el audio de referencia (si no sabe qué escribir). Una vez habilitado, ignorará el texto de referencia ingresado.",
"使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开。<br>开启后无视填写的参考文本。": "Se recomienda usar un GPT ajustado en modo sin texto de referencia; habilítelo si no puede entender el audio de referencia (si no sabe qué escribir). Una vez habilitado, ignorará el texto de referencia ingresado.",
"保存频率save_every_epoch": "Frecuencia de guardado (cada epoch)",
"凑50字一切": "Todo para alcanzar las 50 palabras",
"凑四句一切": "Completa cuatro oraciones para rellenar todo",
Expand Down
Loading

0 comments on commit 50a63c5

Please sign in to comment.