Skip to content

Commit

Permalink
Merge pull request prophesier#2 from IceKyrin/main
Browse files Browse the repository at this point in the history
long audio process in memory
  • Loading branch information
prophesier authored Nov 1, 2022
2 parents 64b25c2 + 9bd507c commit 7c3cb12
Show file tree
Hide file tree
Showing 6 changed files with 151 additions and 176 deletions.
80 changes: 39 additions & 41 deletions infer.py
Original file line number Diff line number Diff line change
@@ -1,67 +1,65 @@
import logging
import io
import os
from pathlib import Path
import soundfile

import numpy as np
import soundfile

from infer_tools import infer_tool
# from infer_tools import merge
from infer_tools import slicer
from infer_tools.infer_tool import Svc

def run_clip(svc_model,key, acc, use_pe, use_crepe,thre, use_gt_mel, add_noise_step,project_name='',f_name=None,file_path=None,out_path=None):
infer_tool.mkdir(["./raw", "./results"])
input_wav_path = "./infer_tools/wav_temp/input"
#out_wav_path = "./infer_tools/wav_temp/output"
cut_time = 30
infer_tool.mkdir(["./infer_tools/wav_temp", input_wav_path])
infer_tool.del_temp_wav(input_wav_path)
if file_path==None:

def run_clip(svc_model, key, acc, use_pe, use_crepe, thre, use_gt_mel, add_noise_step, project_name='', f_name=None,
file_path=None, out_path=None):
if file_path is None:
raw_audio_path = f"./raw/{f_name}"
clean_name = f_name[:-4]
else:
raw_audio_path=file_path
raw_audio_path = file_path
clean_name = str(Path(file_path).name)[:-4]
infer_tool.format_wav(raw_audio_path)
infer_tool.del_temp_wav("./infer_tools/wav_temp")
infer_tool.cut_wav(Path(raw_audio_path).with_suffix('.wav'), clean_name, input_wav_path, cut_time)
audio_data, audio_sr = slicer.cut(Path(raw_audio_path).with_suffix('.wav'))

count = 0
file_list = infer_tool.get_end_file(input_wav_path, "wav")
f0_tst=[]
f0_pred=[]
audio=[]
for file_name in file_list:
file_name = file_name.split("/")[-1]
raw_path = f"{input_wav_path}/{file_name}"
#out_path = f"{out_wav_path}/{file_name}"

f0_tst = []
f0_pred = []
audio = []
for data in audio_data:
raw_path = io.BytesIO()
soundfile.write(raw_path, data, audio_sr, format="wav")
raw_path.seek(0)
_f0_tst, _f0_pred, _audio = svc_model.infer(raw_path, key=key, acc=acc, use_pe=use_pe, use_crepe=use_crepe,
thre=thre, use_gt_mel=use_gt_mel, add_noise_step=add_noise_step)
thre=thre, use_gt_mel=use_gt_mel, add_noise_step=add_noise_step)
f0_tst.extend(_f0_tst)
f0_pred.extend(_f0_pred)
audio.extend(list(_audio))
count += 1
if out_path==None:
if out_path is None:
out_path = f'./results/{clean_name}_{key}key_{project_name}.wav'
soundfile.write(out_path,audio,24000,'PCM_16')
# 清除缓存文件
infer_tool.del_temp_wav(input_wav_path)
return np.array(f0_tst),np.array(f0_pred),audio
soundfile.write(out_path, audio, 24000, 'PCM_16')
return np.array(f0_tst), np.array(f0_pred), audio

if __name__=='__main__':
logging.getLogger('numba').setLevel(logging.WARNING)

if __name__ == '__main__':
# 工程文件夹名,训练时用的那个
project_name = "nyaru"
model_path = f'./DiffSingerRM/checkpoints/{project_name}/model_ckpt_steps_112000.ckpt'
project_name = "yilanqiu"
model_path = f'./checkpoints/{project_name}/model_ckpt_steps_246000.ckpt'
config_path = f'./checkpoints/{project_name}/config.yaml'
# 支持多个wav文件,放在raw文件夹下
file_names = ["群青.wav"]
trans = [0] # 音高调整,支持正负(半音)

# 支持多个wav/ogg文件,放在raw文件夹下,带扩展名
file_names = ["青花瓷.wav"]
trans = [0] # 音高调整,支持正负(半音),数量与上一行对应,不足的自动按第一个移调参数补齐
# 加速倍数
accelerate = 20
hubert_gpu=True
infer_tool.fill_a_to_b(trans, file_names)
hubert_gpu = True
cut_time = 30

# 下面不动
model = Svc(project_name,config_path,hubert_gpu, model_path)
infer_tool.mkdir(["./raw", "./results"])
infer_tool.fill_a_to_b(trans, file_names)

model = Svc(project_name, config_path, hubert_gpu, model_path)
for f_name, tran in zip(file_names, trans):
run_clip(model,key=tran,acc=accelerate,use_crepe=True,thre=0.05,use_pe=True,use_gt_mel=False,add_noise_step=500,f_name=f_name,project_name=project_name)

run_clip(model, key=tran, acc=accelerate, use_crepe=True, thre=0.05, use_pe=True, use_gt_mel=False,
add_noise_step=500, f_name=f_name, project_name=project_name)
90 changes: 21 additions & 69 deletions infer_tools/infer_tool.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,22 @@
import logging
import os
import shutil
import subprocess
import time

import librosa
import numpy as np
import soundfile
import torch
import torchaudio

import utils
from io import BytesIO
from modules.fastspeech.pe import PitchExtractor
from network.diff.candidate_decoder import FFT
from network.diff.diffusion import GaussianDiffusion
from network.diff.net import DiffNet
from network.vocoders.base_vocoder import VOCODERS
from network.vocoders.base_vocoder import get_vocoder_cls
from network.vocoders.base_vocoder import VOCODERS, get_vocoder_cls
from preprocessing.data_gen_utils import get_pitch_parselmouth, get_pitch_crepe
from preprocessing.hubertinfer import Hubertencoder
from utils.hparams import hparams
from utils.hparams import set_hparams
from utils.pitch_utils import denorm_f0
from utils.pitch_utils import norm_interp_f0

logging.getLogger('matplotlib').setLevel(logging.WARNING)
from utils.hparams import hparams, set_hparams
from utils.pitch_utils import denorm_f0, norm_interp_f0


def timeit(func):
Expand All @@ -42,32 +34,6 @@ def format_wav(audio_path):
soundfile.write(audio_path[:-4] + ".wav", raw_audio, raw_sample_rate)


def cut_wav(raw_audio_path, out_audio_name, input_wav_path, cut_time):
    """Prepare the input clip(s) for inference inside ``input_wav_path``.

    The audio is loaded with ``torchaudio.load``. When its duration
    (last-dimension length divided by the sample rate) exceeds ``cut_time``,
    ``./infer_tools/slicer.py`` is run as a child process to write the pieces
    into ``input_wav_path``; otherwise the file is copied there unchanged as
    ``{out_audio_name}-00.wav``.

    Args:
        raw_audio_path: Path (str or path-like) of the audio file to process.
        out_audio_name: Base name handed to the slicer / used for the copy.
        input_wav_path: Directory that receives the resulting wav file(s).
        cut_time: Duration threshold; compared against length / sample rate.
    """
    raw_audio, raw_sr = torchaudio.load(raw_audio_path)
    if raw_audio.shape[-1] / raw_sr > cut_time:
        # Pass the arguments as a list and do not go through a shell: file
        # names containing spaces or shell metacharacters are then handed to
        # the slicer verbatim instead of being re-parsed (or executed).
        # The exit status is deliberately not checked, matching the previous
        # best-effort behaviour of ``Popen(...).wait()``.
        subprocess.run(
            [
                "python",
                "./infer_tools/slicer.py",
                str(raw_audio_path),
                "--out_name", str(out_audio_name),
                "--out", str(input_wav_path),
                "--db_thresh", "-30",
            ]
        )
    else:
        shutil.copy(raw_audio_path, f"{input_wav_path}/{out_audio_name}-00.wav")


def get_end_file(dir_path, end):
    """Recursively collect files under ``dir_path`` whose name ends with ``end``.

    Files and directories whose name starts with ``.`` are ignored (such
    directories are not descended into). Every returned path is
    ``os.path.join(root, name)`` with backslashes replaced by forward slashes.
    """
    matches = []
    for root, dirs, files in os.walk(dir_path):
        # Prune dot-directories in place so os.walk does not visit them.
        dirs[:] = [sub for sub in dirs if sub[0] != '.']
        matches.extend(
            os.path.join(root, name).replace("\\", "/")
            for name in files
            if name[0] != '.' and name.endswith(end)
        )
    return matches


def del_temp_wav(path_data):
    """Remove every file found by ``get_end_file(path_data, "wav")``.

    The file list comes from ``get_end_file`` rather than ``os.listdir``
    (which would return the relative paths of everything in the directory).
    """
    wav_files = get_end_file(path_data, "wav")
    for wav_file in wav_files:
        os.remove(wav_file)


def fill_a_to_b(a, b):
if len(a) < len(b):
for _ in range(0, len(b) - len(a)):
Expand All @@ -81,7 +47,8 @@ def mkdir(paths: list):


class Svc:
def __init__(self, project_name,config_name,hubert_gpu, model_path):
def __init__(self, project_name, config_name, hubert_gpu, model_path):
self.project_name = project_name
self.DIFF_DECODERS = {
'wavenet': lambda hp: DiffNet(hp['audio_num_mel_bins']),
'fft': lambda hp: FFT(
Expand All @@ -91,7 +58,7 @@ def __init__(self, project_name,config_name,hubert_gpu, model_path):
self.model_path = model_path
self.dev = torch.device("cuda")

self._ = set_hparams(config=config_name, exp_name=project_name, infer=True,
self._ = set_hparams(config=config_name, exp_name=self.project_name, infer=True,
reset=True,
hparams_str='',
print_hparams=False)
Expand All @@ -107,8 +74,8 @@ def __init__(self, project_name,config_name,hubert_gpu, model_path):
)
self.load_ckpt()
self.model.cuda()
hparams['hubert_gpu']=hubert_gpu
hparams['use_uv']=True
hparams['hubert_gpu'] = hubert_gpu
hparams['use_uv'] = True
self.hubert = Hubertencoder(hparams['hubert_path'])
self.pe = PitchExtractor().cuda()
utils.load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True)
Expand Down Expand Up @@ -136,7 +103,7 @@ def infer(self, in_path, key, acc, use_pe=True, use_crepe=True, thre=0.05, **kwa
batch['mel2ph_pred'] = outputs['mel2ph']
batch['f0_gt'] = denorm_f0(batch['f0'], batch['uv'], hparams)
if use_pe:
hparams['use_uv']=True
hparams['use_uv'] = True
batch['f0_pred'] = self.pe(outputs['mel_out'])['f0_denorm_pred'].detach()
else:
batch['f0_pred'] = outputs.get('f0_denorm')
Expand All @@ -151,21 +118,12 @@ def after_infer(self, prediction):
# remove paddings
mel_gt = prediction["mels"]
mel_gt_mask = np.abs(mel_gt).sum(-1) > 0
mel_gt = mel_gt[mel_gt_mask]
mel2ph_gt = prediction.get("mel2ph")
mel2ph_gt = mel2ph_gt[mel_gt_mask] if mel2ph_gt is not None else None

mel_pred = prediction["outputs"]
mel_pred_mask = np.abs(mel_pred).sum(-1) > 0
mel_pred = mel_pred[mel_pred_mask]
mel_gt = np.clip(mel_gt, hparams['mel_vmin'], hparams['mel_vmax'])
mel_pred = np.clip(mel_pred, hparams['mel_vmin'], hparams['mel_vmax'])

mel2ph_pred = prediction.get("mel2ph_pred")
if mel2ph_pred is not None:
if len(mel2ph_pred) > len(mel_pred_mask):
mel2ph_pred = mel2ph_pred[:len(mel_pred_mask)]
mel2ph_pred = mel2ph_pred[mel_pred_mask]

f0_gt = prediction.get("f0_gt")
f0_pred = prediction.get("f0_pred")
if f0_pred is not None:
Expand All @@ -183,6 +141,7 @@ def temporary_dict2processed_input(self, item_name, temp_dict, use_crepe=True, t
'''

binarization_args = hparams['binarization_args']

@timeit
def get_pitch(wav, mel):
# get ground truth f0 by self.get_pitch_algorithm
Expand Down Expand Up @@ -230,27 +189,20 @@ def get_align(mel, phone_encoded):
get_align(mel, hubert_encoded)
return processed_input

def pre(self, in_path, accelerate, use_crepe=True, thre=0.05):
temp_dict = self.temporary_dict2processed_input(*file2temporary_dict(in_path), use_crepe, thre)
def pre(self, wav_fn, accelerate, use_crepe=True, thre=0.05):
if isinstance(wav_fn, BytesIO):
item_name = self.project_name
else:
song_info = wav_fn.split('/')
item_name = song_info[-1].split('.')[-2]
temp_dict = {'wav_fn': wav_fn, 'spk_id': self.project_name}

temp_dict = self.temporary_dict2processed_input(item_name, temp_dict, use_crepe, thre)
hparams['pndm_speedup'] = accelerate
batch = processed_input2batch([getitem(temp_dict)])
return batch


def file2temporary_dict(wav_fn):
    """Build the item name and the temporary info dict for one wav path.

    The item name is the second-to-last ``.``-separated piece of the last
    ``/``-separated component of ``wav_fn``. The dict stores the path under
    ``'wav_fn'`` and the fixed speaker id ``'opencpop'`` under ``'spk_id'``.

    Returns:
        A ``(item_name, temp_dict)`` tuple.
    """
    base_name = wav_fn.split('/')[-1]
    item_name = base_name.split('.')[-2]
    return item_name, {'wav_fn': wav_fn, 'spk_id': 'opencpop'}


def getitem(item):
max_frames = hparams['max_frames']
spec = torch.Tensor(item['mel'])[:max_frames]
Expand Down
26 changes: 26 additions & 0 deletions infer_tools/slicer.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,5 +162,31 @@ def main():
audio[start:len(audio)], sr)


def cut(audio_path, db_thresh=-30, min_len=5000, win_l=300, win_s=20, max_sil_kept=500):
    """Load an audio file and split it in memory at the points given by Slicer.

    The file is read with ``torchaudio.load``; a 2-D tensor whose second
    dimension has at least two entries is averaged over dim 0, and the first
    row of the result is taken as a numpy array. ``Slicer.slice`` is then
    called on it and each returned value is used as the end index of the next
    segment; any remainder after the last index is appended as a final segment.

    Args:
        audio_path: Path of the audio file to load.
        db_thresh: Forwarded to ``Slicer`` as ``db_threshold``.
        min_len: Forwarded to ``Slicer`` as ``min_length``.
        win_l: Forwarded to ``Slicer`` as ``win_l``.
        win_s: Forwarded to ``Slicer`` as ``win_s``.
        max_sil_kept: Forwarded to ``Slicer`` as ``max_silence_kept``.

    Returns:
        ``(segments, sr)``: the list of array slices and the sample rate
        reported by ``torchaudio.load``.
    """
    waveform, sr = torchaudio.load(audio_path)
    if len(waveform.shape) == 2 and waveform.shape[1] >= 2:
        waveform = torch.mean(waveform, dim=0).unsqueeze(0)
    samples = waveform.cpu().numpy()[0]

    cut_points = Slicer(
        sr=sr,
        db_threshold=db_thresh,
        min_length=min_len,
        win_l=win_l,
        win_s=win_s,
        max_silence_kept=max_sil_kept
    ).slice(samples)

    segments = []
    previous = 0
    for point in cut_points:
        segments.append(samples[previous:point])
        previous = point
    # Keep whatever follows the last cut point.
    if previous != len(samples):
        segments.append(samples[previous:len(samples)])
    return segments, sr


if __name__ == '__main__':
main()
15 changes: 6 additions & 9 deletions network/hubert/hubert_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import random
from typing import Optional, Tuple

import librosa
import numpy as np
import torch
Expand Down Expand Up @@ -230,12 +231,7 @@ def hubert_soft(
return hubert


def get_units(hbt_soft, raw_wav_path,dev=torch.device('cuda')):
# source, sr = torchaudio.load(raw_wav_path)
# source = torchaudio.functional.resample(source, sr, 16000)
# if len(source.shape) == 2 and source.shape[1] >= 2:
# source = torch.mean(source, dim=0).unsqueeze(0)
# source = source.unsqueeze(0).to(dev)
def get_units(hbt_soft, raw_wav_path, dev=torch.device('cuda')):
wav, sr = librosa.load(raw_wav_path, sr=None)
assert (sr >= 16000)
if len(wav.shape) > 1:
Expand All @@ -244,7 +240,7 @@ def get_units(hbt_soft, raw_wav_path,dev=torch.device('cuda')):
wav16 = librosa.resample(wav, sr, 16000)
else:
wav16 = wav
dev = torch.device("cuda" if (dev==torch.device('cuda') and torch.cuda.is_available()) else "cpu")
dev = torch.device("cuda" if (dev == torch.device('cuda') and torch.cuda.is_available()) else "cpu")
torch.cuda.is_available() and torch.cuda.empty_cache()
with torch.inference_mode():
units = hbt_soft.units(torch.FloatTensor(wav16.astype(float)).unsqueeze(0).unsqueeze(0).to(dev))
Expand All @@ -264,16 +260,17 @@ def get_end_file(dir_path, end):

if __name__ == '__main__':
from pathlib import Path

dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# hubert的模型路径
hbt_model = hubert_soft(list(Path(hparams['hubert_path']).home().rglob('*.pt'))[0])
hbt_model = hubert_soft(str(list(Path(hparams['hubert_path']).home().rglob('*.pt'))[0]))
# 这个不用改,自动在根目录下所有wav的同文件夹生成其对应的npy
file_lists = list(Path(hparams['raw_data_dir']).rglob('*.wav'))
nums = len(file_lists)
count = 0
for wav_path in file_lists:
npy_path = wav_path.with_suffix(".npy")
npy_content = get_units(hbt_model, wav_path).cpu().numpy()[0]
np.save(npy_path, npy_content)
np.save(str(npy_path), npy_content)
count += 1
print(f"hubert process:{round(count * 100 / nums, 2)}%")
Loading

0 comments on commit 7c3cb12

Please sign in to comment.