Merge pull request prophesier#2 from IceKyrin/main

long audio process in memory
Phantasmoz · Nov 1, 2022 · 7c3cb12
2 parents 64b25c2 + 9bd507c
commit 7c3cb12
Show file tree

Hide file tree

Showing 6 changed files with 151 additions and 176 deletions.
diff --git a/infer.py b/infer.py
@@ -1,67 +1,65 @@
-import logging
+import io
+import os
 from pathlib import Path
-import soundfile
+
 import numpy as np
+import soundfile
+
 from infer_tools import infer_tool
-# from infer_tools import merge
+from infer_tools import slicer
 from infer_tools.infer_tool import Svc
 
-def run_clip(svc_model,key, acc, use_pe, use_crepe,thre, use_gt_mel, add_noise_step,project_name='',f_name=None,file_path=None,out_path=None):
-    infer_tool.mkdir(["./raw", "./results"])
-    input_wav_path = "./infer_tools/wav_temp/input"
-    #out_wav_path = "./infer_tools/wav_temp/output"
-    cut_time = 30
-    infer_tool.mkdir(["./infer_tools/wav_temp", input_wav_path])
-    infer_tool.del_temp_wav(input_wav_path)
-    if file_path==None:
+
+def run_clip(svc_model, key, acc, use_pe, use_crepe, thre, use_gt_mel, add_noise_step, project_name='', f_name=None,
+             file_path=None, out_path=None):
+    if file_path is None:
         raw_audio_path = f"./raw/{f_name}"
         clean_name = f_name[:-4]
     else:
-        raw_audio_path=file_path
+        raw_audio_path = file_path
         clean_name = str(Path(file_path).name)[:-4]
     infer_tool.format_wav(raw_audio_path)
-    infer_tool.del_temp_wav("./infer_tools/wav_temp")
-    infer_tool.cut_wav(Path(raw_audio_path).with_suffix('.wav'), clean_name, input_wav_path, cut_time)
+    audio_data, audio_sr = slicer.cut(Path(raw_audio_path).with_suffix('.wav'))
 
     count = 0
-    file_list = infer_tool.get_end_file(input_wav_path, "wav")
-    f0_tst=[]
-    f0_pred=[]
-    audio=[]
-    for file_name in file_list:
-        file_name = file_name.split("/")[-1]
-        raw_path = f"{input_wav_path}/{file_name}"
-        #out_path = f"{out_wav_path}/{file_name}"
-
+    f0_tst = []
+    f0_pred = []
+    audio = []
+    for data in audio_data:
+        raw_path = io.BytesIO()
+        soundfile.write(raw_path, data, audio_sr, format="wav")
+        raw_path.seek(0)
         _f0_tst, _f0_pred, _audio = svc_model.infer(raw_path, key=key, acc=acc, use_pe=use_pe, use_crepe=use_crepe,
-                                                 thre=thre, use_gt_mel=use_gt_mel, add_noise_step=add_noise_step)
+                                                    thre=thre, use_gt_mel=use_gt_mel, add_noise_step=add_noise_step)
         f0_tst.extend(_f0_tst)
         f0_pred.extend(_f0_pred)
         audio.extend(list(_audio))
         count += 1
-    if out_path==None:
+    if out_path is None:
         out_path = f'./results/{clean_name}_{key}key_{project_name}.wav'
-    soundfile.write(out_path,audio,24000,'PCM_16')
-    # 清除缓存文件
-    infer_tool.del_temp_wav(input_wav_path)
-    return np.array(f0_tst),np.array(f0_pred),audio
+    soundfile.write(out_path, audio, 24000, 'PCM_16')
+    return np.array(f0_tst), np.array(f0_pred), audio
 
-if __name__=='__main__':
-    logging.getLogger('numba').setLevel(logging.WARNING)
 
+if __name__ == '__main__':
     # 工程文件夹名，训练时用的那个
-    project_name = "nyaru"
-    model_path = f'./DiffSingerRM/checkpoints/{project_name}/model_ckpt_steps_112000.ckpt'
+    project_name = "yilanqiu"
+    model_path = f'./checkpoints/{project_name}/model_ckpt_steps_246000.ckpt'
     config_path = f'./checkpoints/{project_name}/config.yaml'
-    # 支持多个wav文件，放在raw文件夹下
-    file_names = ["群青.wav"]
-    trans = [0]  # 音高调整，支持正负（半音）
+
+    # 支持多个wav/ogg文件，放在raw文件夹下，带扩展名
+    file_names = ["青花瓷.wav"]
+    trans = [0]  # 音高调整，支持正负（半音），数量与上一行对应，不足的自动按第一个移调参数补齐
     # 加速倍数
     accelerate = 20
-    hubert_gpu=True
-    infer_tool.fill_a_to_b(trans, file_names)
+    hubert_gpu = True
+    cut_time = 30
+
     # 下面不动
-    model = Svc(project_name,config_path,hubert_gpu, model_path)
+    infer_tool.mkdir(["./raw", "./results"])
+    infer_tool.fill_a_to_b(trans, file_names)
+
+    model = Svc(project_name, config_path, hubert_gpu, model_path)
     for f_name, tran in zip(file_names, trans):
-        run_clip(model,key=tran,acc=accelerate,use_crepe=True,thre=0.05,use_pe=True,use_gt_mel=False,add_noise_step=500,f_name=f_name,project_name=project_name)
-
+        run_clip(model, key=tran, acc=accelerate, use_crepe=True, thre=0.05, use_pe=True, use_gt_mel=False,
+                 add_noise_step=500, f_name=f_name, project_name=project_name)
diff --git a/infer_tools/infer_tool.py b/infer_tools/infer_tool.py
@@ -1,30 +1,22 @@
-import logging
 import os
-import shutil
-import subprocess
 import time
 
 import librosa
 import numpy as np
 import soundfile
 import torch
-import torchaudio
 
 import utils
+from io import BytesIO
 from modules.fastspeech.pe import PitchExtractor
 from network.diff.candidate_decoder import FFT
 from network.diff.diffusion import GaussianDiffusion
 from network.diff.net import DiffNet
-from network.vocoders.base_vocoder import VOCODERS
-from network.vocoders.base_vocoder import get_vocoder_cls
+from network.vocoders.base_vocoder import VOCODERS, get_vocoder_cls
 from preprocessing.data_gen_utils import get_pitch_parselmouth, get_pitch_crepe
 from preprocessing.hubertinfer import Hubertencoder
-from utils.hparams import hparams
-from utils.hparams import set_hparams
-from utils.pitch_utils import denorm_f0
-from utils.pitch_utils import norm_interp_f0
-
-logging.getLogger('matplotlib').setLevel(logging.WARNING)
+from utils.hparams import hparams, set_hparams
+from utils.pitch_utils import denorm_f0, norm_interp_f0
 
 
 def timeit(func):
@@ -42,32 +34,6 @@ def format_wav(audio_path):
     soundfile.write(audio_path[:-4] + ".wav", raw_audio, raw_sample_rate)
 
 
-def cut_wav(raw_audio_path, out_audio_name, input_wav_path, cut_time):
-    raw_audio, raw_sr = torchaudio.load(raw_audio_path)
-    if raw_audio.shape[-1] / raw_sr > cut_time:
-        subprocess.Popen(
-            f"python ./infer_tools/slicer.py {raw_audio_path} --out_name {out_audio_name} --out {input_wav_path}  --db_thresh -30",
-            shell=True).wait()
-    else:
-        shutil.copy(raw_audio_path, f"{input_wav_path}/{out_audio_name}-00.wav")
-
-
-def get_end_file(dir_path, end):
-    file_lists = []
-    for root, dirs, files in os.walk(dir_path):
-        files = [f for f in files if f[0] != '.']
-        dirs[:] = [d for d in dirs if d[0] != '.']
-        for f_file in files:
-            if f_file.endswith(end):
-                file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
-    return file_lists
-
-
-def del_temp_wav(path_data):
-    for i in get_end_file(path_data, "wav"):  # os.listdir(path_data)#返回一个列表，里面是当前目录下面的所有东西的相对路径
-        os.remove(i)
-
-
 def fill_a_to_b(a, b):
     if len(a) < len(b):
         for _ in range(0, len(b) - len(a)):
@@ -81,7 +47,8 @@ def mkdir(paths: list):
 
 
 class Svc:
-    def __init__(self, project_name,config_name,hubert_gpu, model_path):
+    def __init__(self, project_name, config_name, hubert_gpu, model_path):
+        self.project_name = project_name
         self.DIFF_DECODERS = {
             'wavenet': lambda hp: DiffNet(hp['audio_num_mel_bins']),
             'fft': lambda hp: FFT(
@@ -91,7 +58,7 @@ def __init__(self, project_name,config_name,hubert_gpu, model_path):
         self.model_path = model_path
         self.dev = torch.device("cuda")
 
-        self._ = set_hparams(config=config_name, exp_name=project_name, infer=True,
+        self._ = set_hparams(config=config_name, exp_name=self.project_name, infer=True,
                              reset=True,
                              hparams_str='',
                              print_hparams=False)
@@ -107,8 +74,8 @@ def __init__(self, project_name,config_name,hubert_gpu, model_path):
         )
         self.load_ckpt()
         self.model.cuda()
-        hparams['hubert_gpu']=hubert_gpu
-        hparams['use_uv']=True
+        hparams['hubert_gpu'] = hubert_gpu
+        hparams['use_uv'] = True
         self.hubert = Hubertencoder(hparams['hubert_path'])
         self.pe = PitchExtractor().cuda()
         utils.load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True)
@@ -136,7 +103,7 @@ def infer(self, in_path, key, acc, use_pe=True, use_crepe=True, thre=0.05, **kwa
         batch['mel2ph_pred'] = outputs['mel2ph']
         batch['f0_gt'] = denorm_f0(batch['f0'], batch['uv'], hparams)
         if use_pe:
-            hparams['use_uv']=True
+            hparams['use_uv'] = True
             batch['f0_pred'] = self.pe(outputs['mel_out'])['f0_denorm_pred'].detach()
         else:
             batch['f0_pred'] = outputs.get('f0_denorm')
@@ -151,21 +118,12 @@ def after_infer(self, prediction):
         # remove paddings
         mel_gt = prediction["mels"]
         mel_gt_mask = np.abs(mel_gt).sum(-1) > 0
-        mel_gt = mel_gt[mel_gt_mask]
-        mel2ph_gt = prediction.get("mel2ph")
-        mel2ph_gt = mel2ph_gt[mel_gt_mask] if mel2ph_gt is not None else None
+
         mel_pred = prediction["outputs"]
         mel_pred_mask = np.abs(mel_pred).sum(-1) > 0
         mel_pred = mel_pred[mel_pred_mask]
-        mel_gt = np.clip(mel_gt, hparams['mel_vmin'], hparams['mel_vmax'])
         mel_pred = np.clip(mel_pred, hparams['mel_vmin'], hparams['mel_vmax'])
 
-        mel2ph_pred = prediction.get("mel2ph_pred")
-        if mel2ph_pred is not None:
-            if len(mel2ph_pred) > len(mel_pred_mask):
-                mel2ph_pred = mel2ph_pred[:len(mel_pred_mask)]
-            mel2ph_pred = mel2ph_pred[mel_pred_mask]
-
         f0_gt = prediction.get("f0_gt")
         f0_pred = prediction.get("f0_pred")
         if f0_pred is not None:
@@ -183,6 +141,7 @@ def temporary_dict2processed_input(self, item_name, temp_dict, use_crepe=True, t
         '''
 
         binarization_args = hparams['binarization_args']
+
         @timeit
         def get_pitch(wav, mel):
             # get ground truth f0 by self.get_pitch_algorithm
@@ -230,27 +189,20 @@ def get_align(mel, phone_encoded):
                 get_align(mel, hubert_encoded)
         return processed_input
 
-    def pre(self, in_path, accelerate, use_crepe=True, thre=0.05):
-        temp_dict = self.temporary_dict2processed_input(*file2temporary_dict(in_path), use_crepe, thre)
+    def pre(self, wav_fn, accelerate, use_crepe=True, thre=0.05):
+        if isinstance(wav_fn, BytesIO):
+            item_name = self.project_name
+        else:
+            song_info = wav_fn.split('/')
+            item_name = song_info[-1].split('.')[-2]
+        temp_dict = {'wav_fn': wav_fn, 'spk_id': self.project_name}
+
+        temp_dict = self.temporary_dict2processed_input(item_name, temp_dict, use_crepe, thre)
         hparams['pndm_speedup'] = accelerate
         batch = processed_input2batch([getitem(temp_dict)])
         return batch
 
 
-def file2temporary_dict(wav_fn):
-    '''
-        read from file, store data in temporary dicts
-    '''
-    song_info = wav_fn.split('/')
-    item_name = raw_item_name = song_info[-1].split('.')[-2]
-    temp_dict = {}
-
-    temp_dict['wav_fn'] = wav_fn
-    temp_dict['spk_id'] = 'opencpop'
-
-    return item_name, temp_dict
-
-
 def getitem(item):
     max_frames = hparams['max_frames']
     spec = torch.Tensor(item['mel'])[:max_frames]

diff --git a/infer_tools/slicer.py b/infer_tools/slicer.py
@@ -162,5 +162,31 @@ def main():
                         audio[start:len(audio)], sr)
 
 
+def cut(audio_path, db_thresh=-30, min_len=5000, win_l=300, win_s=20, max_sil_kept=500):
+    audio, sr = torchaudio.load(audio_path)
+    if len(audio.shape) == 2 and audio.shape[1] >= 2:
+        audio = torch.mean(audio, dim=0).unsqueeze(0)
+    audio = audio.cpu().numpy()[0]
+
+    slicer = Slicer(
+        sr=sr,
+        db_threshold=db_thresh,
+        min_length=min_len,
+        win_l=win_l,
+        win_s=win_s,
+        max_silence_kept=max_sil_kept
+    )
+    chunks = slicer.slice(audio)
+    start = 0
+    result = []
+    for i, chunk in enumerate(chunks):
+        end = chunk
+        result.append(audio[start:end])
+        start = end
+    if start != len(audio):
+        result.append(audio[start:len(audio)])
+    return result, sr
+
+
 if __name__ == '__main__':
     main()
diff --git a/network/hubert/hubert_model.py b/network/hubert/hubert_model.py
@@ -2,6 +2,7 @@
 import os
 import random
 from typing import Optional, Tuple
+
 import librosa
 import numpy as np
 import torch
@@ -230,12 +231,7 @@ def hubert_soft(
     return hubert
 
 
-def get_units(hbt_soft, raw_wav_path,dev=torch.device('cuda')):
-    # source, sr = torchaudio.load(raw_wav_path)
-    # source = torchaudio.functional.resample(source, sr, 16000)
-    # if len(source.shape) == 2 and source.shape[1] >= 2:
-    #     source = torch.mean(source, dim=0).unsqueeze(0)
-    # source = source.unsqueeze(0).to(dev)
+def get_units(hbt_soft, raw_wav_path, dev=torch.device('cuda')):
     wav, sr = librosa.load(raw_wav_path, sr=None)
     assert (sr >= 16000)
     if len(wav.shape) > 1:
@@ -244,7 +240,7 @@ def get_units(hbt_soft, raw_wav_path,dev=torch.device('cuda')):
         wav16 = librosa.resample(wav, sr, 16000)
     else:
         wav16 = wav
-    dev = torch.device("cuda" if (dev==torch.device('cuda') and torch.cuda.is_available()) else "cpu")
+    dev = torch.device("cuda" if (dev == torch.device('cuda') and torch.cuda.is_available()) else "cpu")
     torch.cuda.is_available() and torch.cuda.empty_cache()
     with torch.inference_mode():
         units = hbt_soft.units(torch.FloatTensor(wav16.astype(float)).unsqueeze(0).unsqueeze(0).to(dev))
@@ -264,16 +260,17 @@ def get_end_file(dir_path, end):
 
 if __name__ == '__main__':
     from pathlib import Path
+
     dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     # hubert的模型路径
-    hbt_model = hubert_soft(list(Path(hparams['hubert_path']).home().rglob('*.pt'))[0])
+    hbt_model = hubert_soft(str(list(Path(hparams['hubert_path']).home().rglob('*.pt'))[0]))
     # 这个不用改，自动在根目录下所有wav的同文件夹生成其对应的npy
     file_lists = list(Path(hparams['raw_data_dir']).rglob('*.wav'))
     nums = len(file_lists)
     count = 0
     for wav_path in file_lists:
         npy_path = wav_path.with_suffix(".npy")
         npy_content = get_units(hbt_model, wav_path).cpu().numpy()[0]
-        np.save(npy_path, npy_content)
+        np.save(str(npy_path), npy_content)
         count += 1
         print(f"hubert process：{round(count * 100 / nums, 2)}%")