add vec

jwjohns · Nov 11, 2022 · 675c726 · 675c726
1 parent 360c479
commit 675c726
Show file tree

Hide file tree

Showing 7 changed files with 183 additions and 20 deletions.
diff --git a/infer.py b/infer.py
@@ -9,6 +9,7 @@
 from infer_tools.infer_tool import Svc
 from utils.hparams import hparams
 
+
 def run_clip(svc_model, key, acc, use_pe, use_crepe, thre, use_gt_mel, add_noise_step, project_name='', f_name=None,
              file_path=None, out_path=None):
     if file_path is None:
@@ -24,31 +25,33 @@ def run_clip(svc_model, key, acc, use_pe, use_crepe, thre, use_gt_mel, add_noise
     f0_tst = []
     f0_pred = []
     audio = []
-    epsilon=0.0002
+    epsilon = 0.0002
     for data in audio_data:
-        print(f'#=====segment start, {round(len(data)/audio_sr,3)}s======')
-        length=int(len(data)/audio_sr*hparams['audio_sample_rate'])
+        print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
+        length = int(len(data) / audio_sr * hparams['audio_sample_rate'])
         raw_path = io.BytesIO()
         soundfile.write(raw_path, data, audio_sr, format="wav")
         if hparams['debug']:
-            print(np.mean(data),np.var(data))
+            print(np.mean(data), np.var(data))
         raw_path.seek(0)
-        if np.var(data)<epsilon:
+        if np.var(data) < epsilon:
             print('jump empty segment')
-            _f0_tst, _f0_pred, _audio =(np.zeros(int(length/hparams['hop_size'])),np.zeros(int(length/hparams['hop_size'])),np.zeros(length))
+            _f0_tst, _f0_pred, _audio = (
+                np.zeros(int(length / hparams['hop_size'])), np.zeros(int(length / hparams['hop_size'])),
+                np.zeros(length))
         else:
             _f0_tst, _f0_pred, _audio = svc_model.infer(raw_path, key=key, acc=acc, use_pe=use_pe, use_crepe=use_crepe,
-                                                    thre=thre, use_gt_mel=use_gt_mel, add_noise_step=add_noise_step)
-        fix_audio=np.zeros(length)
-        fix_audio[:]=np.mean(_audio)
-        fix_audio[:len(_audio)]=_audio
+                                                        thre=thre, use_gt_mel=use_gt_mel, add_noise_step=add_noise_step)
+        fix_audio = np.zeros(length)
+        fix_audio[:] = np.mean(_audio)
+        fix_audio[:len(_audio)] = _audio
         f0_tst.extend(_f0_tst)
         f0_pred.extend(_f0_pred)
         audio.extend(list(fix_audio))
         count += 1
     if out_path is None:
-        out_path = f'./results/{clean_name}_{key}key_{project_name}.wav'
-    soundfile.write(out_path, audio, 24000, 'PCM_16')
+        out_path = f'./results/{clean_name}_{key}key_{project_name}_{hparams["residual_channels"]}_{hparams["residual_layers"]}_{int(step / 1000)}k_{accelerate}x.wav'
+    soundfile.write(out_path, audio, hparams["audio_sample_rate"], 'PCM_16')
     return np.array(f0_tst), np.array(f0_pred), audio
 
 
@@ -64,7 +67,8 @@ def run_clip(svc_model, key, acc, use_pe, use_crepe, thre, use_gt_mel, add_noise
     # 加速倍数
     accelerate = 20
     hubert_gpu = True
-    cut_time = 30
+
+    step = int(model_path.split("_")[-1].split(".")[0])
 
     # 下面不动
     infer_tool.mkdir(["./raw", "./results"])

diff --git a/network/hubert/vec_model.py b/network/hubert/vec_model.py
@@ -0,0 +1,59 @@
+from pathlib import Path
+
+import librosa
+import numpy as np
+import torch
+from fairseq import checkpoint_utils
+
+
+def load_model(vec_path):
+    """Load a fairseq checkpoint and return its first model, ready for inference.
+
+    Args:
+        vec_path: Path of the checkpoint file handed to fairseq.
+
+    Returns:
+        The first model of the loaded ensemble, moved to CUDA when it is
+        available (CPU otherwise) and switched to eval mode.
+    """
+    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print("load model(s) from {}".format(vec_path))
+    # The saved config and task objects returned alongside the ensemble are not needed here.
+    ensemble, _saved_cfg, _task = checkpoint_utils.load_model_ensemble_and_task([vec_path], suffix="")
+    vec_model = ensemble[0].to(target)
+    vec_model.eval()
+    return vec_model
+
+
+def get_vec_units(con_model, audio_path, dev):
+    """Extract content features for one audio file.
+
+    Args:
+        con_model: Model exposing ``extract_features`` and ``final_proj``
+            (e.g. the object returned by ``load_model``).
+        audio_path: Audio source accepted by ``librosa.load``.
+        dev: Torch device the inputs are moved to; should match the device
+            the model lives on.
+
+    Returns:
+        The tensor produced by ``con_model.final_proj`` on the first element
+        of the ``extract_features`` result. The input is fed as a batch of
+        one, so the result presumably has a leading batch axis of 1
+        (callers index ``[0]``) — confirm against the model.
+    """
+    wav, sr = librosa.load(audio_path)
+    if wav.ndim > 1:
+        wav = librosa.to_mono(wav.transpose(1, 0))
+    if sr != 16000:
+        wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
+
+    source = torch.from_numpy(wav).float()
+    if source.dim() == 2:  # double channels
+        source = source.mean(-1)
+    assert source.dim() == 1, source.dim()
+    source = source.view(1, -1)
+    # Nothing is padded: a single utterance is processed at its full length.
+    mask = torch.zeros(source.shape, dtype=torch.bool)
+    with torch.no_grad():
+        extracted = con_model.extract_features(
+            source=source.to(dev),
+            padding_mask=mask.to(dev),
+            output_layer=9,  # layer 9
+        )
+        return con_model.final_proj(extracted[0])
+
+
+if __name__ == '__main__':
+    # Batch mode: write a .npy of extracted units next to every .wav under the data root.
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model_path = "../../checkpoints/checkpoint_best_legacy_500.pt"  # checkpoint_best_legacy_500.pt
+    vec_model = load_model(model_path)
+    # No need to change this: for every wav found under the root directory, the
+    # matching npy is generated automatically in the same folder.
+    file_lists = list(Path("../../data/vecfox").rglob('*.wav'))
+    nums = len(file_lists)
+    for count, wav_path in enumerate(file_lists, start=1):
+        npy_content = get_vec_units(vec_model, str(wav_path), device).cpu().numpy()[0]
+        np.save(str(wav_path.with_suffix(".npy")), npy_content)
+        print(f"hubert process：{round(count * 100 / nums, 2)}%")
diff --git a/preprocessing/hubertinfer.py b/preprocessing/hubertinfer.py
@@ -1,22 +1,29 @@
 import os.path
-from pathlib import Path
 from io import BytesIO
+from pathlib import Path
+
 import numpy as np
 import torch
 
 from network.hubert.hubert_model import hubert_soft, get_units
+from network.hubert.vec_model import load_model, get_vec_units
 from utils.hparams import hparams
 
 
 class Hubertencoder():
     def __init__(self, pt_path='checkpoints/hubert/hubert_soft.pt'):
-        pt_path = list(Path(pt_path).parent.rglob('*.pt'))[0]
-        if 'hubert_gpu' in hparams.keys():
-            self.use_gpu = hparams['hubert_gpu']
+        if hparams['use_vec']:
+            pt_path = "checkpoints/vec/checkpoint_best_legacy_500.pt"
+            self.dev = torch.device("cuda")
+            self.hbt_model = load_model(pt_path)
         else:
-            self.use_gpu = True
-        self.dev = torch.device("cuda" if self.use_gpu and torch.cuda.is_available() else "cpu")
-        self.hbt_model = hubert_soft(str(pt_path))
+            pt_path = list(Path(pt_path).parent.rglob('*.pt'))[0]
+            if 'hubert_gpu' in hparams.keys():
+                self.use_gpu = hparams['hubert_gpu']
+            else:
+                self.use_gpu = True
+            self.dev = torch.device("cuda" if self.use_gpu and torch.cuda.is_available() else "cpu")
+            self.hbt_model = hubert_soft(str(pt_path))
 
     def encode(self, wav_path):
         if isinstance(wav_path, BytesIO):
@@ -26,6 +33,8 @@ def encode(self, wav_path):
             npy_path = Path(wav_path).with_suffix('.npy')
         if os.path.exists(npy_path):
             units = np.load(str(npy_path))
+        elif hparams['use_vec']:
+            units = get_vec_units(self.hbt_model, wav_path, self.dev).cpu().numpy()[0]
         else:
             units = get_units(self.hbt_model, wav_path, self.dev).cpu().numpy()[0]
         return units  # [T,256]
diff --git a/simplify.py b/simplify.py
@@ -0,0 +1,28 @@
+from argparse import ArgumentParser
+
+import torch
+
+
+def simplify_pth(pth_name, project_name):
+    """Write a slimmed-down copy of a training checkpoint.
+
+    Reads ``./checkpoints/{project_name}/{pth_name}`` and saves
+    ``./clean_{pth_name}`` in the current working directory. Only ``epoch``
+    and ``state_dict`` are carried over; ``global_step``,
+    ``checkpoint_callback_best``, ``optimizer_states`` and ``lr_schedulers``
+    are kept as keys but set to ``None``.
+
+    Args:
+        pth_name: File name of the checkpoint inside the project directory.
+        project_name: Name of the sub-directory under ``./checkpoints``.
+
+    Raises:
+        FileNotFoundError: If the source checkpoint does not exist.
+        KeyError: If the checkpoint lacks ``epoch`` or ``state_dict``.
+    """
+    model_path = f'./checkpoints/{project_name}'
+    # Map every tensor to the CPU: this is a pure file conversion, so it must
+    # also work on machines without the GPU the checkpoint was saved from.
+    checkpoint_dict = torch.load(f'{model_path}/{pth_name}', map_location='cpu')
+    slim_checkpoint = {
+        'epoch': checkpoint_dict['epoch'],
+        'state_dict': checkpoint_dict['state_dict'],
+        'global_step': None,
+        'checkpoint_callback_best': None,
+        'optimizer_states': None,
+        'lr_schedulers': None,
+    }
+    torch.save(slim_checkpoint, f'./clean_{pth_name}')
+
+
+def main():
+    """Command-line entry point: slim down ``model_ckpt_steps_<steps>.ckpt``.
+
+    Both options are mandatory; without them the checkpoint path would be
+    built from the literal text ``None``, so argparse rejects the call
+    up front instead.
+    """
+    parser = ArgumentParser()
+    parser.add_argument('--proj', type=str, required=True,
+                        help='project directory name under ./checkpoints')
+    parser.add_argument('--steps', type=str, required=True,
+                        help='step count used in the checkpoint file name')
+    args = parser.parse_args()
+    model_name = f"model_ckpt_steps_{args.steps}.ckpt"
+    simplify_pth(model_name, args.proj)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/training/config.yaml b/training/config.yaml
@@ -334,6 +334,7 @@ use_spk_embed: false
 use_spk_id: false
 use_split_spk_id: false
 use_uv: true
+use_vec: false
 use_var_enc: false
 val_check_interval: 2000
 valid_num: 0

diff --git a/training/config_nsf.yaml b/training/config_nsf.yaml
@@ -176,6 +176,7 @@ use_spk_embed: false
 use_spk_id: false
 use_split_spk_id: false
 use_uv: true
+use_vec: false
 use_var_enc: false
 val_check_interval: 2000
 valid_num: 0

diff --git a/trans_key.py b/trans_key.py
@@ -0,0 +1,61 @@
+# Note names of one octave in ascending order; move_key looks a note's name up here by index.
+head_list = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
+
+
+def trans_f0_seq(feature_pit, transform):
+    """Scale a pitch value by ``transform`` twelfths of an octave.
+
+    Returns ``feature_pit * 2 ** (transform / 12)`` rounded to one decimal.
+    """
+    ratio = 2 ** (transform / 12)
+    return round(feature_pit * ratio, 1)
+
+
+def move_key(raw_data, mv_key):
+    """Shift a note name such as ``"C#4"`` by ``mv_key`` steps in ``head_list``.
+
+    The last character of ``raw_data`` is read as the octave number and
+    everything before it as the note name. Stepping past either end of
+    ``head_list`` wraps around and adjusts the octave accordingly.
+
+    Raises:
+        ValueError: If the name part is not in ``head_list`` or the last
+            character is not a digit.
+    """
+    name, octave = raw_data[:-1], int(raw_data[-1])
+    # Floor division carries whole-octave overflow in either direction.
+    octave_shift, position = divmod(head_list.index(name) + mv_key, 12)
+    return head_list[position] + str(octave + octave_shift)
+
+
+def trans_key(raw_data, key):
+    """Transpose the ``note_seq`` and ``f0_seq`` of every entry by ``key``.
+
+    Each entry of ``raw_data`` is a mapping whose ``"note_seq"`` and
+    ``"f0_seq"`` values are space-separated strings. Note names go through
+    ``move_key`` (``"rest"`` tokens are left untouched) and pitch values
+    through ``trans_f0_seq``. Entries are modified in place.
+
+    Returns:
+        The same ``raw_data`` object that was passed in.
+    """
+    for entry in raw_data:
+        entry["note_seq"] = " ".join(
+            token if token == "rest" else move_key(token, key)
+            for token in entry["note_seq"].split(" ")
+        )
+        entry["f0_seq"] = " ".join(
+            str(trans_f0_seq(float(value), key))
+            for value in entry["f0_seq"].split(" ")
+        )
+    return raw_data
+
+
+# Script part: copy result.txt to raw.txt, moving the notes in the fourth
+# "|"-separated field of every line by `key` steps.
+key = -6
+# Both handles live in one `with`, so raw.txt is flushed and closed even when
+# a malformed line raises part-way through. The output is still opened first,
+# as before.
+with open("raw.txt", "w", encoding='utf-8') as f_w, \
+        open("result.txt", "r", encoding='utf-8') as f:
+    for raw in f:
+        raw_list = raw.split("|")
+        new_note_seq_list = []
+        for note_seq in raw_list[3].split(" "):
+            if note_seq == "rest":
+                new_note_seq_list.append(note_seq)
+            else:
+                # Only the part before a "/" is kept and transposed.
+                new_note_seq_list.append(move_key(note_seq.split("/")[0], key))
+        raw_list[3] = " ".join(new_note_seq_list)
+        f_w.write("|".join(raw_list))