Skip to content

Commit

Permalink
Merge pull request prophesier#30 from IceKyrin/main
Browse files Browse the repository at this point in the history
fix cut_wav/load_json bug
  • Loading branch information
prophesier authored Nov 30, 2022
2 parents 88c73d8 + e4149f8 commit 3edb72e
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 77 deletions.
13 changes: 6 additions & 7 deletions infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@
from infer_tools.infer_tool import Svc
from utils.hparams import hparams

chunks_dict = infer_tool.read_temp("./infer_tools/chunks_temp.json")
chunks_dict = infer_tool.read_temp("./infer_tools/new_chunks_temp.json")


def run_clip(svc_model, key, acc, use_pe, use_crepe, thre, use_gt_mel, add_noise_step, project_name='', f_name=None,
file_path=None, out_path=None,**kwargs):
file_path=None, out_path=None, slice_db=-40,**kwargs):
print(f'code version:2022-11-23 v2')
use_pe = use_pe if hparams['audio_sample_rate'] == 24000 else False
if file_path is None:
Expand All @@ -33,25 +33,24 @@ def run_clip(svc_model, key, acc, use_pe, use_crepe, thre, use_gt_mel, add_noise
print("load chunks from temp")
chunks = chunks_dict[wav_hash]["chunks"]
else:
chunks = slicer.cut(wav_path)
chunks = slicer.cut(wav_path, db_thresh=slice_db)
chunks_dict[wav_hash] = {"chunks": chunks, "time": int(time.time())}
infer_tool.write_temp("./infer_tools/chunks_temp.json", chunks_dict)
infer_tool.write_temp("./infer_tools/new_chunks_temp.json", chunks_dict)
audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)

count = 0
f0_tst = []
f0_pred = []
audio = []
epsilon = 0.00002
for data in audio_data:
for (slice_tag, data) in audio_data:
print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
length = int(np.ceil(len(data) / audio_sr * hparams['audio_sample_rate']))
raw_path = io.BytesIO()
soundfile.write(raw_path, data, audio_sr, format="wav")
if hparams['debug']:
print(np.mean(data), np.var(data))
raw_path.seek(0)
if np.var(data) < epsilon:
if slice_tag:
print('jump empty segment')
_f0_tst, _f0_pred, _audio = (
np.zeros(int(np.ceil(length / hparams['hop_size']))), np.zeros(int(np.ceil(length / hparams['hop_size']))),
Expand Down
28 changes: 18 additions & 10 deletions infer_tools/infer_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
import os
import time
from io import BytesIO
from pathlib import Path

import librosa
import numpy as np
import soundfile
import torch

import utils
from pathlib import Path
from modules.fastspeech.pe import PitchExtractor
from network.diff.candidate_decoder import FFT
from network.diff.diffusion import GaussianDiffusion
Expand All @@ -21,22 +21,30 @@
from utils.hparams import hparams, set_hparams
from utils.pitch_utils import denorm_f0, norm_interp_f0

if os.path.exists("chunks_temp.json"):
os.remove("chunks_temp.json")


def read_temp(file_name, max_bytes=50 * 1024 * 1024, max_age=14 * 24 * 3600):
    """Load the JSON cache stored at ``file_name``.

    If the file does not exist it is created containing the
    ``{"info": "temp_dict"}`` marker and an empty dict is returned.

    If the file cannot be read, is not valid JSON, or does not hold a JSON
    object, a message is printed and a fresh ``{"info": "temp_dict"}`` dict
    is returned so the caller can rebuild the cache.

    When the file on disk is larger than ``max_bytes``, entries whose
    ``"time"`` stamp is more than ``max_age`` seconds old (or is missing or
    unparsable) are dropped from the returned dict. Values that are not
    dicts -- such as the ``"info"`` marker -- are never treated as cache
    entries and are left untouched.

    Args:
        file_name: Path of the JSON cache file.
        max_bytes: File size above which old entries are pruned.
        max_age: Maximum age, in seconds, of an entry kept while pruning.

    Returns:
        The cache dict (possibly pruned), ``{}`` for a newly created file,
        or ``{"info": "temp_dict"}`` after a read/parse failure.
    """
    if not os.path.exists(file_name):
        with open(file_name, "w") as f:
            f.write(json.dumps({"info": "temp_dict"}))
        return {}

    try:
        with open(file_name, "r") as f:
            data_dict = json.loads(f.read())
        oversized = os.path.getsize(file_name) > max_bytes
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        print(e)
        print(f"{file_name} error,auto rebuild file")
        return {"info": "temp_dict"}

    if not isinstance(data_dict, dict):
        print(f"{file_name} error,auto rebuild file")
        return {"info": "temp_dict"}

    if oversized:
        f_name = file_name.split("/")[-1]
        print(f"clean {f_name}")
        now = int(time.time())
        # Snapshot the keys: entries are deleted while walking the dict.
        for wav_hash in list(data_dict):
            entry = data_dict[wav_hash]
            if not isinstance(entry, dict):
                # The "info" marker maps to a plain string; indexing it with
                # ["time"] used to raise and wipe the whole cache.
                continue
            try:
                expired = now - int(entry["time"]) > max_age
            except (KeyError, TypeError, ValueError):
                # An entry without a usable timestamp can never age out.
                expired = True
            if expired:
                del data_dict[wav_hash]
    return data_dict


Expand Down
80 changes: 20 additions & 60 deletions infer_tools/slicer.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import os.path
import time
from argparse import ArgumentParser

import numpy as np
import soundfile
import torch
import torchaudio
from scipy.ndimage import maximum_filter1d, uniform_filter1d
Expand Down Expand Up @@ -107,59 +104,25 @@ def slice(self, audio):
split_loc_l = split_win_l + np.argmin(abs_amp[split_win_l: split_win_l + self.win_sn])
sil_tags.append((split_loc_l, samples.shape[0]))
if len(sil_tags) == 0:
return [len(audio)]
return {0: {"slice": False, "split_time": (0, len(audio))}}
else:
chunks = []
# 第一段静音并非从头开始,补上有声片段
if sil_tags[0][0]:
chunks.append({"slice": False, "split_time": f"0,{sil_tags[0][0]}"})
for i in range(0, len(sil_tags)):
chunks.append(int((sil_tags[i][0] + sil_tags[i][1]) / 2))
return chunks


def main():
parser = ArgumentParser()
parser.add_argument('audio', type=str, help='The audio to be sliced')
parser.add_argument('--out_name', type=str, help='Output directory of the sliced audio clips')
parser.add_argument('--out', type=str, help='Output directory of the sliced audio clips')
parser.add_argument('--db_thresh', type=float, required=False, default=-40,
help='The dB threshold for silence detection')
parser.add_argument('--min_len', type=int, required=False, default=5000,
help='The minimum milliseconds required for each sliced audio clip')
parser.add_argument('--win_l', type=int, required=False, default=300,
help='Size of the large sliding window, presented in milliseconds')
parser.add_argument('--win_s', type=int, required=False, default=20,
help='Size of the small sliding window, presented in milliseconds')
parser.add_argument('--max_sil_kept', type=int, required=False, default=500,
help='The maximum silence length kept around the sliced audio, presented in milliseconds')
args = parser.parse_args()
out = args.out
if out is None:
out = os.path.dirname(os.path.abspath(args.audio))
audio, sr = torchaudio.load(args.audio)
if len(audio.shape) == 2 and audio.shape[1] >= 2:
audio = torch.mean(audio, dim=0).unsqueeze(0)
audio = audio.cpu().numpy()[0]

slicer = Slicer(
sr=sr,
db_threshold=args.db_thresh,
min_length=args.min_len,
win_l=args.win_l,
win_s=args.win_s,
max_silence_kept=args.max_sil_kept
)
chunks = slicer.slice(audio)
if not os.path.exists(args.out):
os.makedirs(args.out)
start = 0
end_id = 0
for i, chunk in enumerate(chunks):
end = chunk
soundfile.write(os.path.join(out, f'%s-%s.wav' % (args.out_name, str(i).zfill(2))), audio[start:end], sr)
start = end
end_id = i + 1
if start != len(audio):
soundfile.write(os.path.join(out, f'%s-%s.wav' % (args.out_name, str(end_id).zfill(2))),
audio[start:len(audio)], sr)
# 标识有声片段(跳过第一段)
if i:
chunks.append({"slice": False, "split_time": f"{sil_tags[i - 1][1]},{sil_tags[i][0]}"})
# 标识所有静音片段
chunks.append({"slice": True, "split_time": f"{sil_tags[i][0]},{sil_tags[i][1]}"})
# 最后一段静音并非结尾,补上结尾片段
if sil_tags[-1][1] != len(audio):
chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1]},{len(audio)}"})
chunk_dict = {}
for i in range(len(chunks)):
chunk_dict[str(i)] = chunks[i]
return chunk_dict


def cut(audio_path, db_thresh=-30, min_len=5000, win_l=300, win_s=20, max_sil_kept=500):
Expand All @@ -181,18 +144,15 @@ def cut(audio_path, db_thresh=-30, min_len=5000, win_l=300, win_s=20, max_sil_ke


def chunks2audio(audio_path, chunks):
    """Cut the audio at ``audio_path`` into the segments described by ``chunks``.

    Args:
        audio_path: Path of the audio file, loaded with ``torchaudio.load``.
        chunks: Mapping of chunk id to a dict holding ``"slice"`` (the tag
            stored with the chunk) and ``"split_time"`` (the start and end
            sample offsets). ``split_time`` may be a ``"start,end"`` string
            or a two-item sequence; both forms appear in this file.

    Returns:
        A ``(result, sr)`` tuple where ``result`` is a list of
        ``(slice_tag, samples)`` pairs in the iteration order of ``chunks``
        and ``sr`` is the sample rate reported by ``torchaudio.load``.
    """
    chunks = dict(chunks)
    audio, sr = torchaudio.load(audio_path)
    # NOTE(review): this tests shape[1]; if the tensor is (channels, frames)
    # the channel count is shape[0] -- confirm. Averaging over dim 0 leaves a
    # single-channel signal unchanged, so the down-mix is harmless either way.
    if len(audio.shape) == 2 and audio.shape[1] >= 2:
        audio = torch.mean(audio, dim=0).unsqueeze(0)
    audio = audio.cpu().numpy()[0]
    result = []
    for chunk in chunks.values():
        split_time = chunk["split_time"]
        if isinstance(split_time, str):
            split_time = split_time.split(",")
        # A tuple (or a list, once the chunk dict has been through JSON) is
        # used as-is; calling .split on it used to raise AttributeError.
        start, end = int(split_time[0]), int(split_time[1])
        result.append((chunk["slice"], audio[start:end]))
    return result, sr


Expand Down

0 comments on commit 3edb72e

Please sign in to comment.