-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsdr-webUI.py
194 lines (152 loc) · 7.73 KB
/
sdr-webUI.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import os
import time
import numpy as np
import librosa
from scipy.signal import correlate
from mir_eval.separation import bss_eval_sources
import gradio as gr
from tabulate import tabulate
from concurrent.futures import ProcessPoolExecutor
# from colorama import just_fix_windows_console
# just_fix_windows_console()
def explain_bss_eval_sources_requirements():
return """
<h3>什么是参考音频 (reference_audio)?</h3>
<p>参考音频是从混音工程直接导出的目标声音,是我们期望的分离后理想的音频结果。</p>
<ul>
<li><strong>人声分离模型</strong>:参考音频应该是没有伴奏的人声。</li>
<li><strong>和声分离模型</strong>:参考音频应该是没有伴奏的主唱人声,不包含和声。</li>
<li><strong>混响分离模型</strong>:参考音频应该是没有混响的干净人声。</li>
<li><strong>降噪模型</strong>:参考音频应该是没有噪声的干净的目标声音。</li>
</ul>
<h3>什么是估计音频 (estimated_audio)?</h3>
<p>估计音频是分离模型输出的音频结果,用来计算和理想的参考音频相似程度以确定分离质量。</p>
<ul>
<li><strong>人声分离模型</strong>:通常输入模型的是整首歌,取其分离结果。</li>
<li><strong>和声分离模型</strong>:通常输入模型的是包含和声的人声,取其分离结果。</li>
<li><strong>混响分离模型</strong>:通常输入模型的是带有混响的人声,取其分离结果。</li>
<li><strong>降噪模型</strong>:通常输入模型的是带有噪声的目标声音。</li>
</ul>
<h3>其他重要提示</h3>
<ul>
<li><strong>保证音频质量</strong>:参考音频和输入分离模型的音频不可使用分离出的音频。</li>
<li><strong>保证音频对应</strong>:参考音频和输入分离模型的音频应来自同一个混音工程的不同导出。</li>
</ul>
"""
def print_bss_eval_descriptions():
return """
<h3>各参数含义</h3>
<ol>
<li><strong>SDR (Source-to-Distortion Ratio) - 信号失真比</strong>
<p><br> 含义: 衡量源信号的总失真程度,SDR 值越大,分离后的信号越接近原始信号,表示分离质量越好。</p></li>
<li><strong>SIR (Source-to-Interference Ratio) - 信号干扰比</strong>
<p><br> 含义: 衡量目标源与干扰源的分离效果,SIR 值越大,说明对干扰源的抑制越好,如分离人声时器乐分离效果越好。</p></li>
<li><strong>SAR (Source-to-Artifacts Ratio) - 信号伪影比</strong>
<p><br> 含义: 衡量分离信号中由算法引入的伪影程度,SAR 值越大,说明算法引入的伪影越少,分离结果越好。</p></li>
</ol>
"""
def read_and_resample_audio(ref_path, est_paths):
ref_audio, ref_sr = librosa.load(ref_path, sr=None, mono=False)
est_audios = []
est_srs = []
for est_path in est_paths:
est_audio, est_sr = librosa.load(est_path, sr=None, mono=False)
est_audios.append(est_audio)
est_srs.append(est_sr)
target_sr = max([ref_sr] + est_srs)
if ref_sr != target_sr:
ref_audio = librosa.resample(ref_audio, orig_sr=ref_sr, target_sr=target_sr)
for i in range(len(est_audios)):
if est_srs[i] != target_sr:
est_audios[i] = librosa.resample(est_audios[i], orig_sr=est_srs[i], target_sr=target_sr)
ref_audio = ref_audio if ref_audio.ndim == 2 else np.vstack([ref_audio] * 2)
est_audios = [audio if audio.ndim == 2 else np.vstack([audio] * 2) for audio in est_audios]
return ref_audio, est_audios, target_sr
def align_audio(ref_audio, est_audio, sample_rate, est_path, window_size=None):
est_name = os.path.basename(est_path)
ref_channel = ref_audio[0] if ref_audio.ndim == 2 else ref_audio
est_channel = est_audio[0] if est_audio.ndim == 2 else est_audio
if window_size is None:
window_size = 60 * sample_rate
if len(ref_channel) > window_size:
ref_channel = ref_channel[:window_size]
if len(est_channel) > window_size:
est_channel = est_channel[:window_size]
correlation = correlate(est_channel, ref_channel, mode='full')
lag = np.argmax(correlation) - len(ref_channel) + 1
if lag > 0:
aligned_ref_audio = ref_audio[:, :len(ref_audio[0]) - lag]
aligned_est_audio = est_audio[:, lag:]
elif lag < 0:
aligned_ref_audio = ref_audio[:, -lag:]
aligned_est_audio = est_audio[:, :len(est_audio[0]) + lag]
else:
aligned_ref_audio = ref_audio
aligned_est_audio = est_audio
min_len = min(len(aligned_ref_audio[0]), len(aligned_est_audio[0]))
aligned_ref_audio = aligned_ref_audio[:, :min_len]
aligned_est_audio = aligned_est_audio[:, :min_len]
return aligned_ref_audio, aligned_est_audio, est_name
def compute_sdr(ref_audio, est_audio, est_name):
sdr, sir, sar, _ = bss_eval_sources(ref_audio, est_audio)
return sdr, sir, sar, est_name
def process_audio(ref_path, est_paths, num_workers=4):
results = []
futures = []
ref_audio, est_audios, max_sr = read_and_resample_audio(ref_path, est_paths)
with ProcessPoolExecutor(max_workers=num_workers) as executor:
for i, est_audio in enumerate(est_audios):
aligned_ref_audio, aligned_est_audio, est_name = align_audio(ref_audio, est_audio, max_sr, est_paths[i])
future = executor.submit(compute_sdr, aligned_ref_audio, aligned_est_audio, est_name)
futures.append(future)
for future in futures:
try:
sdr, sir, sar, est_name = future.result()
results.append((est_name, sdr, sir, sar))
except Exception as exc:
print(f"Error: {exc}")
return results
def print_estimation_results(results):
best_sdr_idx = np.argmax([np.mean(result[1]) for result in results])
table_data = []
for idx, (est_name, sdr, sir, sar) in enumerate(results):
row = [
est_name,
f"{sdr[0]:.2f} / {sdr[1]:.2f}",
f"{sir[0]:.2f} / {sir[1]:.2f}",
f"{sar[0]:.2f} / {sar[1]:.2f}"
]
table_data.append(row)
headers = ["Estimated Audio", "SDR (L / R)", "SIR (L / R)", "SAR (L / R)"]
table = tabulate(table_data, headers=headers, tablefmt="html")
return table
def sdr_eval_ui(ref_audio_file, est_audio_files):
ref_path = ref_audio_file.name
est_paths = [f.name for f in est_audio_files]
results = process_audio(ref_path, est_paths)
result_table = print_estimation_results(results)
return result_table
with gr.Blocks() as demo:
gr.Markdown("# SDR计算小工具 v0.1")
gr.Markdown("#### GitHub 地址: https://github.com/AliceNavigator/sdr-eval | by 领航员未鸟")
with gr.Row():
with gr.Column(scale=1):
gr.HTML(explain_bss_eval_sources_requirements())
gr.HTML("<br>")
with gr.Column(scale=1):
gr.HTML("<hr>")
gr.HTML(print_bss_eval_descriptions())
gr.Markdown("### 请上传音频文件进行SDR评估")
with gr.Row():
with gr.Column():
gr.Markdown("#### 上传参考音频 (Reference Audio):")
ref_audio = gr.File(label="Upload Reference Audio", file_types=["audio"])
with gr.Column():
gr.Markdown("#### 上传估计音频 (Estimated Audio):")
est_audio = gr.File(label="Upload Estimated Audio", file_count="multiple", file_types=["audio"])
run_btn = gr.Button("Run SDR Evaluation")
gr.Markdown("### 评估结果")
result_display = gr.HTML()
run_btn.click(fn=sdr_eval_ui, inputs=[ref_audio, est_audio], outputs=result_display)
if __name__ == "__main__":
demo.launch(inbrowser=True)