forked from dakenan1/Speech-measure-SDR-SAR-STOI-PESQ
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpesq.m
126 lines (95 loc) · 3.79 KB
/
pesq.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
function [pesq_mos]= pesq(ref_wav, deg_wav)
% ----------------------------------------------------------------------
% PESQ objective speech quality measure
%
% This function implements the PESQ measure based on the ITU standard
% P.862 [1].
%
%
% Usage: pval=pesq(cleanFile.wav, enhancedFile.wav)
%
% cleanFile.wav - clean input file in .wav format
% enhancedFile - enhanced output file in .wav format
% pval - PESQ value
%
% Note that the PESQ routine only supports sampling rates of 8 kHz and
% 16 kHz [1]
%
% Example call: pval = pesq ('sp04.wav','enhanced.wav')
%
%
% References:
% [1] ITU (2000). Perceptual evaluation of speech quality (PESQ), and
% objective method for end-to-end speech quality assessment of
% narrowband telephone networks and speech codecs. ITU-T
% Recommendation P. 862
%
% Authors: Yi Hu and Philipos C. Loizou
%
%
% Copyright (c) 2006 by Philipos C. Loizou
% $Revision: 0.0 $ $Date: 10/09/2006 $
% ----------------------------------------------------------------------
if nargin<2
fprintf('Usage: [pesq_mos]=pesq(cleanfile.wav,enhanced.wav) \n');
return;
end;
global Downsample DATAPADDING_MSECS SEARCHBUFFER Fs WHOLE_SIGNAL
global Align_Nfft Window
[ref_data,sampling_rate]= audioread( ref_wav);
if sampling_rate~=8000 && sampling_rate~=16000
error('Sampling frequency needs to be either 8000 or 16000 Hz');
end
setup_global( sampling_rate);
% Window= hann( Align_Nfft, 'periodic'); %Hanning window
% Window= Window';
TWOPI= 6.28318530717959;
%for count = 0: Align_Nfft- 1
% Window(1+ count) = 0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft));
%end
count=0:Align_Nfft- 1;
Window= 0.5 * (1.0 - cos((TWOPI * count) / Align_Nfft));
ref_data= ref_data';
ref_data= ref_data* 32768;
ref_Nsamples= length( ref_data)+ 2* SEARCHBUFFER* Downsample;
ref_data= [zeros( 1, SEARCHBUFFER* Downsample), ref_data, ...
zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];
[deg_data,~]= audioread( deg_wav);
deg_data= deg_data';
deg_data= deg_data* 32768;
deg_Nsamples= length( deg_data)+ 2* SEARCHBUFFER* Downsample;
deg_data= [zeros( 1, SEARCHBUFFER* Downsample), deg_data, ...
zeros( 1, DATAPADDING_MSECS* (Fs/ 1000)+ SEARCHBUFFER* Downsample)];
maxNsamples= max( ref_Nsamples, deg_Nsamples);
ref_data= fix_power_level( ref_data, ref_Nsamples, maxNsamples);
deg_data= fix_power_level( deg_data, deg_Nsamples, maxNsamples);
standard_IRS_filter_dB= [0, -200; 50, -40; 100, -20; 125, -12; 160, -6; 200, 0;...
250, 4; 300, 6; 350, 8; 400, 10; 500, 11; 600, 12; 700, 12; 800, 12;...
1000, 12; 1300, 12; 1600, 12; 2000, 12; 2500, 12; 3000, 12; 3250, 12;...
3500, 4; 4000, -200; 5000, -200; 6300, -200; 8000, -200];
ref_data= apply_filter( ref_data, ref_Nsamples, standard_IRS_filter_dB);
deg_data= apply_filter( deg_data, deg_Nsamples, standard_IRS_filter_dB);
%
% for later use in psychoacoustical model
model_ref= ref_data;
model_deg= deg_data;
[ref_data, deg_data]= input_filter( ref_data, ref_Nsamples, deg_data, ...
deg_Nsamples);
[ref_VAD, ref_logVAD]= apply_VAD( ref_data, ref_Nsamples);
[deg_VAD, deg_logVAD]= apply_VAD( deg_data, deg_Nsamples);
crude_align (ref_logVAD, ref_Nsamples, deg_logVAD, deg_Nsamples,...
WHOLE_SIGNAL);
utterance_locate (ref_data, ref_Nsamples, ref_VAD, ref_logVAD,...
deg_data, deg_Nsamples, deg_VAD, deg_logVAD);
ref_data= model_ref;
deg_data= model_deg;
% make ref_data and deg_data equal length
if (ref_Nsamples< deg_Nsamples)
newlen= deg_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
ref_data( newlen)= 0;
elseif (ref_Nsamples> deg_Nsamples)
newlen= ref_Nsamples+ DATAPADDING_MSECS* (Fs/ 1000);
deg_data( newlen)= 0;
end
pesq_mos= pesq_psychoacoustic_model (ref_data, ref_Nsamples, deg_data, ...
deg_Nsamples );