From f0f7a000f17101ce58a960e63b411402f84971b6 Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Mon, 1 Apr 2024 03:02:40 -0400 Subject: [PATCH] Sequence level normalization --- src/dump_features.c | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/src/dump_features.c b/src/dump_features.c index 120c21b3..5f9630ef 100644 --- a/src/dump_features.c +++ b/src/dump_features.c @@ -43,6 +43,7 @@ int lowpass = FREQ_SIZE; int band_lp = NB_BANDS; #define SEQUENCE_LENGTH 2000 +#define SEQUENCE_SAMPLES (SEQUENCE_LENGTH*FRAME_SIZE) static unsigned rand_lcg(unsigned *seed) { *seed = 1664525**seed + 1013904223; @@ -68,7 +69,7 @@ float xn[SEQUENCE_LENGTH*FRAME_SIZE]; int main(int argc, char **argv) { - int i; + int i, j; int count=0; static const float a_hp[2] = {-1.99599, 0.99600}; static const float b_hp[2] = {-2, 1}; @@ -115,6 +116,7 @@ int main(int argc, char **argv) { float Exp[NB_BANDS]; float features[NB_FEATURES]; float g[NB_BANDS]; + float speech_rms, noise_rms; if ((count%1000)==0) fprintf(stderr, "%d\r", count); speech_pos = (rand_lcg(&seed)*2.3283e-10)*speech_length; noise_pos = (rand_lcg(&seed)*2.3283e-10)*noise_length; @@ -131,8 +133,8 @@ int main(int argc, char **argv) { start_pos = IMIN(start_pos, SEQUENCE_LENGTH*FRAME_SIZE); RNN_CLEAR(speech16, start_pos); - speech_gain = pow(10., (-40+(rand()%60))/20.); - noise_gain = pow(10., (-30+(rand()%50))/20.); + speech_gain = pow(10., (-40+(rand()%55))/20.); + noise_gain = pow(10., (-30+(rand()%40))/20.); if (rand()%10==0) noise_gain = 0; noise_gain *= speech_gain; rand_resp(a_noise, b_noise); @@ -146,13 +148,12 @@ int main(int argc, char **argv) { } for (frame=0;frame 10*FRAME_SIZE) { + speech_rms = sqrt(speech_rms/(SEQUENCE_SAMPLES-start_pos)); + } else { + speech_rms = 3000; + } + if (speech_rms < 300) speech_rms = 300; + noise_rms = sqrt(noise_rms/SEQUENCE_SAMPLES); + + speech_gain *= 3000.f/(1+speech_rms); + noise_gain *= 3000.f/(1+noise_rms); + for (j=0;j