forked from mozilla/DeepSpeech
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdeepspeech_utils.cc
95 lines (82 loc) · 2.64 KB
/
deepspeech_utils.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#include "deepspeech_utils.h"
#include "c_speech_features.h"
#include <stdlib.h>
// Fixed feature-extraction settings. Each macro below is forwarded verbatim to
// csf_mfcc() by audioToInputVector(); the exact meaning of every slot is defined
// by c_speech_features.h, so the descriptions here are hedged where the call
// site alone does not establish them.

// Presumably the pre-emphasis filter coefficient -- TODO confirm against csf_mfcc().
#define COEFF 0.97f
// Analysis window length; presumably in seconds (i.e. 25 ms) -- TODO confirm units.
#define WIN_LEN 0.025f
// Step between successive windows; presumably in seconds (i.e. 10 ms) -- TODO confirm units.
#define WIN_STEP 0.01f
// FFT size handed to csf_mfcc().
#define N_FFT 512
// Number of filters handed to csf_mfcc() (presumably mel filterbank channels).
#define N_FILTERS 26
// Lower frequency bound handed to csf_mfcc(); the upper bound is passed as
// aSampleRate/2 at the call site.
#define LOWFREQ 0
// Presumably the cepstral liftering coefficient -- TODO confirm against csf_mfcc().
#define CEP_LIFTER 22
namespace DeepSpeech {
/**
 * Builds a whitened, context-augmented feature matrix from an audio buffer.
 *
 * MFCC features are obtained from csf_mfcc(). Every second MFCC frame is kept
 * and laid out as one output row of
 *   [aNContext past frames | current frame | aNContext future frames],
 * where the neighbouring frames are also taken with a stride of two. Context
 * slots that fall before the first or after the last MFCC frame stay zero.
 * The complete matrix is then shifted to zero mean and scaled by its standard
 * deviation (the scaling is skipped when the deviation is not positive, so a
 * constant matrix yields zeros instead of NaN).
 *
 * @param aBuffer     Audio samples, forwarded unchanged to csf_mfcc().
 * @param aBufferSize Length of aBuffer as expected by csf_mfcc().
 * @param aSampleRate Sample rate forwarded to csf_mfcc(); aSampleRate/2 is
 *                    passed as the upper frequency bound.
 * @param aNCep       Number of cepstral coefficients per MFCC frame.
 * @param aNContext   Number of context frames on each side of the current one.
 * @param aMfcc       Optional out: receives the row-major matrix of
 *                    (*aNFrames x *aFrameLen) floats. The buffer comes from
 *                    calloc() and must be released by the caller with free().
 *                    Set to NULL when no buffer could be allocated. When this
 *                    argument is NULL the matrix is freed internally.
 * @param aNFrames    Optional out: number of rows (0 if allocation failed).
 * @param aFrameLen   Optional out: row length, aNCep * (2 * aNContext + 1).
 */
void
audioToInputVector(const short* aBuffer, unsigned int aBufferSize,
                   int aSampleRate, int aNCep, int aNContext, float** aMfcc,
                   int* aNFrames, int* aFrameLen)
{
  const int contextSize = aNCep * aNContext;
  const int frameSize = aNCep + (2 * aNCep * aNContext);

  // Compute MFCC features. Start from NULL so the free() calls below stay
  // well defined even if csf_mfcc() leaves the output pointer untouched.
  float* mfcc = NULL;
  const int n_frames = csf_mfcc(aBuffer, aBufferSize, aSampleRate,
                                WIN_LEN, WIN_STEP, aNCep, N_FILTERS, N_FFT,
                                LOWFREQ, aSampleRate/2, COEFF, CEP_LIFTER, 1, NULL,
                                &mfcc);

  // Take every other frame (BiRNN stride of 2) and add past/future context
  const int ds_input_length = (n_frames + 1) / 2;
  const int n_values = ds_input_length * frameSize;

  // TODO: Use MFCC of silence instead of zero
  float* ds_input = static_cast<float*>(calloc(n_values, sizeof(float)));
  if (ds_input == NULL) {
    // Either the allocation failed or a zero-sized request returned NULL.
    // Report an empty result rather than writing through a null pointer.
    free(mfcc);
    if (aMfcc) {
      *aMfcc = NULL;
    }
    if (aNFrames) {
      *aNFrames = 0;
    }
    if (aFrameLen) {
      *aFrameLen = frameSize;
    }
    return;
  }

  for (int i = 0, idx = 0, mfcc_idx = 0; i < ds_input_length;
       i++, idx += frameSize, mfcc_idx += aNCep * 2) {
    // Past context: slot 0 holds the oldest frame (i - aNContext).
    for (int j = aNContext; j > 0; j--) {
      const int frame_index = (i - j) * 2;
      if (frame_index < 0) { continue; }  // before the signal: keep zeros
      const int mfcc_base = frame_index * aNCep;
      const int base = (aNContext - j) * aNCep;
      for (int k = 0; k < aNCep; k++) {
        ds_input[idx + base + k] = mfcc[mfcc_base + k];
      }
    }

    // Present context
    for (int j = 0; j < aNCep; j++) {
      ds_input[idx + j + contextSize] = mfcc[mfcc_idx + j];
    }

    // Future context: later offsets only move further past the end, so the
    // first out-of-range frame ends the loop.
    for (int j = 1; j <= aNContext; j++) {
      const int frame_index = (i + j) * 2;
      if (frame_index >= n_frames) { break; }
      const int mfcc_base = frame_index * aNCep;
      const int base = contextSize + aNCep + ((j - 1) * aNCep);
      for (int k = 0; k < aNCep; k++) {
        ds_input[idx + base + k] = mfcc[mfcc_base + k];
      }
    }
  }

  // Free mfcc array
  free(mfcc);

  // Whiten inputs (TODO: Should we whiten)
  const double n_inputs = static_cast<double>(n_values);

  double mean = 0.0;
  for (int idx = 0; idx < n_values; idx++) {
    mean += ds_input[idx] / n_inputs;
  }

  double stddev = 0.0;
  for (int idx = 0; idx < n_values; idx++) {
    const double delta = ds_input[idx] - mean;
    stddev += (delta * delta) / n_inputs;
  }
  stddev = sqrt(stddev);

  // A non-positive deviation (constant or empty matrix) would turn every
  // value into NaN/Inf; in that case only remove the mean.
  const double scale = (stddev > 0.0) ? stddev : 1.0;
  for (int idx = 0; idx < n_values; idx++) {
    ds_input[idx] = static_cast<float>((ds_input[idx] - mean) / scale);
  }

  if (aMfcc) {
    *aMfcc = ds_input;
  } else {
    // Nobody takes ownership of the matrix, so release it here.
    free(ds_input);
  }
  if (aNFrames) {
    *aNFrames = ds_input_length;
  }
  if (aFrameLen) {
    *aFrameLen = frameSize;
  }
}
}