forked from mozilla/DeepSpeech
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: audio.py
68 lines (53 loc) · 2.66 KB
/
audio.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from __future__ import absolute_import, print_function
import scipy.io.wavfile as wav
import sys
try:
    from deepspeech.utils import audioToInputVector
except ImportError:
    import numpy as np
    from python_speech_features import mfcc
    from six.moves import range

    # One-shot flag for the deprecation warning printed below.
    # NOTE: the previous implementation stored this flag on a class named
    # ``DeprecationWarning``, which shadowed the builtin exception of the
    # same name at module scope; a plain module-level flag avoids that.
    _deprecation_warning_shown = False

    def audioToInputVector(audio, fs, numcep, numcontext):
        """Compute MFCC features with symmetric context windows.

        Deprecated pure-Python fallback, used only when libdeepspeech
        fails to import.

        Args:
            audio: 1-D array of audio samples.
            fs: sample rate of ``audio`` in Hz.
            numcep: number of MFCC cepstral coefficients per frame.
            numcontext: number of past and future frames appended to
                each time step.

        Returns:
            2-D float array of shape
            ``(num_strides, (2*numcontext + 1) * numcep)``, whitened to
            zero mean and unit variance.
        """
        global _deprecation_warning_shown
        if not _deprecation_warning_shown:
            _deprecation_warning_shown = True
            print('------------------------------------------------------------------------', file=sys.stderr)
            print('WARNING: libdeepspeech failed to load, resorting to deprecated code', file=sys.stderr)
            print('         Refer to README.md for instructions on installing libdeepspeech', file=sys.stderr)
            print('------------------------------------------------------------------------', file=sys.stderr)

        # Get mfcc coefficients
        features = mfcc(audio, samplerate=fs, numcep=numcep)

        # We only keep every second feature (BiRNN stride = 2)
        features = features[::2]

        # One stride per time step in the input
        num_strides = len(features)

        # Add empty initial and final contexts
        empty_context = np.zeros((numcontext, numcep), dtype=features.dtype)
        features = np.concatenate((empty_context, features, empty_context))

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2 * numcontext + 1
        train_inputs = np.lib.stride_tricks.as_strided(
            features,
            (num_strides, window_size, numcep),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        # Flatten the second and third dimensions
        train_inputs = np.reshape(train_inputs, [num_strides, -1])

        # Copy the strided view so that we can write to it safely, then
        # whiten the inputs (TODO: Should we whiten?)
        train_inputs = np.copy(train_inputs)
        train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)

        # Return results
        return train_inputs
def audiofile_to_input_vector(audio_filename, numcep, numcontext):
    r"""Convert a WAV file into the network's MFCC input representation.

    Reads the WAV audio at ``audio_filename``, computes ``numcep`` MFCC
    features at every 0.01s time step with a window length of 0.025s,
    and appends ``numcontext`` context frames to the left and right of
    each time step.

    Returns the resulting numpy array.
    """
    # Decode the WAV file into (sample rate, raw samples), then delegate
    # the feature computation.
    sample_rate, samples = wav.read(audio_filename)
    return audioToInputVector(samples, sample_rate, numcep, numcontext)