"""Creates SequenceExamples and stores them in TFRecords format.
Computes spectral features from raw audio waveforms and groups the audio into
multiple TFRecords files based on their length. The utterances are stored in
sorted order based on length to allow for sorta-grad implementation.
Note:
This script can take a few hours to run to compute and store the mfcc
features on the 100 hour Librispeech dataset.
"""
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
import tensorflow as tf
from python_speech_features import mfcc
from tqdm import tqdm


def compute_mfcc(audio_data, sample_rate):
    ''' Computes the mel-frequency cepstral coefficients.

    The audio time series is normalised before its MFCC features are
    computed.

    Args:
        audio_data: time series of the speech utterance.
        sample_rate: sampling rate.
    Returns:
        mfcc_feat: [num_frames x F] matrix representing the MFCC features.
    '''
    # Normalise to zero mean and unit peak amplitude.
    audio_data = audio_data - np.mean(audio_data)
    audio_data = audio_data / np.max(np.abs(audio_data))
    mfcc_feat = mfcc(audio_data, sample_rate, winlen=0.025, winstep=0.01,
                     numcep=13, nfilt=26, nfft=512, lowfreq=0, highfreq=None,
                     preemph=0.97, ceplifter=22, appendEnergy=True)
    return mfcc_feat
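

# Quick shape check (hypothetical synthetic tone, not part of the pipeline):
#
#     rate = 16000
#     tone = np.sin(2 * np.pi * 440 * np.arange(2 * rate) / rate)
#     compute_mfcc(tone, rate).shape   # -> (198, 13)
#
# winstep=0.01 at 16 kHz is ~100 frames/s, so a 2 s clip gives
# 1 + (32000 - 400) // 160 = 198 frames of numcep=13 coefficients.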


def make_example(seq_len, spec_feat, labels):
    ''' Creates a SequenceExample for a single utterance.

    This function makes a SequenceExample given the sequence length,
    MFCC features and the corresponding transcript.
    These sequence examples are read using tf.parse_single_sequence_example
    during training.

    Note: Some of the tf modules used in this function (such as
    tf.train.Feature) do not have comprehensive documentation in v0.12.
    This function was put together using the test routines in the
    tensorflow repo.
    See: https://github.com/tensorflow/tensorflow/
    blob/246a3724f5406b357aefcad561407720f5ccb5dc/
    tensorflow/python/kernel_tests/parsing_ops_test.py

    Args:
        seq_len: integer representing the sequence length in time frames.
        spec_feat: [TxF] matrix of MFCC features.
        labels: list of ints representing the encoded transcript.
    Returns:
        Serialized sequence example.
    '''
    # Feature lists for the sequential features of the example
    feats_list = [tf.train.Feature(float_list=tf.train.FloatList(value=frame))
                  for frame in spec_feat]
    feat_dict = {"feats": tf.train.FeatureList(feature=feats_list)}
    sequence_feats = tf.train.FeatureLists(feature_list=feat_dict)
    # Context features for the entire sequence
    len_feat = tf.train.Feature(int64_list=tf.train.Int64List(value=[seq_len]))
    label_feat = tf.train.Feature(int64_list=tf.train.Int64List(value=labels))
    context_feats = tf.train.Features(feature={"seq_len": len_feat,
                                               "labels": label_feat})
    ex = tf.train.SequenceExample(context=context_feats,
                                  feature_lists=sequence_feats)
    return ex.SerializeToString()
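

# A minimal sketch, not called by this script, of how the training side
# might read these records back with the v0.12-era parsing API referenced
# above. The helper name parse_example is illustrative, and the [13] frame
# shape assumes numcep=13 as in compute_mfcc.
def parse_example(serialized_example):
    ''' Parses one serialized SequenceExample written by make_example. '''
    context, sequence = tf.parse_single_sequence_example(
        serialized_example,
        context_features={
            "seq_len": tf.FixedLenFeature([], dtype=tf.int64),
            "labels": tf.VarLenFeature(dtype=tf.int64)},
        sequence_features={
            "feats": tf.FixedLenSequenceFeature([13], dtype=tf.float32)})
    return context["seq_len"], context["labels"], sequence["feats"]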


def process_data(partition):
    """ Reads audio waveforms and transcripts from a dataset partition
    and generates MFCC features.

    Args:
        partition: represents the dataset partition name.
    Returns:
        feats: dict containing MFCC features per utterance.
        transcripts: dict of lists representing the encoded transcripts.
        utt_len: dict of ints holding the sequence length of each
            utterance in time frames.
    """
    feats = {}
    transcripts = {}
    utt_len = {}  # Required for sorting the utterances based on length
    for filename in glob.iglob(partition + '/**/*.txt', recursive=True):
        with open(filename, 'r') as file:
            for line in file:
                parts = line.split()
                audio_file = parts[0]
                file_path = os.path.join(os.path.dirname(filename),
                                         audio_file + '.flac')
                audio, sample_rate = sf.read(file_path)
                feats[audio_file] = compute_mfcc(audio, sample_rate)
                # Plotting every utterance would open thousands of figures;
                # uncomment only to inspect a single file while debugging.
                # plot_audio(audio, feats[audio_file])
                utt_len[audio_file] = feats[audio_file].shape[0]
                target = ' '.join(parts[1:])
                transcripts[audio_file] = [CHAR_TO_IX[i] for i in target]
    return feats, transcripts, utt_len
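

# For reference, each LibriSpeech transcript file lists one utterance per
# line, utterance id first, e.g. (from dev-clean):
#
#     1272-128104-0000 MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES
#
# so parts[0] doubles as the .flac basename and the remainder is the
# uppercase transcript that CHAR_TO_IX encodes.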


def plot_audio(audio, mfcc_feat):
    ''' Plots the raw waveform and its MFCC features for visual inspection. '''
    fig, axs = plt.subplots(2, 1, figsize=(10, 5))
    axs[0].plot(audio)
    axs[0].set_title('Raw Audio Signal')
    axs[1].plot(mfcc_feat)
    axs[1].set_title('Mel-frequency cepstral coefficients')
    plt.show()


def create_records():
    """ Pre-processes the raw audio and generates TFRecords.

    This function computes the MFCC features, encodes string transcripts
    into integers, and generates sequence examples for each utterance.
    Multiple sequence records are then written into TFRecord files.
    """
    for partition in sorted(glob.glob(AUDIO_PATH + '/*')):
        print('Processing ' + partition)
        feats, transcripts, utt_len = process_data(partition)
        sorted_utts = sorted(utt_len, key=utt_len.get)
        # Bin into groups of 100 frames.
        max_t = int(utt_len[sorted_utts[-1]] / 100)
        min_t = int(utt_len[sorted_utts[0]] / 100)

        # Create destination directory
        write_dir = ('../data/librispeech/processed/' +
                     os.path.basename(partition))
        if tf.gfile.Exists(write_dir):
            tf.gfile.DeleteRecursively(write_dir)
        tf.gfile.MakeDirs(write_dir)

        if os.path.basename(partition) == 'train-clean-100':
            # Create multiple TFRecords based on utterance length for training
            writer = {}
            count = {}
            print('Processing training files...')
            for i in range(min_t, max_t + 1):
                filename = os.path.join(write_dir,
                                        'train_' + str(i) + '.tfrecords')
                writer[i] = tf.python_io.TFRecordWriter(filename)
                count[i] = 0
            for utt in tqdm(sorted_utts):
                example = make_example(utt_len[utt], feats[utt].tolist(),
                                       transcripts[utt])
                index = int(utt_len[utt] / 100)
                writer[index].write(example)
                count[index] += 1
            for i in range(min_t, max_t + 1):
                writer[i].close()
            print(count)
            # Remove bins which have fewer than 20 utterances
            for i in range(min_t, max_t + 1):
                if count[i] < 20:
                    os.remove(os.path.join(write_dir,
                                           'train_' + str(i) + '.tfrecords'))
        else:
            # Create a single TFRecord for the dev and test partitions
            filename = os.path.join(write_dir, os.path.basename(write_dir) +
                                    '.tfrecords')
            print('Creating', filename)
            record_writer = tf.python_io.TFRecordWriter(filename)
            for utt in sorted_utts:
                example = make_example(utt_len[utt], feats[utt].tolist(),
                                       transcripts[utt])
                record_writer.write(example)
            record_writer.close()
        print('Processed ' + str(len(sorted_utts)) + ' audio files')
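

# Bucketing arithmetic, for reference: with winstep=0.01 (about 100 frames
# per second), a hypothetical 12.4 s utterance has roughly 1240 frames and
# lands in bucket int(1240 / 100) = 12, i.e. train_12.tfrecords; buckets
# holding fewer than 20 utterances are deleted afterwards.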


# AUDIO_PATH is the directory that contains the LibriSpeech data,
# partitioned into three folders: dev-clean, train-clean-100, test-clean.
AUDIO_PATH = '../data/librispeech/audio/'
ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ' "
CHAR_TO_IX = {ch: i for (i, ch) in enumerate(ALPHABET)}
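# Encoding example: 'A' -> 0 ... 'Z' -> 25, "'" -> 26, ' ' -> 27, so the
# transcript 'HI THERE' becomes [7, 8, 27, 19, 7, 4, 17, 4].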

if __name__ == '__main__':
    create_records()