"""Creates SequenceExamples and stores them in TFRecords format.
Computes spectral features from raw audio waveforms and groups the audio into
multiple TFRecords files based on their length. The utterances are stored in
sorted order based on length to allow for sorta-grad implementation.
Note:
This script can take a few hours to run to compute and store the mfcc
features on the 100 hour Librispeech dataset.
"""
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
import tensorflow as tf
from python_speech_features import mfcc
from tqdm import tqdm


def compute_mfcc(audio_data, sample_rate):
    ''' Computes the mel-frequency cepstral coefficients.

    The audio time series is normalised before its MFCC features are
    computed.

    Args:
        audio_data: time series of the speech utterance.
        sample_rate: sampling rate.
    Returns:
        mfcc_feat: [num_frames x F] matrix representing the MFCC features.
    '''
    # Normalise to zero mean and unit peak amplitude.
    audio_data = audio_data - np.mean(audio_data)
    audio_data = audio_data / np.max(np.abs(audio_data))
    mfcc_feat = mfcc(audio_data, sample_rate, winlen=0.025, winstep=0.01,
                     numcep=13, nfilt=26, nfft=512, lowfreq=0, highfreq=None,
                     preemph=0.97, ceplifter=22, appendEnergy=True)
    return mfcc_feat
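

# Quick shape check (hypothetical synthetic tone, not part of the pipeline):
#
#     rate = 16000
#     tone = np.sin(2 * np.pi * 440 * np.arange(2 * rate) / rate)
#     compute_mfcc(tone, rate).shape   # -> (198, 13)
#
# winstep=0.01 at 16 kHz is ~100 frames/s, so a 2 s clip gives
# 1 + (32000 - 400) // 160 = 198 frames of numcep=13 coefficients.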


def make_example(seq_len, spec_feat, labels):
    ''' Creates a SequenceExample for a single utterance.

    This function makes a SequenceExample given the sequence length,
    MFCC features and the corresponding transcript.
    These sequence examples are read using tf.parse_single_sequence_example
    during training.

    Note: Some of the tf modules used in this function (such as
    tf.train.Feature) do not have comprehensive documentation in v0.12.
    This function was put together using the test routines in the
    tensorflow repo.
    See: https://github.com/tensorflow/tensorflow/
    blob/246a3724f5406b357aefcad561407720f5ccb5dc/
    tensorflow/python/kernel_tests/parsing_ops_test.py

    Args:
        seq_len: integer representing the sequence length in time frames.
        spec_feat: [TxF] matrix of MFCC features.
        labels: list of ints representing the encoded transcript.
    Returns:
        Serialized sequence example.
    '''
    # Feature lists for the sequential features of the example
    feats_list = [tf.train.Feature(float_list=tf.train.FloatList(value=frame))
                  for frame in spec_feat]
    feat_dict = {"feats": tf.train.FeatureList(feature=feats_list)}
    sequence_feats = tf.train.FeatureLists(feature_list=feat_dict)
    # Context features for the entire sequence
    len_feat = tf.train.Feature(int64_list=tf.train.Int64List(value=[seq_len]))
    label_feat = tf.train.Feature(int64_list=tf.train.Int64List(value=labels))
    context_feats = tf.train.Features(feature={"seq_len": len_feat,
                                               "labels": label_feat})
    ex = tf.train.SequenceExample(context=context_feats,
                                  feature_lists=sequence_feats)
    return ex.SerializeToString()
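

# A minimal sketch, not called by this script, of how the training side
# might read these records back with the v0.12-era parsing API referenced
# above. The helper name parse_example is illustrative, and the [13] frame
# shape assumes numcep=13 as in compute_mfcc.
def parse_example(serialized_example):
    ''' Parses one serialized SequenceExample written by make_example. '''
    context, sequence = tf.parse_single_sequence_example(
        serialized_example,
        context_features={
            "seq_len": tf.FixedLenFeature([], dtype=tf.int64),
            "labels": tf.VarLenFeature(dtype=tf.int64)},
        sequence_features={
            "feats": tf.FixedLenSequenceFeature([13], dtype=tf.float32)})
    return context["seq_len"], context["labels"], sequence["feats"]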


def process_data(partition):
    """ Reads audio waveforms and transcripts from a dataset partition
    and generates MFCC features.

    Args:
        partition: represents the dataset partition name.
    Returns:
        feats: dict containing MFCC features per utterance.
        transcripts: dict of lists representing the encoded transcripts.
        utt_len: dict of ints holding the sequence length of each
            utterance in time frames.
    """
    feats = {}
    transcripts = {}
    utt_len = {}  # Required for sorting the utterances based on length
    for filename in glob.iglob(partition + '/**/*.txt', recursive=True):
        with open(filename, 'r') as file:
            for line in file:
                parts = line.split()
                audio_file = parts[0]
                file_path = os.path.join(os.path.dirname(filename),
                                         audio_file + '.flac')
                audio, sample_rate = sf.read(file_path)
                feats[audio_file] = compute_mfcc(audio, sample_rate)
                # Plotting every utterance would open thousands of figures;
                # uncomment only to inspect a single file while debugging.
                # plot_audio(audio, feats[audio_file])
                utt_len[audio_file] = feats[audio_file].shape[0]
                target = ' '.join(parts[1:])
                transcripts[audio_file] = [CHAR_TO_IX[i] for i in target]
    return feats, transcripts, utt_len
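

# For reference, each LibriSpeech transcript file lists one utterance per
# line, utterance id first, e.g. (from dev-clean):
#
#     1272-128104-0000 MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES
#
# so parts[0] doubles as the .flac basename and the remainder is the
# uppercase transcript that CHAR_TO_IX encodes.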


def plot_audio(audio, mfcc_feat):
    ''' Plots the raw waveform and its MFCC features for visual inspection. '''
    fig, axs = plt.subplots(2, 1, figsize=(10, 5))
    axs[0].plot(audio)
    axs[0].set_title('Raw Audio Signal')
    axs[1].plot(mfcc_feat)
    axs[1].set_title('Mel-frequency cepstral coefficients')
    plt.show()


def create_records():
    """ Pre-processes the raw audio and generates TFRecords.

    This function computes the MFCC features, encodes string transcripts
    into integers, and generates sequence examples for each utterance.
    Multiple sequence records are then written into TFRecord files.
    """
    for partition in sorted(glob.glob(AUDIO_PATH + '/*')):
        print('Processing ' + partition)
        feats, transcripts, utt_len = process_data(partition)
        sorted_utts = sorted(utt_len, key=utt_len.get)
        # Bin into groups of 100 frames.
        max_t = int(utt_len[sorted_utts[-1]] / 100)
        min_t = int(utt_len[sorted_utts[0]] / 100)

        # Create destination directory
        write_dir = ('../data/librispeech/processed/' +
                     os.path.basename(partition))
        if tf.gfile.Exists(write_dir):
            tf.gfile.DeleteRecursively(write_dir)
        tf.gfile.MakeDirs(write_dir)

        if os.path.basename(partition) == 'train-clean-100':
            # Create multiple TFRecords based on utterance length for training
            writer = {}
            count = {}
            print('Processing training files...')
            for i in range(min_t, max_t + 1):
                filename = os.path.join(write_dir,
                                        'train_' + str(i) + '.tfrecords')
                writer[i] = tf.python_io.TFRecordWriter(filename)
                count[i] = 0
            for utt in tqdm(sorted_utts):
                example = make_example(utt_len[utt], feats[utt].tolist(),
                                       transcripts[utt])
                index = int(utt_len[utt] / 100)
                writer[index].write(example)
                count[index] += 1
            for i in range(min_t, max_t + 1):
                writer[i].close()
            print(count)
            # Remove bins which have fewer than 20 utterances
            for i in range(min_t, max_t + 1):
                if count[i] < 20:
                    os.remove(os.path.join(write_dir,
                                           'train_' + str(i) + '.tfrecords'))
        else:
            # Create a single TFRecord for the dev and test partitions
            filename = os.path.join(write_dir, os.path.basename(write_dir) +
                                    '.tfrecords')
            print('Creating', filename)
            record_writer = tf.python_io.TFRecordWriter(filename)
            for utt in sorted_utts:
                example = make_example(utt_len[utt], feats[utt].tolist(),
                                       transcripts[utt])
                record_writer.write(example)
            record_writer.close()
        print('Processed ' + str(len(sorted_utts)) + ' audio files')
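

# Bucketing arithmetic, for reference: with winstep=0.01 (about 100 frames
# per second), a hypothetical 12.4 s utterance has roughly 1240 frames and
# lands in bucket int(1240 / 100) = 12, i.e. train_12.tfrecords; buckets
# holding fewer than 20 utterances are deleted afterwards.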


# AUDIO_PATH is the directory that contains the LibriSpeech data,
# partitioned into three folders: dev-clean, train-clean-100, test-clean.
AUDIO_PATH = '../data/librispeech/audio/'
ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ' "
CHAR_TO_IX = {ch: i for (i, ch) in enumerate(ALPHABET)}
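# Encoding example: 'A' -> 0 ... 'Z' -> 25, "'" -> 26, ' ' -> 27, so the
# transcript 'HI THERE' becomes [7, 8, 27, 19, 7, 4, 17, 4].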

if __name__ == '__main__':
    create_records()