DS_input.py
import os.path
import glob
import tensorflow as tf
# Global constants describing the dataset
# Note this definition must match the ALPHABET chosen in
# preprocess_Librispeech.py
ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ' " # for LibriSpeech
NUM_CLASSES = len(ALPHABET) + 1 # Additional class for blank character
NUM_PER_EPOCH_FOR_TRAIN = 28535
NUM_PER_EPOCH_FOR_EVAL = 2703
NUM_PER_EPOCH_FOR_TEST = 2620
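# Example: with the 28-character ALPHABET above, transcript labels use the
# indices 0..27 and tf.nn.ctc_loss reserves index NUM_CLASSES - 1 (here 28)
# as the CTC blank, which is why one extra class is added.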


def _generate_feats_and_label_batch(filename_queue, batch_size):
    """Construct a queued batch of spectral features and transcriptions.

    Args:
      filename_queue: queue of filenames to read data from.
      batch_size: Number of utterances per batch.

    Returns:
      feats: MFCCs. 3D tensor of [batch_size, max_time, 13] size,
        zero-padded to the longest utterance in the batch.
      labels: transcripts. SparseTensor of [batch_size, max_label_len] size.
      seq_lens: sequence lengths. 1D int32 tensor of [batch_size] size.
    """
    # Define how to parse the example
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    context_features = {
        "seq_len": tf.FixedLenFeature([], dtype=tf.int64),
        "labels": tf.VarLenFeature(dtype=tf.int64)
    }
    sequence_features = {
        # mfcc features are 13 dimensional
        "feats": tf.FixedLenSequenceFeature([13], dtype=tf.float32)
    }

    # Parse the example (returns a dictionary of tensors)
    context_parsed, sequence_parsed = tf.parse_single_sequence_example(
        serialized=serialized_example,
        context_features=context_features,
        sequence_features=sequence_features)
    # Generate a batch worth of examples after bucketing utterances of
    # similar length together, so dynamic padding wastes little compute.
    seq_len, (feats, labels) = tf.contrib.training.bucket_by_sequence_length(
        input_length=tf.cast(context_parsed['seq_len'], tf.int32),
        tensors=[sequence_parsed['feats'], context_parsed['labels']],
        batch_size=batch_size,
        bucket_boundaries=list(range(100, 1900, 100)),
        allow_smaller_final_batch=True,
        num_threads=16,
        dynamic_pad=True)
    return feats, tf.cast(labels, tf.int32), seq_len
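
# A minimal downstream sketch (hypothetical, not code from this repo): the
# batched outputs above plug directly into CTC training, e.g.
#
#   logits = acoustic_model(feats)   # hypothetical model function
#   loss = tf.nn.ctc_loss(labels, logits, seq_len, time_major=False)
#
# where tf.nn.ctc_loss consumes the sparse integer labels, per-frame logits
# over NUM_CLASSES, and the true (unpadded) sequence lengths.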


def inputs(eval_data, data_dir, batch_size, shuffle=False):
    """Construct input for LibriSpeech evaluation using the Reader ops.

    Args:
      eval_data: string, one of 'train', 'val' or 'test', selecting the
        split of the data set to read.
      data_dir: Path to the LibriSpeech data directory.
      batch_size: Number of utterances per batch.
      shuffle: bool, whether to shuffle the order of the input files.

    Returns:
      feats: MFCCs. 3D tensor of [batch_size, max_time, 13] size.
      labels: transcripts. SparseTensor of [batch_size, max_label_len] size.
      seq_lens: sequence lengths. 1D int32 tensor of [batch_size] size.
    """
    if eval_data == 'train':
        num_files = len(glob.glob(os.path.join(data_dir,
                                               'train*/*.tfrecords')))
        filenames = [os.path.join(data_dir, 'train-clean-100/train_' +
                                  str(i) + '.tfrecords')
                     for i in range(1, num_files + 1)]
    elif eval_data == 'val':
        filenames = glob.glob(os.path.join(data_dir, 'dev*/*.tfrecords'))
    elif eval_data == 'test':
        filenames = glob.glob(os.path.join(data_dir, 'test*/*.tfrecords'))
    else:
        raise ValueError("eval_data must be 'train', 'val' or 'test', "
                         "got: " + str(eval_data))

    for filename in filenames:
        if not tf.gfile.Exists(filename):
            raise ValueError('Failed to find file: ' + filename)

    # Create a queue that produces the filenames to read.
    filename_queue = tf.train.string_input_producer(filenames, shuffle=shuffle)

    # Generate a batch of features and labels by building up a queue of examples.
    return _generate_feats_and_label_batch(filename_queue, batch_size)
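

# Usage sketch (an illustration, not part of the original pipeline): TF 1.x
# queue-based inputs only define graph ops, so the queue runners must be
# started in a session before the batch tensors can be evaluated. The
# data_dir path below is a hypothetical placeholder.
if __name__ == '__main__':
    feats, labels, seq_lens = inputs('train', '/path/to/librispeech',
                                     batch_size=32)
    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        feats_val, seq_lens_val = sess.run([feats, seq_lens])
        print('feats shape: %s' % (feats_val.shape,))   # (batch, padded_time, 13)
        print('seq_lens: %s' % (seq_lens_val[:5],))     # true lengths before padding
        coord.request_stop()
        coord.join(threads)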