
Merge pull request mozilla#1020 from tilmankamp/common_voice_importer
Better resuming; additional README content; small improvements
tilmankamp authored Nov 29, 2017
2 parents 2d3ab34 + aeada86 commit 06aa605
Showing 2 changed files with 97 additions and 62 deletions.
18 changes: 11 additions & 7 deletions README.md
@@ -7,7 +7,7 @@ Project DeepSpeech is an open source Speech-To-Text engine. It uses a model trai

![Usage](images/usage.gif)

Pre-built binaries that can be used for performing inference with a trained model can be installed with `pip`. Proper setup using a virtual environment is recommended, and you can find that documented [below](#using-the-python-package).

Once installed you can then use the `deepspeech` binary to do speech-to-text on an audio file:

@@ -200,16 +200,20 @@ pip install /tmp/tensorflow_gpu_warpctc-1.3.0rc0-cp27-cp27mu-linux_x86_64.whl
### Common Voice training data

The Common Voice corpus consists of voice samples that were donated through [Common Voice](https://voice.mozilla.org/).
We provide an importer that automates the whole process of downloading and preparing the corpus.
You just specify a target directory where all Common Voice contents should go.
If you already downloaded the Common Voice corpus archive from [here](https://voice.mozilla.org/data), you can simply run the import script on the directory where the corpus is located.
The importer will then skip downloading it and immediately proceed to unpacking and importing.
To start the import process, you can call:

```bash
bin/import_cv.py path/to/target/directory
```

Please be aware that this requires at least 70GB of free disk space and quite some time to complete.
As this process creates a huge number of small files, using an SSD drive is highly recommended.
If the import script gets interrupted, it will try to continue from where it stopped the next time you run it.
Unfortunately, there are some cases where it will need to start over.
Once the import is done, the directory will contain a bunch of CSV files.

The following files are official user-validated sets for training, validating and testing:
@@ -229,7 +233,7 @@ The following files are the non-validated unofficial sets for training, validati
A sub-directory called `cv_corpus_{version}` contains the mp3 and wav files that were extracted from an archive named `cv_corpus_{version}.tar.gz`.
All entries in the CSV files refer to their samples by absolute paths, so moving this sub-directory would require either another import or tweaking the CSV files accordingly.
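
Each generated CSV starts with the header `wav_filename,wav_filesize,transcript` (see `FIELDNAMES` in `bin/import_cv.py` below), followed by one line per kept sample. A minimal sketch of inspecting one of these sets; the file name `cv-valid-train.csv` and the `../data/CV` directory are assumed example values here:

```python
import csv

# Hypothetical example path - substitute the target directory and CSV file of your own import
with open('../data/CV/cv-valid-train.csv') as csv_file:
    for row in csv.DictReader(csv_file):
        # Each row carries an absolute wav path, its file size in bytes and the transcript
        print(row['wav_filename'], row['wav_filesize'], row['transcript'])
        break  # only show the first sample
```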

To use Common Voice data during training, validation and testing, you pass (comma-separated combinations of) their filenames to the `--train_files`, `--dev_files` and `--test_files` parameters of `DeepSpeech.py`.
If, for example, Common Voice was imported into `../data/CV`, `DeepSpeech.py` could be called like this:

```bash
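# The original example command is cut off in this diff view.
# A minimal sketch, assuming Common Voice was imported into ../data/CV and that the
# validated sets use the (hypothetical here) names cv-valid-train.csv, cv-valid-dev.csv, cv-valid-test.csv:
./DeepSpeech.py --train_files ../data/CV/cv-valid-train.csv \
                --dev_files ../data/CV/cv-valid-dev.csv \
                --test_files ../data/CV/cv-valid-test.csv
```
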
141 changes: 86 additions & 55 deletions bin/import_cv.py
@@ -9,101 +9,132 @@

import csv
import tarfile
import requests
import subprocess
from glob import glob
from os import path
from sox import Transformer
from threading import Lock
from multiprocessing.dummy import Pool
from multiprocessing import cpu_count
from util.progress import print_progress

FIELDNAMES = ['wav_filename', 'wav_filesize', 'transcript']
SAMPLE_RATE = 16000
MAX_SECS = 10
ARCHIVE_DIR_NAME = 'cv_corpus_v1'
ARCHIVE_NAME = ARCHIVE_DIR_NAME + '.tar.gz'
ARCHIVE_URL = 'https://s3.us-east-2.amazonaws.com/common-voice-data-download/' + ARCHIVE_NAME

def _download_and_preprocess_data(target_dir):
    # Making path absolute
    target_dir = path.abspath(target_dir)
    # Conditionally download data
    archive_path = _maybe_download(ARCHIVE_NAME, target_dir, ARCHIVE_URL)
    # Conditionally extract common voice data
    _maybe_extract(target_dir, ARCHIVE_DIR_NAME, archive_path)
    # Conditionally convert common voice CSV files and mp3 data to DeepSpeech CSVs and wav
    _maybe_convert_sets(target_dir, ARCHIVE_DIR_NAME)

def _maybe_download(archive_name, target_dir, archive_url):
    # If archive file does not exist, download it...
    archive_path = path.join(target_dir, archive_name)
    if not path.exists(archive_path):
        print('No archive "%s" - downloading...' % archive_path)
        req = requests.get(archive_url, stream=True)
        total_size = int(req.headers.get('content-length', 0))
        done = 0
        with open(archive_path, 'wb') as f:
            for data in req.iter_content(1024*1024):
                done += len(data)
                f.write(data)
                print_progress(done, total_size)
    else:
        print('Found archive "%s" - not downloading.' % archive_path)
    return archive_path

def _maybe_extract(target_dir, extracted_data, archive_path):
    # If target_dir/extracted_data does not exist, extract archive in target_dir
    extracted_path = path.join(target_dir, extracted_data)
    if not path.exists(extracted_path):
        print('No directory "%s" - extracting archive...' % archive_path)
        with tarfile.open(archive_path) as tar:
            members = list(tar.getmembers())
            for i, member in enumerate(members):
                print_progress(i + 1, len(members))
                tar.extract(member, path=target_dir)
    else:
        print('Found directory "%s" - not extracting it from archive.' % archive_path)

def _maybe_convert_sets(target_dir, extracted_data):
    extracted_dir = path.join(target_dir, extracted_data)
    for source_csv in glob(path.join(extracted_dir, '*.csv')):
        _maybe_convert_set(extracted_dir, source_csv, path.join(target_dir, os.path.split(source_csv)[-1]))

def _maybe_convert_set(extracted_dir, source_csv, target_csv):
    print()
    if path.exists(target_csv):
        print('Found CSV file "%s" - not importing "%s".' % (target_csv, source_csv))
        return
    print('No CSV file "%s" - importing "%s"...' % (target_csv, source_csv))
    samples = []
    with open(source_csv) as source_csv_file:
        reader = csv.DictReader(source_csv_file)
        for row in reader:
            samples.append((row['filename'], row['text']))

    # Mutable counters for the concurrent embedded routine
    counter = { 'all': 0, 'too_short': 0, 'too_long': 0 }
    lock = Lock()
    num_samples = len(samples)
    rows = []

    def one_sample(sample):
        mp3_filename = path.join(*(sample[0].split('/')))
        mp3_filename = path.join(extracted_dir, mp3_filename)
        # Storing wav files next to the mp3 ones - just with a different suffix
        wav_filename = path.splitext(mp3_filename)[0] + ".wav"
        _maybe_convert_wav(mp3_filename, wav_filename)
        frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT))
        file_size = path.getsize(wav_filename)
        with lock:
            if int(frames/SAMPLE_RATE*1000/10/2) < len(str(sample[1])):
                # Excluding samples that are too short to fit the transcript
                counter['too_short'] += 1
            elif frames/SAMPLE_RATE > MAX_SECS:
                # Excluding very long samples to keep a reasonable batch-size
                counter['too_long'] += 1
            else:
                # This one is good - keep it for the target CSV
                rows.append((wav_filename, file_size, sample[1]))
            counter['all'] += 1
            print_progress(counter['all'], num_samples)

    print('Importing mp3 files...')
    pool = Pool(cpu_count())
    pool.map(one_sample, samples)
    pool.close()
    pool.join()

    print_progress(num_samples, num_samples)

    print('Writing "%s"...' % target_csv)
    with open(target_csv, 'w') as target_csv_file:
        writer = csv.DictWriter(target_csv_file, fieldnames=FIELDNAMES)
        writer.writeheader()
        for i, row in enumerate(rows):
            filename, file_size, transcript = row
            print_progress(i + 1, len(rows))
            writer.writerow({ 'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript })

    print('Imported %d samples.' % (counter['all'] - counter['too_short'] - counter['too_long']))
    if counter['too_short'] > 0:
        print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])
    if counter['too_long'] > 0:
        print('Skipped %d samples that were longer than %d seconds.' % (counter['too_long'], MAX_SECS))

def _maybe_convert_wav(mp3_filename, wav_filename):
    if not path.exists(wav_filename):
        transformer = Transformer()
        transformer.convert(samplerate=SAMPLE_RATE)
        transformer.build(mp3_filename, wav_filename)
