Commit
Merge pull request espnet#1241 from ftshijt/ASR_commonvoice_recipe_update

Asr commonvoice recipe update
sw005320 authored Oct 30, 2019
2 parents 9c7ed9b + ec47212 commit a03766a
Showing 6 changed files with 150 additions and 40 deletions.
2 changes: 1 addition & 1 deletion egs/README.md
@@ -12,7 +12,7 @@
| chime4 | The 4th CHiME Speech Separation and Recognition Challenge | ASR/Multichannel ASR | EN | http://spandh.dcs.shef.ac.uk/chime_challenge/chime2016/ | |
| chime5 | The 5th CHiME Speech Separation and Recognition Challenge | ASR | EN | http://spandh.dcs.shef.ac.uk/chime_challenge/ | |
| cmu_wilderness | CMU Wilderness Multilingual Speech Dataset | Multilingual ASR | ~100 Languages | https://github.com/festvox/datasets-CMU_Wilderness | |
| commonvoice | The Mozilla Common Voice corpus v1. | ASR | EN | https://voice.mozilla.org/datasets | |
| commonvoice | The Mozilla Common Voice | ASR | 13 Languages | https://voice.mozilla.org/datasets | |
| csj | Corpus of Spontaneous Japanese | ASR | JP | https://pj.ninjal.ac.jp/corpus_center/csj/en/ | |
| csmsc | Chinese Standard Mandarin Speech Corpus | TTS | ZH | https://www.data-baker.com/open_source.html | |
| dirha_wsj | Distant-speech Interaction for Robust Home Applications | Multi-Array ASR | EN | https://dirha.fbk.eu/, https://github.com/SHINE-FBK/DIRHA_English_wsj| |
27 changes: 14 additions & 13 deletions egs/commonvoice/asr1/local/data_prep.pl
@@ -11,19 +11,28 @@
exit(1);
}

# use ffmpeg for mp3 to wav
if (length(`which ffmpeg`) == 0) {
print "Please install 'ffmpeg' on All worker nodes!\n";
exit 1;
}


($db_base, $dataset, $out_dir) = @ARGV;
mkdir data unless -d data;
mkdir $out_dir unless -d $out_dir;

open(CSV, "<", "$db_base/$dataset.csv") or die "cannot open dataset CSV file";
open(CSV, "<", "$db_base/$dataset.tsv");

open(CSV, "<", "$db_base/$dataset.tsv") or die "cannot open dataset CSV file";
open(SPKR,">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
open(GNDR,">", "$out_dir/utt2gender") or die "Could not open the output file $out_dir/utt2gender";
open(TEXT,">", "$out_dir/text") or die "Could not open the output file $out_dir/text";
open(WAV,">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
my $header = <CSV>;
while(<CSV>) {
chomp;
($filepath, $text, $upvotes, $downvotes, $age, $gender, $accent, $duration) = split(",", $_);
($spkr, $filepath, $text, $upvotes, $downvotes, $age, $gender, $accent) = split("\t", $_);
if ("$gender" eq "female") {
$gender = "f";
} else {
@@ -33,20 +42,12 @@
$uttId = $filepath;
$uttId =~ s/\.mp3//g;
$uttId =~ tr/\//-/;
# No speaker information is provided, so we treat each utterance as coming from a different speaker
$spkr = $uttId;
$text =~ s/ said 'eat when/ said eat when/g;
$text =~ s/'and this is what your son said'/and this is what your son said/g;
$text =~ s/^'m /i'm /g;
$text =~ s/'mummy'/mummy/g;
$text =~ s/'poppy'/poppy/g;
$text =~ s/'every/every/g;
$text =~ s/'super fun playground'/super fun playground/g;
$text =~ s/'under construction'/under construction/g;
# speaker information should be a prefix of the utterance Id
$uttId = "$spkr-$uttId";
$text =~ tr/a-z/A-Z/;
print TEXT "$uttId"," ","$text","\n";
print GNDR "$uttId"," ","$gender","\n";
print WAV "$uttId"," sox $db_base/$filepath -t wav -r 16k -b 16 -e signed - |\n";
print WAV "$uttId"," ffmpeg -i $db_base/clips/$filepath -f wav -ar 16000 -ab 16 - |\n";
print SPKR "$uttId"," $spkr","\n";
}
close(SPKR) || die;
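
For orientation (not part of the diff): each row of the Common Voice TSV that data_prep.pl now reads carries, tab-separated, a speaker id, the clip path, the transcript, up/down vote counts, age, gender, and accent, and every mp3 clip is converted to 16 kHz WAV on the fly through the ffmpeg pipe written to wav.scp. A minimal sketch of checking the ffmpeg dependency and converting one clip by hand, with a made-up clip name:

# Illustrative only; the clip path below is hypothetical.
command -v ffmpeg >/dev/null || { echo "ffmpeg must be installed on all worker nodes"; exit 1; }
ffmpeg -i downloads/clips/common_voice_en_000001.mp3 -f wav -ar 16000 test.wav
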
18 changes: 11 additions & 7 deletions egs/commonvoice/asr1/local/download_and_untar.sh
@@ -15,13 +15,16 @@ if [ "$1" == --remove-archive ]; then
fi

if [ $# -ne 2 ]; then
echo "Usage: $0 [--remove-archive] <data-base> <url>"
echo "e.g.: $0 /export/data/ https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz"
echo "Usage: $0 [--remove-archive] <data-base> <url> <filename>"
echo "e.g.: $0 /export/data/ https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz cv_corpus_v1.tar.gz"
echo "With --remove-archive it will remove the archive after successfully un-tarring it."
fi

data=$1
url=$2
filename=$3
filepath="$data/$filename"
workspace=$PWD

if [ ! -d "$data" ]; then
echo "$0: no such directory $data"
@@ -33,13 +36,11 @@ if [ -z "$url" ]; then
exit 1;
fi

if [ -f $data/cv_corpus_v1/.complete ]; then
if [ -f $data/$filename.complete ]; then
echo "$0: data was already successfully extracted, nothing to do."
exit 0;
fi

filepath="$data/cv_corpus_v1.tar.gz"
filesize="12852160484"

if [ -f $filepath ]; then
size=$(/bin/ls -l $filepath | awk '{print $5}')
@@ -66,16 +67,19 @@ if [ ! -f $filepath ]; then
echo "$0: error executing wget $url"
exit 1;
fi
cd $workspace
fi

cd $data

if ! tar -xzf $filepath; then
if ! tar -xzf $filename; then
echo "$0: error un-tarring archive $filepath"
exit 1;
fi

touch $data/cv_corpus_v1/.complete
cd $workspace

touch $data/$filename.complete

echo "$0: Successfully downloaded and un-tarred $filepath"

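
Usage sketch (illustrative, not taken from the diff): the script now expects a third <filename> argument and records completion in a per-archive $data/$filename.complete marker, so several language bundles can share one download directory. Invoked as in stage -1 of run.sh, e.g. for English:

lang=en
mkdir -p downloads
local/download_and_untar.sh downloads \
  https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/${lang}.tar.gz \
  ${lang}.tar.gz
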
31 changes: 31 additions & 0 deletions egs/commonvoice/asr1/local/filter_text.py
@@ -0,0 +1,31 @@
#!/usr/bin/env python3

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
import codecs
from io import open
import sys


sys.stdin = codecs.getreader('utf-8')(
    sys.stdin.buffer)
sys.stdout = codecs.getwriter('utf-8')(
    sys.stdout.buffer)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--filter-list', '-f', type=str,
                        help='filter list')
    args = parser.parse_args()

    with open(args.filter_list, encoding='utf-8') as f:
        fil = [x.rstrip() for x in f]

    for x in sys.stdin:
        # extract text parts
        text = ' '.join(x.rstrip().split()[1:])
        if text in fil:
            print(x.split()[0], text)
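
How this filter is meant to be used (a sketch with hypothetical paths): it reads Kaldi-style text lines (utterance id followed by the transcript) on stdin and prints only those whose transcript appears verbatim in the --filter-list file, one prompt per line, as split_tr_dt_et.sh does below:

# Keep only utterances whose transcript is listed in dt_prompts (paths are made up).
cat data/validated_en/text | local/filter_text.py -f tmp/dt_prompts | awk '{print $1}' | sort > tmp/dt.ids
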
65 changes: 65 additions & 0 deletions egs/commonvoice/asr1/local/split_tr_dt_et.sh
@@ -0,0 +1,65 @@
#!/bin/bash

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

. ./path.sh

perdt=10 # percent for dev set
peret=10 # percent for eval set

. utils/parse_options.sh

if [ $# != 4 ]; then
echo "Usage: $0 <src-data-dir> <dest-trdata-dir> <dest-dtdata-dir> <dest-etdata-dir>";
exit 1;
fi

sdata=$1
trdata=$2
dtdata=$3
etdata=$4

tmpdata=$trdata/tmp
mkdir -p $tmpdata
mkdir -p $dtdata
mkdir -p $etdata

# make a unique prompts file
# some transcripts have multiple spaces and need tr -s " " to remove them
cut -f 2- -d" " $sdata/text | tr -s " " | sort | uniq > $tmpdata/prompts
num_prompt=`wc -l $tmpdata/prompts | awk '{print $1}'`

num_dt=`echo "$num_prompt * $perdt / 100" | bc`
num_et=`echo "$num_prompt * $peret / 100" | bc`
echo "number of dev set prompts: $num_dt"
echo "number of eval set prompts: $num_et"

# dt
utils/shuffle_list.pl $tmpdata/prompts | head -n $num_dt > $tmpdata/dt_prompts
# et
utils/shuffle_list.pl $tmpdata/prompts | head -n `echo "$num_dt + $num_et" | bc` \
| tail -n $num_et > $tmpdata/et_prompts
# tr
nrest=`echo "$num_dt + $num_et + 1" | bc`
utils/shuffle_list.pl $tmpdata/prompts | \
tail -n +$nrest > $tmpdata/tr_prompts
echo "number of train set prompts: `wc -l $tmpdata/tr_prompts | awk '{print $1}'`"

# it takes a very long time when the number of prompts is large
cat $sdata/text | local/filter_text.py -f $tmpdata/dt_prompts | awk '{print $1}' | sort > $tmpdata/dt.ids
echo "finished text extraction for dev set #utt = `wc -l $tmpdata/dt.ids | awk '{print $1}'`"
cat $sdata/text | local/filter_text.py -f $tmpdata/et_prompts | awk '{print $1}' | sort > $tmpdata/et.ids
echo "finished text extraction for dev set #utt = `wc -l $tmpdata/et.ids | awk '{print $1}'`"
cat $tmpdata/dt.ids $tmpdata/et.ids | sort > $tmpdata/dtet.ids
cat $sdata/text | awk '{print $1}' | sort > $tmpdata/all.ids
diff $tmpdata/all.ids $tmpdata/dtet.ids | awk '/^</{print $2}' | sort > $tmpdata/tr.ids
echo "finished text extraction for dev set #utt = `wc -l $tmpdata/tr.ids | awk '{print $1}'`"

reduce_data_dir.sh $sdata $tmpdata/dt.ids $dtdata
reduce_data_dir.sh $sdata $tmpdata/et.ids $etdata
reduce_data_dir.sh $sdata $tmpdata/tr.ids $trdata

utils/fix_data_dir.sh $dtdata
utils/fix_data_dir.sh $etdata
utils/fix_data_dir.sh $trdata
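
The split is made over unique transcripts rather than speakers: by default 10% of the prompts go to the dev set and 10% to the eval set, and any sentence assigned to dev or eval is filtered out of the training portion, matching the voxforge-style split referenced in run.sh. A sketch of the call made in stage 0, with the percentages overridable through utils/parse_options.sh:

# lang=en shown for illustration; run.sh passes its own ${lang}.
local/split_tr_dt_et.sh --perdt 10 --peret 10 \
  data/validated_en data/valid_train_en data/valid_dev_en data/valid_test_en
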
47 changes: 28 additions & 19 deletions egs/commonvoice/asr1/run.sh
@@ -8,7 +8,7 @@

# general configuration
backend=pytorch
stage=0 # start from 0 if you need to start from data preparation
stage=-1 # start from 0 if you need to start from data preparation
stop_stage=100
ngpu=1 # number of gpus ("0" uses cpu, otherwise use gpu)
debugmode=1
@@ -32,13 +32,11 @@ lmtag= # tag for managing LMs
recog_model=model.acc.best # set a model to be used for decoding: 'model.acc.best' or 'model.loss.best'
n_average=10

# Set this to somewhere where you want to put your data, or where
# someone else has already put it. You'll want to change this
# if you're not on the CLSP grid.
datadir=/path/cv_corpus_v1
datadir=downloads # original data directory to be stored
lang=en # en de fr cy tt kab ca zh-TW it fa eu es ru

# base url for downloads.
data_url=https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz
data_url=https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/$lang.tar.gz

# bpemode (unigram or bpe)
nbpe=150
@@ -55,23 +53,34 @@ set -e
set -u
set -o pipefail

train_set=valid_train
train_dev=valid_dev
recog_set="valid_dev valid_test"
train_set=valid_train_${lang}
train_dev=valid_dev_${lang}
test_set=valid_test_${lang}
recog_set="valid_dev_${lang} valid_test_${lang}"

if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
mkdir -p ${datadir}
local/download_and_untar.sh ${datadir} ${data_url}
local/download_and_untar.sh ${datadir} ${data_url} ${lang}.tar.gz
fi

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to make data the following preparation part by yourself.
### But you can utilize Kaldi recipes in most cases
for part in valid-train valid-dev valid-test; do
for part in "validated"; do
# use underscore-separated names in data directories.
local/data_prep.pl ${datadir}/ cv-${part} data/"$(echo ${part} | tr - _)"
local/data_prep.pl ${datadir} ${part} data/"$(echo "${part}_${lang}" | tr - _)"
done

# Kaldi Version Split
# ./utils/subset_data_dir_tr_cv.sh data/validated data/valid_train data/valid_test_dev
# ./utils/subset_data_dir_tr_cv.sh --cv-spk-percent 50 data/valid_test_dev data/valid_test data/valid_dev

# ESPNet Version (same as voxforge)
# consider duplicated sentences (does not consider speaker split)
# filter out the same sentences (also same text) of test&dev set from validated set
echo data/validated_${lang} data/${train_set} data/${train_dev} data/${test_set}
local/split_tr_dt_et.sh data/validated_${lang} data/${train_set} data/${train_dev} data/${test_set}
fi

feat_tr_dir=${dumpdir}/${train_set}/delta${do_delta}; mkdir -p ${feat_tr_dir}
@@ -82,18 +91,18 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: Feature Generation"
fbankdir=fbank
# Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
for x in valid_train valid_dev valid_test; do
for x in ${train_set} ${train_dev} ${recog_set}; do
steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 4 --write_utt2num_frames true \
data/${x} exp/make_fbank/${x} ${fbankdir}
utils/fix_data_dir.sh data/${x}
done
# Remove features with too long frames in training data
max_len=3000
mv data/valid_train/utt2num_frames data/valid_train/utt2num_frames.bak
awk -v max_len=${max_len} '$2 < max_len {print $1, $2}' data/valid_train/utt2num_frames.bak > data/valid_train/utt2num_frames
utils/filter_scp.pl data/valid_train/utt2num_frames data/valid_train/utt2spk > data/valid_train/utt2spk.new
mv data/valid_train/utt2spk.new data/valid_train/utt2spk
utils/fix_data_dir.sh data/valid_train
mv data/${train_set}/utt2num_frames data/${train_set}/utt2num_frames.bak
awk -v max_len=${max_len} '$2 < max_len {print $1, $2}' data/${train_set}/utt2num_frames.bak > data/${train_set}/utt2num_frames
utils/filter_scp.pl data/${train_set}/utt2num_frames data/${train_set}/utt2spk > data/${train_set}/utt2spk.new
mv data/${train_set}/utt2spk.new data/${train_set}/utt2spk
utils/fix_data_dir.sh data/${train_set}

# compute global CMVN
compute-cmvn-stats scp:data/${train_set}/feats.scp data/${train_set}/cmvn.ark
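
Taken together, the recipe is now parameterised by ${lang}: it selects the download bundle, the data directories, and the train/dev/test set names, and stage -1 performs the download. Assuming run.sh sources utils/parse_options.sh as ESPnet recipes usually do (that part of the file is outside these hunks), a run for another language would look like:

# Hypothetical invocation; any language listed next to lang= in run.sh can be substituted.
./run.sh --lang de --stage -1 --stop_stage 100 --ngpu 1
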
