Commit
Merge pull request espnet#1241 from ftshijt/ASR_commonvoice_recipe_update

Asr commonvoice recipe update
sw005320 authored Oct 30, 2019
2 parents 9c7ed9b + ec47212 commit a03766a
Showing 6 changed files with 150 additions and 40 deletions.
2 changes: 1 addition & 1 deletion egs/README.md
@@ -12,7 +12,7 @@
| chime4 | The 4th CHiME Speech Separation and Recognition Challenge | ASR/Multichannel ASR | EN | http://spandh.dcs.shef.ac.uk/chime_challenge/chime2016/ | |
| chime5 | The 5th CHiME Speech Separation and Recognition Challenge | ASR | EN | http://spandh.dcs.shef.ac.uk/chime_challenge/ | |
| cmu_wilderness | CMU Wilderness Multilingual Speech Dataset | Multilingual ASR | ~100 Languages | https://github.com/festvox/datasets-CMU_Wilderness | |
| commonvoice | The Mozilla Common Voice corpus v1. | ASR | EN | https://voice.mozilla.org/datasets | |
| commonvoice | The Mozilla Common Voice | ASR | 13 Languages | https://voice.mozilla.org/datasets | |
| csj | Corpus of Spontaneous Japanese | ASR | JP | https://pj.ninjal.ac.jp/corpus_center/csj/en/ | |
| csmsc | Chinese Standard Mandarin Speech Corpus | TTS | ZH | https://www.data-baker.com/open_source.html | |
| dirha_wsj | Distant-speech Interaction for Robust Home Applications | Multi-Array ASR | EN | https://dirha.fbk.eu/, https://github.com/SHINE-FBK/DIRHA_English_wsj| |
27 changes: 14 additions & 13 deletions egs/commonvoice/asr1/local/data_prep.pl
@@ -11,19 +11,28 @@
exit(1);
}

# use ffmpeg for mp3 to wav
if (length(`which ffmpeg`) == 0) {
print "Please install 'ffmpeg' on All worker nodes!\n";
exit 1;
}


($db_base, $dataset, $out_dir) = @ARGV;
mkdir data unless -d data;
mkdir $out_dir unless -d $out_dir;

open(CSV, "<", "$db_base/$dataset.csv") or die "cannot open dataset CSV file";
open(CSV, "<", "$db_base/$dataset.tsv");

open(CSV, "<", "$db_base/$dataset.tsv") or die "cannot open dataset CSV file";
open(SPKR,">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
open(GNDR,">", "$out_dir/utt2gender") or die "Could not open the output file $out_dir/utt2gender";
open(TEXT,">", "$out_dir/text") or die "Could not open the output file $out_dir/text";
open(WAV,">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
my $header = <CSV>;
while(<CSV>) {
chomp;
($filepath, $text, $upvotes, $downvotes, $age, $gender, $accent, $duration) = split(",", $_);
($spkr, $filepath, $text, $upvotes, $downvotes, $age, $gender, $accent) = split("\t", $_);
if ("$gender" eq "female") {
$gender = "f";
} else {
@@ -33,20 +42,12 @@
$uttId = $filepath;
$uttId =~ s/\.mp3//g;
$uttId =~ tr/\//-/;
# No speaker information is provided, so we treat each utterance as coming from a different speaker
$spkr = $uttId;
$text =~ s/ said 'eat when/ said eat when/g;
$text =~ s/'and this is what your son said'/and this is what your son said/g;
$text =~ s/^'m /i'm /g;
$text =~ s/'mummy'/mummy/g;
$text =~ s/'poppy'/poppy/g;
$text =~ s/'every/every/g;
$text =~ s/'super fun playground'/super fun playground/g;
$text =~ s/'under construction'/under construction/g;
# speaker information should be a prefix of the utterance Id
$uttId = "$spkr-$uttId";
$text =~ tr/a-z/A-Z/;
print TEXT "$uttId"," ","$text","\n";
print GNDR "$uttId"," ","$gender","\n";
print WAV "$uttId"," sox $db_base/$filepath -t wav -r 16k -b 16 -e signed - |\n";
print WAV "$uttId"," ffmpeg -i $db_base/clips/$filepath -f wav -ar 16000 -ab 16 - |\n";
print SPKR "$uttId"," $spkr","\n";
}
close(SPKR) || die;
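
For orientation (not part of the diff): each row of the Common Voice TSV that data_prep.pl now reads carries, tab-separated, a speaker id, the clip path, the transcript, up/down vote counts, age, gender, and accent, and every mp3 clip is converted to 16 kHz WAV on the fly through the ffmpeg pipe written to wav.scp. A minimal sketch of checking the ffmpeg dependency and converting one clip by hand, with a made-up clip name:

# Illustrative only; the clip path below is hypothetical.
command -v ffmpeg >/dev/null || { echo "ffmpeg must be installed on all worker nodes"; exit 1; }
ffmpeg -i downloads/clips/common_voice_en_000001.mp3 -f wav -ar 16000 test.wav
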
18 changes: 11 additions & 7 deletions egs/commonvoice/asr1/local/download_and_untar.sh
@@ -15,13 +15,16 @@ if [ "$1" == --remove-archive ]; then
fi

if [ $# -ne 2 ]; then
echo "Usage: $0 [--remove-archive] <data-base> <url>"
echo "e.g.: $0 /export/data/ https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz"
echo "Usage: $0 [--remove-archive] <data-base> <url> <filename>"
echo "e.g.: $0 /export/data/ https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz cv_corpus_v1.tar.gz"
echo "With --remove-archive it will remove the archive after successfully un-tarring it."
fi

data=$1
url=$2
filename=$3
filepath="$data/$filename"
workspace=$PWD

if [ ! -d "$data" ]; then
echo "$0: no such directory $data"
@@ -33,13 +36,11 @@ if [ -z "$url" ]; then
exit 1;
fi

if [ -f $data/cv_corpus_v1/.complete ]; then
if [ -f $data/$filename.complete ]; then
echo "$0: data was already successfully extracted, nothing to do."
exit 0;
fi

filepath="$data/cv_corpus_v1.tar.gz"
filesize="12852160484"

if [ -f $filepath ]; then
size=$(/bin/ls -l $filepath | awk '{print $5}')
@@ -66,16 +67,19 @@ if [ ! -f $filepath ]; then
echo "$0: error executing wget $url"
exit 1;
fi
cd $workspace
fi

cd $data

if ! tar -xzf $filepath; then
if ! tar -xzf $filename; then
echo "$0: error un-tarring archive $filepath"
exit 1;
fi

touch $data/cv_corpus_v1/.complete
cd $workspace

touch $data/$filename.complete

echo "$0: Successfully downloaded and un-tarred $filepath"

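
Usage sketch (illustrative, not taken from the diff): the script now expects a third <filename> argument and records completion in a per-archive $data/$filename.complete marker, so several language bundles can share one download directory. Invoked as in stage -1 of run.sh, e.g. for English:

lang=en
mkdir -p downloads
local/download_and_untar.sh downloads \
  https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/${lang}.tar.gz \
  ${lang}.tar.gz
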
31 changes: 31 additions & 0 deletions egs/commonvoice/asr1/local/filter_text.py
@@ -0,0 +1,31 @@
#!/usr/bin/env python3

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
import codecs
from io import open
import sys


sys.stdin = codecs.getreader('utf-8')(
    sys.stdin.buffer)
sys.stdout = codecs.getwriter('utf-8')(
    sys.stdout.buffer)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--filter-list', '-f', type=str,
                        help='filter list')
    args = parser.parse_args()

    with open(args.filter_list, encoding='utf-8') as f:
        fil = [x.rstrip() for x in f]

    for x in sys.stdin:
        # extract text parts
        text = ' '.join(x.rstrip().split()[1:])
        if text in fil:
            print(x.split()[0], text)
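
How this filter is meant to be used (a sketch with hypothetical paths): it reads Kaldi-style text lines (utterance id followed by the transcript) on stdin and prints only those whose transcript appears verbatim in the --filter-list file, one prompt per line, as split_tr_dt_et.sh does below:

# Keep only utterances whose transcript is listed in dt_prompts (paths are made up).
cat data/validated_en/text | local/filter_text.py -f tmp/dt_prompts | awk '{print $1}' | sort > tmp/dt.ids
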
65 changes: 65 additions & 0 deletions egs/commonvoice/asr1/local/split_tr_dt_et.sh
@@ -0,0 +1,65 @@
#!/bin/bash

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

. ./path.sh

perdt=10 # percent for dev set
peret=10 # percent for eval set

. utils/parse_options.sh

if [ $# != 4 ]; then
echo "Usage: $0 <src-data-dir> <dest-trdata-dir> <dest-dtdata-dir> <dest-etdata-dir>";
exit 1;
fi

sdata=$1
trdata=$2
dtdata=$3
etdata=$4

tmpdata=$trdata/tmp
mkdir -p $tmpdata
mkdir -p $dtdata
mkdir -p $etdata

# make a unique prompts file
# some transcripts have multiple spaces and need tr -s " " to remove them
cut -f 2- -d" " $sdata/text | tr -s " " | sort | uniq > $tmpdata/prompts
num_prompt=`wc -l $tmpdata/prompts | awk '{print $1}'`

num_dt=`echo "$num_prompt * $perdt / 100" | bc`
num_et=`echo "$num_prompt * $peret / 100" | bc`
echo "number of dev set prompts: $num_dt"
echo "number of eval set prompts: $num_et"

# dt
utils/shuffle_list.pl $tmpdata/prompts | head -n $num_dt > $tmpdata/dt_prompts
# et
utils/shuffle_list.pl $tmpdata/prompts | head -n `echo "$num_dt + $num_et" | bc` \
| tail -n $num_et > $tmpdata/et_prompts
# tr
nrest=`echo "$num_dt + $num_et + 1" | bc`
utils/shuffle_list.pl $tmpdata/prompts | \
tail -n +$nrest > $tmpdata/tr_prompts
echo "number of train set prompts: `wc -l $tmpdata/tr_prompts | awk '{print $1}'`"

# it takes a very long time when the number of prompts is large
cat $sdata/text | local/filter_text.py -f $tmpdata/dt_prompts | awk '{print $1}' | sort > $tmpdata/dt.ids
echo "finished text extraction for dev set #utt = `wc -l $tmpdata/dt.ids | awk '{print $1}'`"
cat $sdata/text | local/filter_text.py -f $tmpdata/et_prompts | awk '{print $1}' | sort > $tmpdata/et.ids
echo "finished text extraction for dev set #utt = `wc -l $tmpdata/et.ids | awk '{print $1}'`"
cat $tmpdata/dt.ids $tmpdata/et.ids | sort > $tmpdata/dtet.ids
cat $sdata/text | awk '{print $1}' | sort > $tmpdata/all.ids
diff $tmpdata/all.ids $tmpdata/dtet.ids | awk '/^</{print $2}' | sort > $tmpdata/tr.ids
echo "finished text extraction for dev set #utt = `wc -l $tmpdata/tr.ids | awk '{print $1}'`"

reduce_data_dir.sh $sdata $tmpdata/dt.ids $dtdata
reduce_data_dir.sh $sdata $tmpdata/et.ids $etdata
reduce_data_dir.sh $sdata $tmpdata/tr.ids $trdata

utils/fix_data_dir.sh $dtdata
utils/fix_data_dir.sh $etdata
utils/fix_data_dir.sh $trdata
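
The split is made over unique transcripts rather than speakers: by default 10% of the prompts go to the dev set and 10% to the eval set, and any sentence assigned to dev or eval is filtered out of the training portion, matching the voxforge-style split referenced in run.sh. A sketch of the call made in stage 0, with the percentages overridable through utils/parse_options.sh:

# lang=en shown for illustration; run.sh passes its own ${lang}.
local/split_tr_dt_et.sh --perdt 10 --peret 10 \
  data/validated_en data/valid_train_en data/valid_dev_en data/valid_test_en
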
47 changes: 28 additions & 19 deletions egs/commonvoice/asr1/run.sh
@@ -8,7 +8,7 @@

# general configuration
backend=pytorch
stage=0 # start from 0 if you need to start from data preparation
stage=-1 # start from 0 if you need to start from data preparation
stop_stage=100
ngpu=1 # number of gpus ("0" uses cpu, otherwise use gpu)
debugmode=1
@@ -32,13 +32,11 @@ lmtag= # tag for managing LMs
recog_model=model.acc.best # set a model to be used for decoding: 'model.acc.best' or 'model.loss.best'
n_average=10

# Set this to somewhere where you want to put your data, or where
# someone else has already put it. You'll want to change this
# if you're not on the CLSP grid.
datadir=/path/cv_corpus_v1
datadir=downloads # original data directory to be stored
lang=en # en de fr cy tt kab ca zh-TW it fa eu es ru

# base url for downloads.
data_url=https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz
data_url=https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/$lang.tar.gz

# bpemode (unigram or bpe)
nbpe=150
@@ -55,23 +53,34 @@ set -e
set -u
set -o pipefail

train_set=valid_train
train_dev=valid_dev
recog_set="valid_dev valid_test"
train_set=valid_train_${lang}
train_dev=valid_dev_${lang}
test_set=valid_test_${lang}
recog_set="valid_dev_${lang} valid_test_${lang}"

if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
mkdir -p ${datadir}
local/download_and_untar.sh ${datadir} ${data_url}
local/download_and_untar.sh ${datadir} ${data_url} ${lang}.tar.gz
fi

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to make data the following preparation part by yourself.
### But you can utilize Kaldi recipes in most cases
for part in valid-train valid-dev valid-test; do
for part in "validated"; do
# use underscore-separated names in data directories.
local/data_prep.pl ${datadir}/ cv-${part} data/"$(echo ${part} | tr - _)"
local/data_prep.pl ${datadir} ${part} data/"$(echo "${part}_${lang}" | tr - _)"
done

# Kaldi Version Split
# ./utils/subset_data_dir_tr_cv.sh data/validated data/valid_train data/valid_test_dev
# ./utils/subset_data_dir_tr_cv.sh --cv-spk-percent 50 data/valid_test_dev data/valid_test data/valid_dev

# ESPNet Version (same as voxforge)
# consider duplicated sentences (does not consider speaker split)
# filter out the same sentences (also same text) of test&dev set from validated set
echo data/validated_${lang} data/${train_set} data/${train_dev} data/${test_set}
local/split_tr_dt_et.sh data/validated_${lang} data/${train_set} data/${train_dev} data/${test_set}
fi

feat_tr_dir=${dumpdir}/${train_set}/delta${do_delta}; mkdir -p ${feat_tr_dir}
@@ -82,18 +91,18 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: Feature Generation"
fbankdir=fbank
# Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
for x in valid_train valid_dev valid_test; do
for x in ${train_set} ${train_dev} ${recog_set}; do
steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 4 --write_utt2num_frames true \
data/${x} exp/make_fbank/${x} ${fbankdir}
utils/fix_data_dir.sh data/${x}
done
# Remove features with too long frames in training data
max_len=3000
mv data/valid_train/utt2num_frames data/valid_train/utt2num_frames.bak
awk -v max_len=${max_len} '$2 < max_len {print $1, $2}' data/valid_train/utt2num_frames.bak > data/valid_train/utt2num_frames
utils/filter_scp.pl data/valid_train/utt2num_frames data/valid_train/utt2spk > data/valid_train/utt2spk.new
mv data/valid_train/utt2spk.new data/valid_train/utt2spk
utils/fix_data_dir.sh data/valid_train
mv data/${train_set}/utt2num_frames data/${train_set}/utt2num_frames.bak
awk -v max_len=${max_len} '$2 < max_len {print $1, $2}' data/${train_set}/utt2num_frames.bak > data/${train_set}/utt2num_frames
utils/filter_scp.pl data/${train_set}/utt2num_frames data/${train_set}/utt2spk > data/${train_set}/utt2spk.new
mv data/${train_set}/utt2spk.new data/${train_set}/utt2spk
utils/fix_data_dir.sh data/${train_set}

# compute global CMVN
compute-cmvn-stats scp:data/${train_set}/feats.scp data/${train_set}/cmvn.ark
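
Taken together, the recipe is now parameterised by ${lang}: it selects the download bundle, the data directories, and the train/dev/test set names, and stage -1 performs the download. Assuming run.sh sources utils/parse_options.sh as ESPnet recipes usually do (that part of the file is outside these hunks), a run for another language would look like:

# Hypothetical invocation; any language listed next to lang= in run.sh can be substituted.
./run.sh --lang de --stage -1 --stop_stage 100 --ngpu 1
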
