[scripts] wenetspeech recipes (kaldi-asr#4647)
* wenetspeech recipes

* small fix
LvHang authored Nov 29, 2021
1 parent 6e03a3f commit 5cd9c1e
Showing 28 changed files with 2,515 additions and 0 deletions.
111 changes: 111 additions & 0 deletions egs/wenetspeech/s5/RESULTS
@@ -0,0 +1,111 @@
#!/bin/bash

#for part in train_l; do
#  for testset in dev test_net test_meeting test_aishell1; do
#    for model in mono tri1a tri1b tri2a tri3a tri3b 1a 1b 1c 1d; do
#      grep WER exp/$part/$model/decode_$testset/scoring_kaldi/best_cer
#      grep WER exp/$part/chain_cleaned/cnn_tdnn_${model}_sp/decode_${testset}_rnnlm/scoring_kaldi/best_cer
#    done
#  done
#done
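# For a single model/test-set pair the same lookup can be done directly, e.g.
# (a sketch using one of the paths from the tables below):
#   grep WER exp/train_l/chain_cleaned/cnn_tdnn_1c_sp/decode_dev_rnnlm/scoring_kaldi/best_cer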

# GMM model trained on "train_s" dataset
%WER 64.63 [ 212208 / 328341, 4623 ins, 34363 del, 173222 sub ] exp/train_s/mono/decode_dev/cer_7_0.5
%WER 41.22 [ 135341 / 328341, 4831 ins, 20064 del, 110446 sub ] exp/train_s/tri1a/decode_dev/cer_9_0.5
%WER 38.58 [ 126661 / 328341, 4797 ins, 19021 del, 102843 sub ] exp/train_s/tri1b/decode_dev/cer_10_0.5
%WER 33.44 [ 109802 / 328341, 4245 ins, 17674 del, 87883 sub ] exp/train_s/tri2a/decode_dev/cer_10_0.5
%WER 32.61 [ 107082 / 328341, 4278 ins, 17269 del, 85535 sub ] exp/train_s/tri3a/decode_dev/cer_10_0.5
%WER 32.23 [ 105811 / 328341, 4295 ins, 17348 del, 84168 sub ] exp/train_s/tri3b/decode_dev/cer_10_1.0

%WER 73.82 [ 305897 / 414409, 6551 ins, 61355 del, 237991 sub ] exp/train_s/mono/decode_test_net/cer_6_0.5
%WER 54.12 [ 224283 / 414409, 7416 ins, 41925 del, 174942 sub ] exp/train_s/tri1a/decode_test_net/cer_8_0.0
%WER 51.68 [ 214182 / 414409, 7105 ins, 40864 del, 166213 sub ] exp/train_s/tri1b/decode_test_net/cer_8_0.5
%WER 46.48 [ 192598 / 414409, 6324 ins, 41208 del, 145066 sub ] exp/train_s/tri2a/decode_test_net/cer_9_0.0
%WER 45.31 [ 187763 / 414409, 6026 ins, 41045 del, 140692 sub ] exp/train_s/tri3a/decode_test_net/cer_9_0.0
%WER 45.06 [ 186716 / 414409, 6580 ins, 40458 del, 139678 sub ] exp/train_s/tri3b/decode_test_net/cer_9_0.0


%WER 93.37 [ 205753 / 220360, 1100 ins, 93906 del, 110747 sub ] exp/train_s/mono/decode_test_meeting/cer_3_1.0
%WER 84.03 [ 185167 / 220360, 1854 ins, 81426 del, 101887 sub ] exp/train_s/tri1a/decode_test_meeting/cer_6_1.0
%WER 83.06 [ 183027 / 220360, 2121 ins, 78520 del, 102386 sub ] exp/train_s/tri1b/decode_test_meeting/cer_6_1.0
%WER 79.25 [ 174642 / 220360, 2251 ins, 75982 del, 96409 sub ] exp/train_s/tri2a/decode_test_meeting/cer_6_1.0
%WER 79.15 [ 174410 / 220360, 2210 ins, 78861 del, 93339 sub ] exp/train_s/tri3a/decode_test_meeting/cer_5_1.0
%WER 78.79 [ 173629 / 220360, 2466 ins, 76857 del, 94306 sub ] exp/train_s/tri3b/decode_test_meeting/cer_5_1.0

# DNN model trained on "train_s" dataset with 4 epochs (1c) and 10 epochs (1d)
%WER 13.28 [ 43599 / 328341, 2197 ins, 10108 del, 31294 sub ] exp/train_s/chain_cleaned/cnn_tdnn_1c_sp/decode_dev_rnnlm/cer_8_0.0
%WER 11.70 [ 38425 / 328341, 2145 ins, 8988 del, 27292 sub ] exp/train_s/chain_cleaned/cnn_tdnn_1d_sp/decode_dev_rnnlm/cer_8_0.0

%WER 20.35 [ 84329 / 414409, 3023 ins, 21600 del, 59706 sub ] exp/train_s/chain_cleaned/cnn_tdnn_1c_sp/decode_test_net_rnnlm/cer_8_0.0
%WER 17.43 [ 72230 / 414409, 2616 ins, 20376 del, 49238 sub ] exp/train_s/chain_cleaned/cnn_tdnn_1d_sp/decode_test_net_rnnlm/cer_9_0.0

%WER 45.06 [ 99301 / 220360, 2011 ins, 46770 del, 50520 sub ] exp/train_s/chain_cleaned/cnn_tdnn_1c_sp/decode_test_meeting_rnnlm/cer_6_0.0
%WER 37.27 [ 82129 / 220360, 2032 ins, 36563 del, 43534 sub ] exp/train_s/chain_cleaned/cnn_tdnn_1d_sp/decode_test_meeting_rnnlm/cer_7_0.0

%WER 8.73 [ 9151 / 104765, 427 ins, 784 del, 7940 sub ] exp/train_s/chain_cleaned/cnn_tdnn_1c_sp/decode_test_aishell1_rnnlm/cer_9_0.5
%WER 7.66 [ 8029 / 104765, 459 ins, 767 del, 6803 sub ] exp/train_s/chain_cleaned/cnn_tdnn_1d_sp/decode_test_aishell1_rnnlm/cer_10_0.0

# GMM model trained on "train_m" dataset
%WER 67.25 [ 220817 / 328341, 4778 ins, 36049 del, 179990 sub ] exp/train_m/mono/decode_dev/cer_7_0.5
%WER 41.10 [ 134934 / 328341, 4925 ins, 20022 del, 109987 sub ] exp/train_m/tri1a/decode_dev/cer_9_0.5
%WER 38.28 [ 125676 / 328341, 4904 ins, 19116 del, 101656 sub ] exp/train_m/tri1b/decode_dev/cer_10_0.5
%WER 32.36 [ 106238 / 328341, 4682 ins, 16246 del, 85310 sub ] exp/train_m/tri2a/decode_dev/cer_9_0.5
%WER 31.24 [ 102564 / 328341, 4247 ins, 16735 del, 81582 sub ] exp/train_m/tri3a/decode_dev/cer_10_0.5
%WER 29.91 [ 98212 / 328341, 4483 ins, 15431 del, 78298 sub ] exp/train_m/tri3b/decode_dev/cer_10_0.5

%WER 76.58 [ 317351 / 414409, 6960 ins, 63259 del, 247132 sub ] exp/train_m/mono/decode_test_net/cer_7_0.0
%WER 54.04 [ 223962 / 414409, 7483 ins, 41451 del, 175028 sub ] exp/train_m/tri1a/decode_test_net/cer_8_0.0
%WER 51.33 [ 212729 / 414409, 7345 ins, 41089 del, 164295 sub ] exp/train_m/tri1b/decode_test_net/cer_9_0.0
%WER 45.57 [ 188862 / 414409, 6243 ins, 40543 del, 142076 sub ] exp/train_m/tri2a/decode_test_net/cer_9_0.0
%WER 44.04 [ 182488 / 414409, 6126 ins, 40500 del, 135862 sub ] exp/train_m/tri3a/decode_test_net/cer_9_0.0
%WER 42.75 [ 177165 / 414409, 6496 ins, 38199 del, 132470 sub ] exp/train_m/tri3b/decode_test_net/cer_8_0.5

%WER 93.13 [ 205227 / 220360, 1154 ins, 94004 del, 110069 sub ] exp/train_m/mono/decode_test_meeting/cer_4_1.0
%WER 83.54 [ 184095 / 220360, 1911 ins, 83017 del, 99167 sub ] exp/train_m/tri1a/decode_test_meeting/cer_6_1.0
%WER 82.37 [ 181518 / 220360, 2061 ins, 81027 del, 98430 sub ] exp/train_m/tri1b/decode_test_meeting/cer_6_1.0
%WER 78.99 [ 174063 / 220360, 2093 ins, 80921 del, 91049 sub ] exp/train_m/tri2a/decode_test_meeting/cer_6_1.0
%WER 79.45 [ 175072 / 220360, 2171 ins, 84058 del, 88843 sub ] exp/train_m/tri3a/decode_test_meeting/cer_5_1.0
%WER 78.65 [ 173307 / 220360, 2432 ins, 80967 del, 89908 sub ] exp/train_m/tri3b/decode_test_meeting/cer_5_1.0

# DNN model trained on "train_m" dataset
%WER 9.81 [ 32223 / 328341, 2071 ins, 9072 del, 21080 sub ] exp/train_m/chain_cleaned/cnn_tdnn_1c_sp/decode_dev_rnnlm/cer_8_0.0
%WER 14.19 [ 58824 / 414409, 2739 ins, 15110 del, 40975 sub ] exp/train_m/chain_cleaned/cnn_tdnn_1c_sp/decode_test_net_rnnlm/cer_8_0.0
%WER 28.22 [ 62178 / 220360, 2256 ins, 26453 del, 33469 sub ] exp/train_m/chain_cleaned/cnn_tdnn_1c_sp/decode_test_meeting_rnnlm/cer_7_0.0
%WER 5.93 [ 6217 / 104765, 317 ins, 542 del, 5358 sub ] exp/train_m/chain_cleaned/cnn_tdnn_1c_sp/decode_test_aishell1_rnnlm/cer_10_0.0

# GMM model trained on "train_l" dataset
%WER 70.87 [ 232694 / 328341, 4723 ins, 39336 del, 188635 sub ] exp/train_l/mono/decode_dev/cer_8_0.5
%WER 41.02 [ 134678 / 328341, 5131 ins, 20003 del, 109544 sub ] exp/train_l/tri1a/decode_dev/cer_9_0.5
%WER 37.94 [ 124557 / 328341, 4802 ins, 19239 del, 100516 sub ] exp/train_l/tri1b/decode_dev/cer_10_0.5
%WER 32.21 [ 105757 / 328341, 4143 ins, 17475 del, 84139 sub ] exp/train_l/tri2a/decode_dev/cer_10_0.5
%WER 31.00 [ 101795 / 328341, 4521 ins, 15858 del, 81416 sub ] exp/train_l/tri3a/decode_dev/cer_9_0.5
%WER 29.62 [ 97271 / 328341, 4505 ins, 15765 del, 77001 sub ] exp/train_l/tri3b/decode_dev/cer_10_1.0

%WER 79.54 [ 329635 / 414409, 7714 ins, 61459 del, 260462 sub ] exp/train_l/mono/decode_test_net/cer_7_0.0
%WER 54.07 [ 224090 / 414409, 7677 ins, 41520 del, 174893 sub ] exp/train_l/tri1a/decode_test_net/cer_8_0.0
%WER 51.10 [ 211745 / 414409, 7229 ins, 40413 del, 164103 sub ] exp/train_l/tri1b/decode_test_net/cer_8_0.5
%WER 45.46 [ 188382 / 414409, 6299 ins, 40447 del, 141636 sub ] exp/train_l/tri2a/decode_test_net/cer_9_0.0
%WER 44.01 [ 182394 / 414409, 6798 ins, 38434 del, 137162 sub ] exp/train_l/tri3a/decode_test_net/cer_8_0.0
%WER 42.57 [ 176416 / 414409, 6446 ins, 38199 del, 131771 sub ] exp/train_l/tri3b/decode_test_net/cer_8_0.5

%WER 93.76 [ 206601 / 220360, 1354 ins, 86016 del, 119231 sub ] exp/train_l/mono/decode_test_meeting/cer_5_0.5
%WER 83.78 [ 184618 / 220360, 2014 ins, 80637 del, 101967 sub ] exp/train_l/tri1a/decode_test_meeting/cer_6_1.0
%WER 82.89 [ 182659 / 220360, 2276 ins, 79925 del, 100458 sub ] exp/train_l/tri1b/decode_test_meeting/cer_6_1.0
%WER 79.19 [ 174503 / 220360, 2277 ins, 81784 del, 90442 sub ] exp/train_l/tri2a/decode_test_meeting/cer_6_1.0
%WER 79.43 [ 175039 / 220360, 2281 ins, 84042 del, 88716 sub ] exp/train_l/tri3a/decode_test_meeting/cer_5_0.5
%WER 78.91 [ 173876 / 220360, 2414 ins, 82553 del, 88909 sub ] exp/train_l/tri3b/decode_test_meeting/cer_5_1.0

# DNN model trained on "train_l" dataset: 1a (without SpecAugment or i-vectors), 1b (with SpecAugment), 1c (with SpecAugment and i-vectors)
%WER 9.40 [ 30871 / 328341, 2078 ins, 9113 del, 19680 sub ] exp/train_l/chain_cleaned/cnn_tdnn_1a_sp/decode_dev_rnnlm/cer_8_0.0
%WER 9.31 [ 30555 / 328341, 2325 ins, 7836 del, 20394 sub ] exp/train_l/chain_cleaned/cnn_tdnn_1b_sp/decode_dev_rnnlm/cer_7_0.0
%WER 9.07 [ 29777 / 328341, 2340 ins, 7822 del, 19615 sub ] exp/train_l/chain_cleaned/cnn_tdnn_1c_sp/decode_dev_rnnlm/cer_7_0.0

%WER 13.33 [ 55261 / 414409, 2577 ins, 15192 del, 37492 sub ] exp/train_l/chain_cleaned/cnn_tdnn_1a_sp/decode_test_net_rnnlm/cer_9_0.0
%WER 13.38 [ 55443 / 414409, 2883 ins, 13857 del, 38703 sub ] exp/train_l/chain_cleaned/cnn_tdnn_1b_sp/decode_test_net_rnnlm/cer_8_0.0
%WER 12.83 [ 53149 / 414409, 2897 ins, 13210 del, 37042 sub ] exp/train_l/chain_cleaned/cnn_tdnn_1c_sp/decode_test_net_rnnlm/cer_8_0.0

%WER 29.90 [ 65888 / 220360, 1935 ins, 30870 del, 33083 sub ] exp/train_l/chain_cleaned/cnn_tdnn_1a_sp/decode_test_meeting_rnnlm/cer_7_0.0
%WER 29.05 [ 64016 / 220360, 2184 ins, 28412 del, 33420 sub ] exp/train_l/chain_cleaned/cnn_tdnn_1b_sp/decode_test_meeting_rnnlm/cer_7_0.0
%WER 24.72 [ 54477 / 220360, 2154 ins, 23169 del, 29154 sub ] exp/train_l/chain_cleaned/cnn_tdnn_1c_sp/decode_test_meeting_rnnlm/cer_7_0.0

%WER 5.41 [ 5672 / 104765, 294 ins, 453 del, 4925 sub ] exp/train_l/chain_cleaned/cnn_tdnn_1c_sp/decode_test_aishell1_rnnlm/cer_10_0.0
16 changes: 16 additions & 0 deletions egs/wenetspeech/s5/cmd.sh
@@ -0,0 +1,16 @@
# You can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances of 'queue.pl' to 'run.pl' (but be careful, and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub), and slurm.pl works
# with Slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

export train_cmd="queue.pl --mem 5G --config conf/queue_no_k20.conf"
export decode_cmd="queue.pl --mem 10G --config conf/queue_no_k20.conf"
export egs_cmd="queue.pl --mem 10G --config conf/queue_no_k20.conf"
export mkgraph_cmd="queue.pl --mem 20G --config conf/queue_no_k20.conf"
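
# For a single machine with no queueing system, a minimal sketch of the local
# alternative described in the comments above (run.pl accepts and largely
# ignores resource options such as --mem, so memory limits are not enforced):
#
#   export train_cmd="run.pl"
#   export decode_cmd="run.pl"
#   export egs_cmd="run.pl"
#   export mkgraph_cmd="run.pl"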
1 change: 1 addition & 0 deletions egs/wenetspeech/s5/conf/decode.config
@@ -0,0 +1 @@
# empty config, just use the defaults.
1 change: 1 addition & 0 deletions egs/wenetspeech/s5/conf/mfcc.conf
@@ -0,0 +1 @@
--use-energy=false # only non-default option.
10 changes: 10 additions & 0 deletions egs/wenetspeech/s5/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
# config for high-resolution MFCC features, intended for neural network training
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because the coefficients are less
# correlated), which is why we prefer this method.
--use-energy=false # use average of log energy, not energy.
--num-mel-bins=40 # similar to Google's setup.
--num-ceps=40 # there is no dimensionality reduction.
--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so
# there might be some information at the low end.
--high-freq=-400 # high cutoff frequency, relative to the Nyquist of 8000 (=7600)
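
A sketch of how these features are typically extracted with the standard Kaldi scripts (the data directory name here is illustrative, not taken from the recipe):

  utils/copy_data_dir.sh data/train_l data/train_l_hires
  steps/make_mfcc.sh --nj 40 --mfcc-config conf/mfcc_hires.conf \
    --cmd "$train_cmd" data/train_l_hires
  steps/compute_cmvn_stats.sh data/train_l_hires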
1 change: 1 addition & 0 deletions egs/wenetspeech/s5/conf/online_cmvn.conf
@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
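
A sketch of how an empty config like this is consumed (the global-stats path is illustrative; apply-cmvn-online reads global CMVN stats and normalizes features causally):

  apply-cmvn-online --config=conf/online_cmvn.conf \
    exp/train_l/chain_cleaned/cnn_tdnn_1c_sp/global_cmvn.stats \
    scp:data/train_l_hires/feats.scp ark:-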
31 changes: 31 additions & 0 deletions egs/wenetspeech/s5/conf/online_pitch.conf
@@ -0,0 +1,31 @@
## This config is given by conf/make_pitch_online.sh to the program compute-and-process-kaldi-pitch-feats,
## and is copied by steps/online/nnet2/prepare_online_decoding.sh and similar scripts, to be given
## to programs like online2-wav-nnet2-latgen-faster.
## The program compute-and-process-kaldi-pitch-feats will use it to compute pitch features that
## are the same as those that will be generated in online decoding; this enables us to train
## in a way that's compatible with online decoding.
##

## most of these options relate to the post-processing rather than the pitch
## extraction itself.
--add-raw-log-pitch=true ## this is intended for input to neural nets, so our
## approach is "throw everything in and see what
## sticks".
--normalization-left-context=75
--normalization-right-context=50 # We're removing some of the right-context
# for the normalization. Would normally be 75.
#
# Note: our changes to the (left,right) context
# from the defaults of (75,75) to (75,50) will
# almost certainly worsen results, but will
# reduce latency.
--frames-per-chunk=10 ## relates to offline simulation of online decoding; 1
## would be equivalent to getting in samples one by
## one.
--simulate-first-pass-online=true ## this makes the online-pitch-extraction code
## output the 'first-pass' features, which
## are less accurate than the final ones, and
## which are the only features the neural-net
## decoding would ever see (since we can't
## afford to do lattice rescoring in the
## neural-net code).
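
These options can be exercised offline with the program named above; a sketch (the rspecifier/wspecifier arguments are illustrative):

  compute-and-process-kaldi-pitch-feats --config=conf/online_pitch.conf \
    scp:data/train_l/wav.scp ark,scp:pitch.ark,pitch.scp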
13 changes: 13 additions & 0 deletions egs/wenetspeech/s5/conf/queue_no_k20.conf
@@ -0,0 +1,13 @@
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0 -q all.q
option gpu=* -l gpu=$0 -q g.q
default allow_k20=true
option allow_k20=true
option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*'
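
As an illustration of how utils/queue.pl consumes such a config (a sketch; the exact expansion depends on the queue.pl version), a submission like

  queue.pl --mem 10G --gpu 1 exp/foo/log/train.log train.sh

would, under this config, add roughly the following to the qsub command:

  -l mem_free=10G,ram_free=10G -l gpu=1 -q g.q

while passing --allow-k20 false would further append -l 'hostname=!g01*&!g02*&!b06*' to keep jobs off the K20 hosts.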
82 changes: 82 additions & 0 deletions egs/wenetspeech/s5/local/chain/run_chain_common.sh
@@ -0,0 +1,82 @@
#!/usr/bin/env bash

# This script has the common stages shared across chain recipes (it originates
# from the librispeech recipe). It generates a new topology in a new lang
# directory, gets the alignments as lattices, and builds a tree for the new
# topology.
set -e

stage=11

# Input directory names. These options are actually compulsory, and are
# exposed as named options only for convenience.
gmm_dir=
ali_dir=
lores_train_data_dir=

num_leaves=6000

# Output directory names. These are also compulsory.
lang=
lat_dir=
tree_dir=
# End configuration section.
echo "$0 $@" # Print the command line for logging

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

[ -z "$lang" ] && echo "Set --lang; this specifies the new lang directory which will have the new topology" && exit 1;
[ -z "$lat_dir" ] && echo "Set --lat-dir; this specifies the experiment directory to store the lattices" && exit 1;
[ -z "$tree_dir" ] && echo "Set --tree-dir; this specifies the directory to store the new tree" && exit 1;

for f in $gmm_dir/final.mdl $ali_dir/ali.1.gz $lores_train_data_dir/feats.scp; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
done

if [ $stage -le 11 ]; then
  echo "$0: creating lang directory with one state per phone."
  # Create a version of the lang/ directory that has one state per phone in the
  # topo file. [Note: it really has two states; the first one occurs only once,
  # and the second has zero or more repeats.]
  if [ -d $lang ]; then
    if [ $lang/L.fst -nt data/lang/L.fst ]; then
      echo "$0: $lang already exists, not overwriting it; continuing"
    else
      echo "$0: $lang already exists and seems to be older than data/lang..."
      echo " ... not sure what to do.  Exiting."
      exit 1;
    fi
  else
    cp -r data/lang $lang
    silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
    nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
    # Use our special topology... note that later on we may have to tune this
    # topology.
    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
  fi
fi

if [ $stage -le 12 ]; then
  # Get the alignments as lattices (gives the chain training more freedom).
  # Use the same num-jobs as the alignments.
  nj=$(cat ${ali_dir}/num_jobs) || exit 1;
  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \
    $lang $gmm_dir $lat_dir
  rm $lat_dir/fsts.*.gz # save space
fi

if [ $stage -le 13 ]; then
  # Build a tree using our new topology. We know we have alignments for the
  # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
  # those.
  if [ -f $tree_dir/final.mdl ]; then
    echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
    exit 1;
  fi
  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
    --context-opts "--context-width=2 --central-position=1" \
    --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} $lang $ali_dir $tree_dir
fi

exit 0;
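
A hypothetical invocation from a top-level run script (a sketch; all directory names below are illustrative, not taken from the recipe):

  local/chain/run_chain_common.sh --stage 11 \
    --gmm-dir exp/train_l/tri3b \
    --ali-dir exp/train_l/tri3b_ali_sp \
    --lores-train-data-dir data/train_l_sp \
    --lang data/lang_chain \
    --lat-dir exp/train_l/tri3b_train_l_sp_lats \
    --tree-dir exp/train_l/tree_sp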
1 change: 1 addition & 0 deletions egs/wenetspeech/s5/local/chain/run_cnn_tdnn.sh
