Merge pull request kaldi-asr#929 from danpovey/data_cleanup
Adding the underlying scripts for my refactored version of Vimal's da…
danpovey authored Jul 28, 2016
2 parents ce7017f + 921fe30 commit b49874e
Showing 20 changed files with 3,178 additions and 62 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -13,6 +13,10 @@
*.obj
*.pyc

# profiling files and core files
*.nvprof
core

# Compiled Dynamic libraries
*.so
*.dylib
10 changes: 4 additions & 6 deletions egs/ami/s5/local/chain/run_chain_common.sh
@@ -87,9 +87,8 @@ fi

if [ $stage -le 12 ]; then
rm -rf data/$mic/${train_set}_min${min_seg_len}_hires
steps/cleanup/combine_short_segments.py --minimum-duration $min_seg_len \
--input-data-dir data/$mic/${train_set}_hires \
--output-data-dir data/$mic/${train_set}_min${min_seg_len}_hires
utils/data/combine_short_segments.sh \
data/$mic/${train_set}_hires $min_seg_len data/$mic/${train_set}_min${min_seg_len}_hires

#extract ivectors for the new data
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 \
@@ -101,9 +100,8 @@ if [ $stage -le 12 ]; then

# combine the non-hires features for alignments/lattices
rm -rf data/$mic/${latgen_train_set}_min${min_seg_len}
steps/cleanup/combine_short_segments.py --minimum-duration $min_seg_len \
--input-data-dir data/$mic/${latgen_train_set} \
--output-data-dir data/$mic/${latgen_train_set}_min${min_seg_len}
utils/data/combine_short_segments.sh \
data/$mic/${latgen_train_set} $min_seg_len data/$mic/${latgen_train_set}_min${min_seg_len}
fi

train_set=${train_set}_min${min_seg_len}
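
The call-site change is the same in the AMI recipe above and in the aspire, librispeech and swbd recipes below: the deprecated steps/cleanup/combine_short_segments.py took named options, while utils/data/combine_short_segments.sh takes its arguments positionally as <src-data-dir> <minimum-segment-length> <dest-data-dir>. A minimal sketch of the swap (the data/train_hires directory and the 1.55-second minimum are placeholder values, not taken from this commit):

    # old, deprecated invocation with named options:
    #   steps/cleanup/combine_short_segments.py --minimum-duration $min_seg_len \
    #     --input-data-dir data/train_hires \
    #     --output-data-dir data/train_min${min_seg_len}_hires
    # new, positional invocation:
    min_seg_len=1.55   # placeholder minimum segment length, in seconds
    utils/data/combine_short_segments.sh \
      data/train_hires $min_seg_len data/train_min${min_seg_len}_hires
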
10 changes: 4 additions & 6 deletions egs/aspire/s5/local/chain/run_tdnn_7b.sh
@@ -76,9 +76,8 @@ fi

if [ $stage -le 9 ]; then
rm -rf data/train_rvb_min${min_seg_len}_hires
steps/cleanup/combine_short_segments.py --minimum-duration $min_seg_len \
--input-data-dir data/train_rvb_hires \
--output-data-dir data/train_rvb_min${min_seg_len}_hires
utils/data/combine_short_segments.sh \
data/train_rvb_hires $min_seg_len data/train_rvb_min${min_seg_len}_hires

#extract ivectors for the new data
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 \
@@ -98,9 +97,8 @@ if [ $stage -le 9 ]; then
spk_prefix="THISISUNIQUESTRING_"
utils/copy_data_dir.sh --spk-prefix "$spk_prefix" --utt-prefix "$utt_prefix" \
data/train data/train_temp_for_lats
steps/cleanup/combine_short_segments.py --minimum-duration $min_seg_len \
--input-data-dir data/train_temp_for_lats \
--output-data-dir data/train_min${min_seg_len}
utils/data/combine_short_segments.sh \
data/train_temp_for_lats $min_seg_len data/train_min${min_seg_len}
fi

if [ $stage -le 10 ]; then
10 changes: 4 additions & 6 deletions egs/librispeech/s5/local/chain/run_chain_common.sh
@@ -71,9 +71,8 @@ fi

if [ $stage -le 12 ]; then
rm -rf data/${train_set}_min${min_seg_len}_hires
steps/cleanup/combine_short_segments.py --minimum-duration $min_seg_len \
--input-data-dir data/${train_set}_hires \
--output-data-dir data/${train_set}_min${min_seg_len}_hires
utils/data/combine_short_segments.sh \
data/${train_set}_hires $min_seg_len data/${train_set}_min${min_seg_len}_hires

#extract ivectors for the new data
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 \
@@ -85,9 +84,8 @@ if [ $stage -le 12 ]; then

# combine the non-hires features for alignments/lattices
rm -rf data/${latgen_train_set}_min${min_seg_len}
steps/cleanup/combine_short_segments.py --minimum-duration $min_seg_len \
--input-data-dir data/${latgen_train_set} \
--output-data-dir data/${latgen_train_set}_min${min_seg_len}
utils/data/combine_short_segments.sh \
data/${latgen_train_set} $min_seg_len data/${latgen_train_set}_min${min_seg_len}
fi

train_set=${train_set}_min${min_seg_len}
5 changes: 2 additions & 3 deletions egs/swbd/s5c/local/chain/run_tdnn_2n.sh
@@ -190,9 +190,8 @@ if [ $stage -le 9 ]; then
# get more iVector diversity.
for s in "${suffix}" "${suffix}_hires"; do

steps/cleanup/combine_short_segments.py --minimum-duration ${min_segment_length} \
--input-data-dir data/train_nodup${s} \
--output-data-dir data/train_nodup${s}_ml${min_segment_length} \
utils/data/combine_short_segments.sh \
data/train_nodup${s} ${min_segment_length} data/train_nodup${s}_ml${min_segment_length}

steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 1 data/train_nodup${s}_ml${min_segment_length} \
data/train_nodup${s}_ml${min_segment_length}_max1
187 changes: 187 additions & 0 deletions egs/wsj/s5/steps/cleanup/clean_and_segment_data.sh
@@ -0,0 +1,187 @@
#!/bin/bash

# Copyright 2016 Vimal Manohar
# 2016 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script demonstrates how to re-segment training data, selecting only the
# "good" audio that matches the transcripts.
# The basic idea is to decode with an existing in-domain acoustic model and a
# biased language model built from the reference, and then work out the
# segmentation from a ctm-like file.

set -e -o pipefail

stage=0

cmd=run.pl
cleanup=true
nj=4
graph_opts=
segmentation_opts=

. ./path.sh
. utils/parse_options.sh


if [ $# -ne 5 ]; then
echo "Usage: $0 [options] <data> <lang> <srcdir> <dir> <cleaned-data>"
echo " This script does data cleanup to remove bad portions of transcripts and"
echo " may do other minor modifications of transcripts such as allowing repetitions"
echo " for disfluencies, and adding or removing non-scored words (by default:"
echo " words that map to 'silence phones')"
echo " Note: <srcdir> is expected to contain a GMM-based model, preferably a"
echo " SAT-trained one (see train_sat.sh)."
echo " If <srcdir> contains fMLLR transforms (trans.*) they are assumed to"
echo " be transforms corresponding to the data in <data>. If <srcdir> is for different"
echo " dataset, and you're using SAT models, you should align <data> with <srcdir>"
echo " using align_fmllr.sh, and supply that directory as <srcdir>"
echo ""
echo "e.g. $0 data/train data/lang exp/tri3 exp/tri3_cleanup data/train_cleaned"
echo "Options:"
echo " --stage <n> # stage to run from, to enable resuming from partially"
echo " # completed run (default: 0)"
echo " --cmd '$cmd' # command to submit jobs with (e.g. run.pl, queue.pl)"
echo " --nj <n> # number of parallel jobs to use in graph creation and"
echo " # decoding"
echo " --segmentation-opts 'opts' # Additional options to segment_ctm_edits.py."
echo " # Please run steps/cleanup/segment_ctm_edits.py"
echo " # without arguments to see allowed options."
echo " --graph-opts 'opts' # Additional options to make_biased_lm_graphs.sh."
echo " # Please run steps/cleanup/make_biased_lm_graphs.sh"
echo " # without arguments to see allowed options."
echo " --cleanup <true|false> # Clean up intermediate files afterward. Default true."
exit 1

fi

data=$1
lang=$2
srcdir=$3
dir=$4
data_out=$5


for f in $srcdir/{final.mdl,tree,cmvn_opts} $data/utt2spk $data/feats.scp $lang/words.txt $lang/oov.txt; do
if [ ! -f $f ]; then
echo "$0: expected file $f to exist."
exit 1
fi
done

mkdir -p $dir
cp $srcdir/final.mdl $dir
cp $srcdir/tree $dir
cp $srcdir/cmvn_opts $dir
cp $srcdir/{splice_opts,delta_opts,final.mat,final.alimdl} $dir 2>/dev/null || true


if [ $stage -le 1 ]; then
echo "$0: Building biased-language-model decoding graphs..."
steps/cleanup/make_biased_lm_graphs.sh $graph_opts \
--nj $nj --cmd "$cmd" \
$data $lang $dir
fi

if [ $stage -le 2 ]; then
echo "$0: Decoding with biased language models..."
transform_opt=
if [ -f $srcdir/trans.1 ]; then
# If srcdir contained trans.* then we assume they are fMLLR transforms for
# this data, and we use them.
transform_opt="--transform-dir $srcdir"
fi
# Note: the --beam 15.0 (vs. the default 13.0) does actually slow it
# down substantially, from about 0.35xRT to 0.7xRT on tedlium.
# I want to test at some point whether it's actually necessary to have
# this largish beam.
steps/cleanup/decode_segmentation.sh \
--beam 15.0 --nj $nj --cmd "$cmd --mem 4G" $transform_opt \
--skip-scoring true --allow-partial false \
$dir $data $dir/lats

# the following is for diagnostics, e.g. it will give us the lattice depth.
steps/diagnostic/analyze_lats.sh --cmd "$cmd" $lang $dir/lats
fi

if [ $stage -le 3 ]; then
echo "$0: Doing oracle alignment of lattices..."
steps/cleanup/lattice_oracle_align.sh \
--cmd "$decode_cmd" $data $lang $dir/lats $dir/lattice_oracle
fi


if [ $stage -le 4 ]; then
echo "$0: using default values of non-scored words..."

# At the level of this script we just hard-code it that non-scored words are
# those that map to silence phones (which is what get_non_scored_words.py
# gives us), although this could easily be made user-configurable. This list
# of non-scored words affects the behavior of several of the data-cleanup
# scripts; essentially, we view the non-scored words as negotiable when it
# comes to the reference transcript, so we'll consider changing the reference
# to match the hyp when it comes to these words.
steps/cleanup/get_non_scored_words.py $lang > $dir/non_scored_words.txt
fi

if [ $stage -le 5 ]; then
echo "$0: modifying ctm-edits file to allow repetitions [for dysfluencies] and "
echo " ... to fix reference mismatches involving non-scored words. "

$cmd $dir/log/modify_ctm_edits.log \
steps/cleanup/modify_ctm_edits.py --verbose=3 $dir/non_scored_words.txt \
$dir/lattice_oracle/ctm_edits $dir/ctm_edits.modified

echo " ... See $dir/log/modify_ctm_edits.log for details and stats, including"
echo " a list of commonly-repeated words."
fi

if [ $stage -le 6 ]; then
echo "$0: applying 'taint' markers to ctm-edits file to mark silences and"
echo " ... non-scored words that are next to errors."
$cmd $dir/log/taint_ctm_edits.log \
steps/cleanup/taint_ctm_edits.py $dir/ctm_edits.modified $dir/ctm_edits.tainted
echo "... Stats, including global cor/ins/del/sub stats, are in $dir/log/taint_ctm_edits.log."
fi


if [ $stage -le 7 ]; then
echo "$0: creating segmentation from ctm-edits file."

$cmd $dir/log/segment_ctm_edits.log \
steps/cleanup/segment_ctm_edits.py \
$segmentation_opts \
--oov-symbol-file=$lang/oov.txt \
--ctm-edits-out=$dir/ctm_edits.segmented \
--word-stats-out=$dir/word_stats.txt \
$dir/non_scored_words.txt \
$dir/ctm_edits.tainted $dir/text $dir/segments

echo "$0: for global segmentation stats, including the amount of data retained at various processing stages,"
echo " ... see $dir/log/segment_ctm_edits.log"
echo "For word-level statistics on p(not-being-in-a-segment), with 'worst' words at the top,"
echo "see $dir/word_stats.txt"
echo "For detailed utterance-level debugging information, see $dir/ctm_edits.segmented"
fi


if [ $stage -le 8 ]; then
echo "$0: based on the segments and text file in $dir/segments and $dir/text, creating new data-dir in $data_out"
utils/data/subsegment_data_dir.sh ${data} $dir/segments $dir/text $data_out
fi

if [ $stage -le 9 ]; then
echo "$0: recomputing CMVN stats for the new data"
# Caution: this script puts the CMVN stats in $data_out/data,
# e.g. data/train_cleaned/data. This is not the general pattern we use.
steps/compute_cmvn_stats.sh $data_out $data_out/log $data_out/data
fi

if $cleanup; then
echo "$0: cleaning up intermediate files"
rm -r $dir/fsts $dir/HCLG.fsts.scp
rm -r $dir/lats/lat.*.gz $dir/lats/split_fsts
rm $dir/lattice_oracle/lat.*.gz
fi

echo "$0: done."
9 changes: 7 additions & 2 deletions egs/wsj/s5/steps/cleanup/combine_short_segments.py
@@ -3,6 +3,7 @@
# Copyright 2016 Vijayaditya Peddinti
# Apache 2.0

from __future__ import print_function
import argparse
import sys
import os
@@ -15,10 +16,12 @@
def GetArgs():
# we add compulsory arguments as named arguments for readability
parser = argparse.ArgumentParser(description="""
**Warning, this script is deprecated. Please use utils/data/combine_short_segments.sh**
This script concatenates segments in the input_data_dir to ensure that"""
" the segments in the output_data_dir have a specified minimum length.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)


parser.add_argument("--minimum-duration", type=float, required = True,
help="Minimum duration of the segments in the output directory")
parser.add_argument("--input-data-dir", type=str, required = True)
@@ -263,7 +266,8 @@ def CombineSegments(input_dir, output_dir, minimum_duration):
if not cur_utt_dur >= minimum_duration:
# this is a rare occurrence, better make the user aware of this
# situation and let them deal with it
warnings.warn('Speaker {0} does not have enough utterances to satisfy the minimum duration constraint. Not modifying these utterances'.format(speaker))
warnings.warn('Speaker {0} does not have enough utterances to satisfy the minimum duration '
'constraint. Not modifying these utterances'.format(speaker))
utt_index = utt_index + 1
continue
combined_duration = 0
@@ -292,6 +296,8 @@ def CombineSegments(input_dir, output_dir, minimum_duration):
WriteCombinedDirFiles(output_dir, utt2spk, spk2utt, text, feat, utt2dur, utt2uniq)

def Main():
print("""steps/cleanup/combine_short_segments.py: warning: this script is deprecated and will be removed.
Please use utils/data/combine_short_segments.sh""", file = sys.stderr)
args = GetArgs()

CheckFiles(args.input_data_dir)
@@ -311,4 +317,3 @@ def Main():
if __name__ == "__main__":
Main()

