Minor, mostly cosmetic updates to scripts

fanskyer · Jul 28, 2016 · ce7017f
1 parent 6d3de59
commit ce7017f
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 29 deletions.
diff --git a/egs/wsj/s5/utils/format_lm.sh b/egs/wsj/s5/utils/format_lm.sh
@@ -34,7 +34,7 @@ mkdir -p $out_dir
 
 echo "Converting '$lm' to FST"
 
-for f in phones.txt words.txt L.fst L_disambig.fst phones/; do
+for f in phones.txt words.txt L.fst L_disambig.fst phones/ oov.int oov.txt; do
   cp -r $lang_dir/$f $out_dir
 done
 

diff --git a/egs/wsj/s5/utils/perturb_data_dir_speed.sh b/egs/wsj/s5/utils/perturb_data_dir_speed.sh
@@ -40,6 +40,11 @@ if [ ! -f $srcdir/utt2spk ]; then
   exit 1;
 fi
 
+if [ "$destdir" == "$srcdir" ]; then
+  echo "$0: this script requires <srcdir> and <destdir> to be different."
+  exit 1
+fi
+
 set -e;
 set -o pipefail
 

diff --git a/egs/wsj/s5/utils/scoring/wer_ops_details.pl b/egs/wsj/s5/utils/scoring/wer_ops_details.pl
@@ -15,13 +15,13 @@
 # limitations under the License.
 
 
-# These scripts are (or can be) used by scoring scripts to generate 
-# additional information (such as per-spk wer, per-sentence alignments and so on) 
-# during the scoring. See the wsj/local/score.sh script for example how 
+# These scripts are (or can be) used by scoring scripts to generate
+# additional information (such as per-spk wer, per-sentence alignments and so on)
+# during the scoring. See the wsj/local/score.sh script for example how
 # the scripts are used
-# For help and instructions about usage, see the bottom of this file, 
+# For help and instructions about usage, see the bottom of this file,
 # or call it with the parameter --help
- 
+
 use strict;
 use warnings;
 use utf8;
@@ -68,24 +68,24 @@ sub max {
   chomp;
   my @entries = split(" ", $_);
   next if  @entries < 2;
-  next if  ($entries[1] ne "hyp") and ($entries[1] ne "ref") ; 
+  next if  ($entries[1] ne "hyp") and ($entries[1] ne "ref") ;
   if (scalar @entries <= 2 ) {
     print STDERR "Warning: skipping entry \"$_\", either an  empty phrase or incompatible format\n" ;
     next;
   }
 
-  die "The input stream contains duplicate entry $entries[0] $entries[1]\n" 
+  die "The input stream contains duplicate entry $entries[0] $entries[1]\n"
     if exists $UTT{$entries[0]}->{$entries[1]};
   push @{$UTT{$entries[0]}->{$entries[1]}}, @entries[2..$#entries];
   #print join(" ", @{$UTT{$entries[0]}->{$entries[1]}}) . "\n";
   #print $_ . "\n";
 }
 
 for my $utterance( sort (keys %UTT) ) {
-  
-  die "The input stream does not contain entry \"hyp\" for utterance $utterance\n" 
+
+  die "The input stream does not contain entry \"hyp\" for utterance $utterance\n"
     unless exists $UTT{$utterance}->{"hyp"};
-  die "The input stream does not contain entry \"ref\" for utterance $utterance\n" 
+  die "The input stream does not contain entry \"ref\" for utterance $utterance\n"
     unless exists $UTT{$utterance}->{"ref"};
 
   my $hyp = $UTT{$utterance}->{"hyp"};
@@ -109,16 +109,15 @@ sub max {
       ;
     }
     $word_len = $q > $word_len ? $q : $word_len ;
-    
+
     my $d = length(sprintf("%d", $EDIT_OPS{$refw}->{$hypw}));
     $ops_len =  $d > $ops_len ? $d: $ops_len ;
   }
 }
 
-print STDERR "Determined max length of string: $word_len\n";
-print STDERR "Determined max length of number: $ops_len\n";
 if ($word_len > $max_size) {
-  print STDERR "Warning: we are limiting the width to $max_size\n";
+  ## We used to warn about this, but it was just confusing-- dan.
+  ## print STDERR "wer_ops_details.pl [info; affects only whitespace]: we are limiting the width to $max_size, max word len was $word_len\n";
   $word_len = $max_size
 };
 
@@ -143,15 +142,15 @@ =head1 NAME
 
 =head1 SYNOPSIS
 
-  wer_per_spk_details.pl 
-  
+  wer_ops_details.pl
+
   Options:
-    --special-symbol        special symbol used in align-text to denote empty word 
+    --special-symbol        special symbol used in align-text to denote empty word
                             in case insertion or deletion ("<eps>" by default)
     --help                  Print this help
 
 ==head1 DESCRIPTION
-  The program generates global statistic on how many time was each word 
+  The program generates global statistics on how many times each word was
   recognized correctly, confused as another word, incorrectly deleted or inserted.
   The output will contain similar info as the sclite dtl file, the format is,
   however, completely different.
@@ -175,7 +174,7 @@ =head1 SYNOPSIS
   Note:
     The input can contain other lines as well -- those will be ignored during
     reading the input. I.E. this is a completely legal input:
-      
+
       UTT-A ref  word-A   <eps>  word-B  word-C  word-D  word-E
       UTT-A hyp  word-A  word-A  word-B   <eps>  word-D  word-X
       UTT-A op      C       I       C       D       C       S

diff --git a/egs/wsj/s5/utils/subset_data_dir.sh b/egs/wsj/s5/utils/subset_data_dir.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2010-2011  Microsoft Corporation 
+# Copyright 2010-2011  Microsoft Corporation
 #           2012-2013  Johns Hopkins University (Author: Daniel Povey)
 # Apache 2.0
 
@@ -27,8 +27,9 @@
 
 # If you give the --last option, it will just give you the n last utterances.
 
-# If you give the --spk-list option, it reads the speakers to keep from <speaker-list-file>"
-# (note, in this case there is no <num-utt> positional parameter; see usage message.)
+# If you give the --spk-list or --utt-list option, it reads the
+# speakers/utterances to keep from <speaker-list-file>/<utt-list-file> (note,
+# in this case there is no <num-utt> positional parameter; see usage message.)
 
 
 shortest=false
@@ -97,7 +98,7 @@ fi
 export LC_ALL=C
 
 if [ ! -f $srcdir/utt2spk ]; then
-  echo "subset_data_dir.sh: no such file $srcdir/utt2spk" 
+  echo "subset_data_dir.sh: no such file $srcdir/utt2spk"
   exit 1;
 fi
 
@@ -120,10 +121,10 @@ function do_filtering {
      [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp
      [ -f $srcdir/reco2file_and_channel ] && \
        utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
-     
+
      # Filter the STM file for proper sclite scoring (this will also remove the comments lines)
      [ -f $srcdir/stm ] && utils/filter_scp.pl $destdir/reco < $srcdir/stm > $destdir/stm
-     
+
      rm $destdir/reco
   fi
   srcutts=`cat $srcdir/utt2spk | wc -l`
@@ -150,11 +151,11 @@ elif $speakers; then
     sort > $destdir/spk2utt
   utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
   do_filtering; # bash function.
-  exit 0;  
+  exit 0;
 elif $perspk; then
   mkdir -p $destdir
   awk '{ n='$numutt'; printf("%s ",$1); skip=1; while(n*(skip+1) <= NF-1) { skip++; }
-         for(x=2; x<=NF && x <= n*skip; x += skip) { printf("%s ", $x); } 
+         for(x=2; x<=NF && x <= n*skip; x += skip) { printf("%s ", $x); }
          printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt
   utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
   do_filtering; # bash function.
@@ -163,7 +164,7 @@ else
   if [ $numutt -gt `cat $srcdir/utt2spk | wc -l` ]; then
     echo "subset_data_dir.sh: cannot subset to more utterances than you originally had."
     exit 1;
-  fi 
+  fi
   mkdir -p $destdir || exit 1;
 
   ## scripting note: $shortest evaluates to true or false
-Original file line number
+Diff line change
@@ Expand Up / @@ -40,6 +40,11 @@ if [ ! -f $srcdir/utt2spk ]; then @@
       exit 1;
     fi
+    if [ "$destdir" == "$srcdir" ]; then
+      echo "$0: this script requires <srcdir> and <destdir> to be different."
+      exit 1
+    fi
     set -e;
     set -o pipefail
@@ Expand Down @@