Skip to content

Commit

Permalink
Minor, mostly cosmetic updates to scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
danpovey committed Jul 28, 2016
1 parent 6d3de59 commit ce7017f
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 29 deletions.
2 changes: 1 addition & 1 deletion egs/wsj/s5/utils/format_lm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ mkdir -p $out_dir

echo "Converting '$lm' to FST"

for f in phones.txt words.txt L.fst L_disambig.fst phones/; do
for f in phones.txt words.txt L.fst L_disambig.fst phones/ oov.int oov.txt; do
cp -r $lang_dir/$f $out_dir
done

Expand Down
5 changes: 5 additions & 0 deletions egs/wsj/s5/utils/perturb_data_dir_speed.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ if [ ! -f $srcdir/utt2spk ]; then
exit 1;
fi

if [ "$destdir" == "$srcdir" ]; then
echo "$0: this script requires <srcdir> and <destdir> to be different."
exit 1
fi

set -e;
set -o pipefail

Expand Down
37 changes: 18 additions & 19 deletions egs/wsj/s5/utils/scoring/wer_ops_details.pl
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@
# limitations under the License.


# These scripts are (or can be) used by scoring scripts to generate
# additional information (such as per-spk wer, per-sentence alignments and so on)
# during the scoring. See the wsj/local/score.sh script for example how
# These scripts are (or can be) used by scoring scripts to generate
# additional information (such as per-spk wer, per-sentence alignments and so on)
# during the scoring. See the wsj/local/score.sh script for an example of how
# the scripts are used.
# For help and instructions about usage, see the bottom of this file,
# For help and instructions about usage, see the bottom of this file,
# or call it with the parameter --help

use strict;
use warnings;
use utf8;
Expand Down Expand Up @@ -68,24 +68,24 @@ sub max {
chomp;
my @entries = split(" ", $_);
next if @entries < 2;
next if ($entries[1] ne "hyp") and ($entries[1] ne "ref") ;
next if ($entries[1] ne "hyp") and ($entries[1] ne "ref") ;
if (scalar @entries <= 2 ) {
print STDERR "Warning: skipping entry \"$_\", either an empty phrase or incompatible format\n" ;
next;
}

die "The input stream contains duplicate entry $entries[0] $entries[1]\n"
die "The input stream contains duplicate entry $entries[0] $entries[1]\n"
if exists $UTT{$entries[0]}->{$entries[1]};
push @{$UTT{$entries[0]}->{$entries[1]}}, @entries[2..$#entries];
#print join(" ", @{$UTT{$entries[0]}->{$entries[1]}}) . "\n";
#print $_ . "\n";
}

for my $utterance( sort (keys %UTT) ) {
die "The input stream does not contain entry \"hyp\" for utterance $utterance\n"

die "The input stream does not contain entry \"hyp\" for utterance $utterance\n"
unless exists $UTT{$utterance}->{"hyp"};
die "The input stream does not contain entry \"ref\" for utterance $utterance\n"
die "The input stream does not contain entry \"ref\" for utterance $utterance\n"
unless exists $UTT{$utterance}->{"ref"};

my $hyp = $UTT{$utterance}->{"hyp"};
Expand All @@ -109,16 +109,15 @@ sub max {
;
}
$word_len = $q > $word_len ? $q : $word_len ;

my $d = length(sprintf("%d", $EDIT_OPS{$refw}->{$hypw}));
$ops_len = $d > $ops_len ? $d: $ops_len ;
}
}

print STDERR "Determined max length of string: $word_len\n";
print STDERR "Determined max length of number: $ops_len\n";
if ($word_len > $max_size) {
print STDERR "Warning: we are limiting the width to $max_size\n";
## We used to warn about this, but it was just confusing-- dan.
## print STDERR "wer_ops_details.pl [info; affects only whitespace]: we are limiting the width to $max_size, max word len was $word_len\n";
$word_len = $max_size
};

Expand All @@ -143,15 +142,15 @@ =head1 NAME
=head1 SYNOPSIS
wer_per_spk_details.pl
wer_ops_details.pl
Options:
--special-symbol special symbol used in align-text to denote empty word
--special-symbol special symbol used in align-text to denote empty word
in case insertion or deletion ("<eps>" by default)
--help Print this help
=head1 DESCRIPTION
The program generates global statistic on how many time was each word
The program generates global statistics on how many times each word was
recognized correctly, confused with another word, incorrectly deleted or inserted.
The output will contain similar info as the sclite dtl file, the format is,
however, completely different.
Expand All @@ -175,7 +174,7 @@ =head1 SYNOPSIS
Note:
The input can contain other lines as well -- those will be ignored during
reading the input. I.E. this is a completely legal input:
UTT-A ref word-A <eps> word-B word-C word-D word-E
UTT-A hyp word-A word-A word-B <eps> word-D word-X
UTT-A op C I C D C S
Expand Down
19 changes: 10 additions & 9 deletions egs/wsj/s5/utils/subset_data_dir.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation
# Copyright 2010-2011 Microsoft Corporation
# 2012-2013 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

Expand Down Expand Up @@ -27,8 +27,9 @@

# If you give the --last option, it will just give you the n last utterances.

# If you give the --spk-list option, it reads the speakers to keep from <speaker-list-file>"
# (note, in this case there is no <num-utt> positional parameter; see usage message.)
# If you give the --spk-list or --utt-list option, it reads the
# speakers/utterances to keep from <speaker-list-file>/<utt-list-file> (note,
# in this case there is no <num-utt> positional parameter; see usage message).


shortest=false
Expand Down Expand Up @@ -97,7 +98,7 @@ fi
export LC_ALL=C

if [ ! -f $srcdir/utt2spk ]; then
echo "subset_data_dir.sh: no such file $srcdir/utt2spk"
echo "subset_data_dir.sh: no such file $srcdir/utt2spk"
exit 1;
fi

Expand All @@ -120,10 +121,10 @@ function do_filtering {
[ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp
[ -f $srcdir/reco2file_and_channel ] && \
utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel

# Filter the STM file for proper sclite scoring (this will also remove the comments lines)
[ -f $srcdir/stm ] && utils/filter_scp.pl $destdir/reco < $srcdir/stm > $destdir/stm

rm $destdir/reco
fi
srcutts=`cat $srcdir/utt2spk | wc -l`
Expand All @@ -150,11 +151,11 @@ elif $speakers; then
sort > $destdir/spk2utt
utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
do_filtering; # bash function.
exit 0;
exit 0;
elif $perspk; then
mkdir -p $destdir
awk '{ n='$numutt'; printf("%s ",$1); skip=1; while(n*(skip+1) <= NF-1) { skip++; }
for(x=2; x<=NF && x <= n*skip; x += skip) { printf("%s ", $x); }
for(x=2; x<=NF && x <= n*skip; x += skip) { printf("%s ", $x); }
printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt
utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
do_filtering; # bash function.
Expand All @@ -163,7 +164,7 @@ else
if [ $numutt -gt `cat $srcdir/utt2spk | wc -l` ]; then
echo "subset_data_dir.sh: cannot subset to more utterances than you originally had."
exit 1;
fi
fi
mkdir -p $destdir || exit 1;

## scripting note: $shortest evaluates to true or false
Expand Down

0 comments on commit ce7017f

Please sign in to comment.