Skip to content

Commit

Permalink
Merge pull request kaldi-asr#581 from cmusphinx/master
Browse files Browse the repository at this point in the history
More accurate language folder check for phonetic langs
  • Loading branch information
danpovey committed Mar 23, 2016
2 parents 80dffc9 + 177346e commit 843d72d
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 22 deletions.
6 changes: 4 additions & 2 deletions egs/wsj/s5/utils/make_phone_bigram_lang.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ rm $lang_out/phones/wdisambig* 2>/dev/null # ditto this.
# Create each disambig/wdisambig file under $lang_out/phones as an empty
# file (truncating it if it already exists).  This relies on `echo -n`
# producing no output, as it does in bash.
echo -n > $lang_out/phones/disambig.txt
echo -n > $lang_out/phones/disambig.int
echo -n > $lang_out/phones/disambig.csl
echo -n > $lang_out/phones/wdisambig.txt
echo -n > $lang_out/phones/wdisambig_phones.int
echo -n > $lang_out/phones/wdisambig_words.int

# Let OOV symbol be the first phone. This is arbitrary; it's just
# so that validate_lang.pl succeeds. We should never actually use
Expand Down Expand Up @@ -117,5 +120,4 @@ utils/sym2int.pl $lang_out/phones.txt <$lang_out/phones/align_lexicon.txt >$lang
# L and L_disambig are the same.
cp $lang_out/L.fst $lang_out/L_disambig.fst

utils/validate_lang.pl $lang_out || exit 1;
echo "$0: ignore warnings RE disambiguation symbols from validate_lang.pl (these are expected)"
utils/validate_lang.pl --skip-disambig-check $lang_out || exit 1;
49 changes: 29 additions & 20 deletions egs/wsj/s5/utils/validate_lang.pl
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,24 @@


# Command-line option flags.  Both default to off; each is switched on by
# the matching option below and read by the checks later in the script.
$skip_det_check = 0;
$skip_disambig_check = 0;

# Consume leading options.  They may be given in any order and are removed
# from @ARGV as they are recognized; parsing stops at the first argument
# that is not one of the known options.
while (@ARGV > 0) {
  if ($ARGV[0] eq "--skip-determinization-check") {
    $skip_det_check = 1;
  } elsif ($ARGV[0] eq "--skip-disambig-check") {
    $skip_disambig_check = 1;
  } else {
    last;  # not a recognized option: leave it for the positional check
  }
  shift @ARGV;
}

# After option parsing exactly one argument (the lang directory) must
# remain; otherwise print the usage message and exit with status 1.
if (@ARGV != 1) {
  print "Usage: $0 [options] <lang_directory>\n";
  print "e.g.: $0 data/lang\n";
  print "Options:\n";
  print " --skip-determinization-check (this flag causes it to skip a time consuming check).\n";
  print " --skip-disambig-check (this flag causes it to skip a disambig check in phone bigram models).\n";
  exit(1);
}

Expand Down Expand Up @@ -279,7 +286,7 @@ sub check_disjoint {
if (!open(N, "<$lang/phones/nonsilence.txt")) {
$exit = 1; return print "--> ERROR: fail to open $lang/phones/nonsilence.txt\n";
}
if (!open(D, "<$lang/phones/disambig.txt")) {
if (!$skip_disambig_check && !open(D, "<$lang/phones/disambig.txt")) {
$exit = 1; return print "--> ERROR: fail to open $lang/phones/disambig.txt\n";
}

Expand Down Expand Up @@ -374,7 +381,7 @@ sub check_summation {
if (scalar(keys %nonsilence) == 0) {
$exit = 1; return print "--> ERROR: $lang/phones/nonsilence.txt is empty or does not exist\n";
}
if (scalar(keys %disambig) == 0) {
if (!$skip_disambig_check && scalar(keys %disambig) == 0) {
$warning = 1; print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist\n";
}

Expand Down Expand Up @@ -419,8 +426,11 @@ sub check_summation {
check_disjoint; print "\n";
check_summation; print "\n";

@list1 = ("context_indep", "disambig", "nonsilence", "silence", "optional_silence");
@list1 = ("context_indep", "nonsilence", "silence", "optional_silence");
@list2 = ("roots", "sets");
if (!$skip_disambig_check) {
push(@list1, "disambig");
}
foreach (@list1) {
check_txt_int_csl("$lang/phones/$_", \%psymtab); print "\n";
}
Expand All @@ -431,10 +441,7 @@ sub check_summation {
check_txt_int("$lang/phones/extra_questions", \%psymtab, 0); print "\n";
} else {
print "Checking $lang/phones/extra_questions.\{txt, int\} ...\n";
if ((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int")) {
print "--> WARNING: the optional $lang/phones/extra_questions.\{txt, int\} are empty!\n\n";
$warning = 1;
} else {
if (!((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int"))) {
print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)\n\n";
$exit = 1;
}
Expand Down Expand Up @@ -468,19 +475,21 @@ sub check_summation {
$success == 0 || print "--> $lang/phones/optional_silence.txt is OK\n";
print "\n";

# Check disambiguation symbols -------------------------------
print "Checking disambiguation symbols: #0 and #1\n";
if (scalar(keys %disambig) == 0) {
$warning = 1; print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist\n";
}
if (exists $disambig{"#0"} and exists $disambig{"#1"}) {
print "--> $lang/phones/disambig.txt has \"#0\" and \"#1\"\n";
print "--> $lang/phones/disambig.txt is OK\n\n";
} else {
print "--> WARNING: $lang/phones/disambig.txt doesn't have \"#0\" or \"#1\";\n";
print "--> this would not be OK with a conventional ARPA-type language\n";
print "--> model or a conventional lexicon (L.fst)\n";
$warning = 1;
if (!$skip_disambig_check) {
# Check disambiguation symbols -------------------------------
print "Checking disambiguation symbols: #0 and #1\n";
if (scalar(keys %disambig) == 0) {
$warning = 1; print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist\n";
}
if (exists $disambig{"#0"} and exists $disambig{"#1"}) {
print "--> $lang/phones/disambig.txt has \"#0\" and \"#1\"\n";
print "--> $lang/phones/disambig.txt is OK\n\n";
} else {
print "--> WARNING: $lang/phones/disambig.txt doesn't have \"#0\" or \"#1\";\n";
print "--> this would not be OK with a conventional ARPA-type language\n";
print "--> model or a conventional lexicon (L.fst)\n";
$warning = 1;
}
}


Expand Down

0 comments on commit 843d72d

Please sign in to comment.