
Commit

sandbox/language_id: Merging in trunk
git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/language_id@4569 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
david-ryan-snyder committed Oct 30, 2014
2 parents 463b062 + 2420985 commit 62087d0
Showing 655 changed files with 37,267 additions and 7,020 deletions.
8 changes: 8 additions & 0 deletions .gitignore
@@ -381,6 +381,7 @@

# /src/latbin/
/src/latbin/.depend.mk
+/src/latbin/lattice-lmrescore-const-arpa
/src/latbin/lattice-prune
/src/latbin/lattice-rmali
/src/latbin/lattice-compose
@@ -434,6 +435,9 @@
/src/lm/.depend.mk
/src/lm/lm-lib-test

+# /src/lmbin/
+src/lmbin/arpa-to-const-arpa

# /src/matrix/
/src/matrix/.depend.mk
/src/matrix/Matrix.vcxproj
@@ -821,6 +825,7 @@
/src/nnet2bin/nnet1-to-raw-nnet
/src/nnet2bin/raw-nnet-copy
/src/online2bin/apply-cmvn-online
+/src/online2bin/online2-wav-nnet2-am-compute
/src/online2bin/compress-uncompress-speex
/src/online2bin/extend-wav-with-silence
/src/online2bin/ivector-extract-online2
@@ -849,3 +854,6 @@
/src/online2bin/.depend.mk
/src/online2/.depend.mk
/src/ivector/.depend.mk
+/egs/librispeech/s5/data
+/egs/librispeech/s5/mfcc
+/egs/librispeech/s5/exp
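
The new ignore entries above track the const-ARPA language-model tools this merge pulls in from trunk (src/lmbin/arpa-to-const-arpa and src/latbin/lattice-lmrescore-const-arpa). A minimal usage sketch follows; the paths, symbol IDs, and option names mirror the later Kaldi wrapper scripts and should be treated as assumptions, not as part of this commit:

# 1) Build a const-ARPA LM from an ARPA file whose words have already been
#    mapped to integer IDs (the bos/eos/unk symbol IDs below are placeholders):
arpa-to-const-arpa --bos-symbol=1 --eos-symbol=2 --unk-symbol=3 \
  data/local/lm_big_mapped.arpa data/lang_test/G.carpa

# 2) Rescore existing lattices: first subtract the scores of the small G.fst
#    used for decoding, then add the const-ARPA scores (some setups project
#    G.fst to its output labels first):
lattice-lmrescore --lm-scale=-1.0 \
  "ark:gunzip -c exp/tri4/decode/lat.1.gz |" data/lang_test/G.fst ark:- | \
lattice-lmrescore-const-arpa --lm-scale=1.0 \
  ark:- data/lang_test/G.carpa "ark,t:|gzip -c > exp/tri4/decode_rescore/lat.1.gz"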
2 changes: 1 addition & 1 deletion egs/ami/s5/local/ami_format_data.sh
@@ -29,7 +29,7 @@ gunzip -c "$arpa_lm" | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
-fstrmepsilon > data/lang_test/G.fst
+fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst

echo "Checking how stochastic G is (the first of these numbers should be small):"
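
This one-line change recurs throughout the commit: an fstarcsort --sort_type=ilabel stage is inserted just before G.fst is written, so the grammar ends up sorted on its input labels, which is what OpenFst composition typically expects of the right-hand argument (G.fst is normally the right-hand side, as in L o G). For reference, the complete pipeline as these scripts now build it -- a condensed sketch with illustrative paths, not a verbatim copy of any one script:

gunzip -c data/local/lm.arpa.gz | \
  arpa2fst - | fstprint | \
  utils/remove_oovs.pl /dev/null | \
  utils/eps2disambig.pl | utils/s2eps.pl | \
  fstcompile --isymbols=data/lang_test/words.txt --osymbols=data/lang_test/words.txt \
    --keep_isymbols=false --keep_osymbols=false | \
  fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst   # sanity check: the first number printed should be small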
2 changes: 1 addition & 1 deletion egs/ami/s5/path.sh
@@ -8,7 +8,7 @@ KALDI_ROOT=/gpfs/scratch/s1136550/kaldi-code
KALDISRC=$KALDI_ROOT/src
KALDIBIN=$KALDISRC/bin:$KALDISRC/featbin:$KALDISRC/fgmmbin:$KALDISRC/fstbin
KALDIBIN=$KALDIBIN:$KALDISRC/gmmbin:$KALDISRC/latbin:$KALDISRC/nnetbin
-KALDIBIN=$KALDIBIN:$KALDISRC/sgmmbin:$KALDISRC/tiedbin
+KALDIBIN=$KALDIBIN:$KALDISRC/sgmmbin

FSTBIN=$KALDI_ROOT/tools/openfst/bin
LMBIN=$KALDI_ROOT/tools/irstlm/bin
2 changes: 1 addition & 1 deletion egs/aurora4/s5/local/aurora4_format_data.sh
@@ -59,7 +59,7 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
-fstrmepsilon > $test/G.fst
+fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
2 changes: 1 addition & 1 deletion egs/aurora4/s5/local/wsj_format_data.sh
@@ -59,7 +59,7 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
-fstrmepsilon > $test/G.fst
+fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
8 changes: 4 additions & 4 deletions egs/aurora4/s5/local/wsj_format_local_lms.sh
@@ -25,28 +25,28 @@ gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
-fstrmepsilon > data/lang_test_bd_tgpr/G.fst || exit 1;
+fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tgpr/G.fst

gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
-fstrmepsilon > data/lang_test_bd_tg/G.fst || exit 1;
+fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tg/G.fst

gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
-fstrmepsilon > data/lang_test_bd_fg/G.fst || exit 1;
+fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fg/G.fst

gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
-fstrmepsilon > data/lang_test_bd_fgpr/G.fst || exit 1;
+fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fgpr/G.fst

exit 0;
2 changes: 1 addition & 1 deletion egs/babel/s5/local/arpa2G.sh
@@ -46,7 +46,7 @@ gunzip -c $lmfile | \
utils/s2eps.pl | \
fstcompile --isymbols=$langdir/words.txt \
--osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
-fstrmepsilon > $destdir/G.fst || exit 1
+fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1
fstisstochastic $destdir/G.fst || true

exit 0
2 changes: 1 addition & 1 deletion egs/babel/s5/local/arpa2G_syllables.sh
@@ -44,7 +44,7 @@ gunzip -c $lmfile | \
fstcompile --isymbols=$langdir/words.txt \
--osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrhocompose "$rho" - $destdir/rho.fst | \
-fstrmepsilon > $destdir/G.fst || exit 1
+fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1

fstisstochastic $destdir/G.fst || true

2 changes: 1 addition & 1 deletion egs/babel/s5/local/prepare_kaldi_lm_from_training_text.sh
@@ -211,7 +211,7 @@ gunzip -c $gzipped_ARPA_LM | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
-fstrmepsilon > $lang/G.fst || exit 1;
+fstrmepsilon | fstarcsort --sort_type=ilabel > $lang/G.fst || exit 1;
fstisstochastic $lang/G.fst

##################################################################
2 changes: 1 addition & 1 deletion egs/babel/s5/local/wsj_format_data.sh
@@ -59,7 +59,7 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
-fstrmepsilon > $test/G.fst
+fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
8 changes: 4 additions & 4 deletions egs/babel/s5/local/wsj_format_local_lms.sh
@@ -27,28 +27,28 @@ gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
-fstrmepsilon > data/lang_test_bd_tgpr/G.fst || exit 1;
+fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tgpr/G.fst

gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
-fstrmepsilon > data/lang_test_bd_tg/G.fst || exit 1;
+fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tg/G.fst

gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
-fstrmepsilon > data/lang_test_bd_fg/G.fst || exit 1;
+fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fg/G.fst

gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
-fstrmepsilon > data/lang_test_bd_fgpr/G.fst || exit 1;
+fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fgpr/G.fst

exit 0;
116 changes: 116 additions & 0 deletions egs/babel/s5b/EXAMPLE.vietnamese
@@ -0,0 +1,116 @@
#!/bin/bash

#This is an example sequence of commands for running the default Kaldi Babel OP1 system.
#It is not really meant to be run as a script, even though you can try :)

./run-1-main.sh
./run-2a-nnet-ensemble-gpu.sh
./run-2b-bnf.sh --semisupervised false --ali-dir exp/tri5_ali/
./run-3b-bnf-sgmm.sh --semisupervised false
./run-3b-bnf-nnet.sh --semisupervised false

##Training of the automatic segmenter
./run-2-segmentation.sh

##Decoding the automatic segmentation of the dev2h subset. dev2h.pem would mean decoding
##the dev2h subset using the officially provided segmentation.
##It is also possible to run dev10h.pem, dev10h.uem, dev10h.seg and so on...
./run-4-anydecode.sh --dir dev2h.seg
./run-4b-anydecode-bnf.sh --dir dev2h.seg --semisupervised false --extra-kws true

##Decoding of the unsupervised data
./run-4-anydecode.sh --dir unsup.seg --skip-kws true --skip-stt true
./run-4b-anydecode-bnf.sh --dir unsup.seg --skip-kws true --skip-stt true --semisupervised false

##Get the one-best path and the weights for frame-weighting of posteriors
./local/best_path_weights.sh --cmd "$train_cmd" data/unsup.seg/ data/lang \
exp/tri6b_nnet/decode_unsup.seg/ \
exp/sgmm5_mmi_b0.1/decode_fmllr_unsup.seg_it1/ \
exp_bnf/sgmm7_mmi_b0.1/decode_fmllr_unsup.seg_it1 \
exp_bnf/tri7_nnet/decode_unsup.seg \
exp_bnf_semisup/best_path_weights/unsup.seg

##Semisupervised bottleneck system training (initial setup)
./run-2b-bnf.sh --semisupervised true --ali-model exp/tri6b_nnet/ \
--weights-dir exp/best_path_weights/unsup.seg/decode_unsup.seg/

##Semisup training, SGMM+bMMI on top of the BN features
./run-3b-bnf-sgmm.sh --semisupervised true
##Semisup training, pNorm DNN on top of the BN features
./run-3b-bnf-nnet.sh --semisupervised true

##And decoding again. We decode the unsup.seg again to do the second run of the
##semisupervised training
./run-4b-anydecode-bnf.sh --dir dev2h.seg --semisupervised true --extra-kws true
./run-4b-anydecode-bnf.sh --dir unsup.seg --skip-kws true --skip-stt true --semisupervised true

##One-best output and frame weights for the second run of the semisup training
./local/best_path_weights.sh --cmd "$train_cmd" data/unsup.seg/ data/lang \
exp_bnf_semisup/sgmm7_mmi_b0.1/decode_fmllr_unsup.seg_it1 \
exp_bnf_semisup/tri7_nnet/decode_unsup.seg \
exp/tri6b_nnet/decode_unsup.seg/ \
exp/sgmm5_mmi_b0.1/decode_fmllr_unsup.seg_it1/ \
exp_bnf/sgmm7_mmi_b0.1/decode_fmllr_unsup.seg_it1 \
exp_bnf/tri7_nnet/decode_unsup.seg \
exp_bnf_semisup2/best_path_weights/unsup.seg

##Second run of the semisup training
./run-2b-bnf.sh --unsup-string "_semisup2" --semisupervised true --ali-model exp/tri6b_nnet/ \
--weights-dir exp_bnf_semisup2/best_path_weights/unsup.seg/decode_fmllr_unsup.seg_it1/

./run-3b-bnf-sgmm.sh --semisupervised true --unsup_string "_semisup2"
./run-3b-bnf-nnet.sh --semisupervised true --unsup_string "_semisup2"

##Decode again to see if we got an improvement
./run-4b-anydecode-bnf.sh --dir dev2h.seg --semisupervised true --unsup_string "_semisup2" --extra-kws true


##Decoding of the dev10h (all systems, all stages)
./run-4-anydecode.sh --dir dev10h.seg --extra-kws true
./run-4b-anydecode-bnf.sh --dir dev10h.seg --semisupervised false --extra-kws true
./run-4b-anydecode-bnf.sh --dir dev10h.seg --semisupervised true --extra-kws true
./run-4b-anydecode-bnf.sh --dir dev10h.seg --semisupervised true --extra-kws true --unsup_string "_semisup2"

##Decoding of the shadow.seg (combination of dev10h.seg and eval.seg)
##We did this for the eval run as a kind of "sanity check" -- we compare the shadow.seg/dev10h.seg subset
##performance against the standalone dev10h.seg performance to catch possible problems (hopefully)
./run-4-anydecode.sh --dir shadow.seg --extra-kws true
./run-4b-anydecode-bnf.sh --dir shadow.seg --semisupervised false --extra-kws true
./run-4b-anydecode-bnf.sh --dir shadow.seg --semisupervised true --extra-kws true
./run-4b-anydecode-bnf.sh --dir shadow.seg --semisupervised true --extra-kws true --unsup_string "_semisup2"



#This prepares for the separation/split of the shadow dataset into the dev set, which we can evaluate,
# and the eval set, which we will submit
#Note: we do this only once, for ./data, as we do not really need anything else,
#just the file lists...
#NB: there was an oversight in one of the scripts that caused the ctm files to contain
#NB: incorrect channel info (A instead of 1)
#NB: To fix that, you can run something like this:
#NB: find exp/ -name "shadow.seg.ctm" | xargs -t -n 1 sed -i'.bakx' 's/ A / 1 /g'
./local/nist_eval/create_compound_set.sh --evlset eval.seg --devset dev10h.seg --tgtdir data/shadow.seg

./local/nist_eval/filter_data.sh --cmd "$decode_cmd" data/shadow.seg dev10h.seg exp/tri6b_nnet/decode_shadow.seg
./local/nist_eval/filter_data.sh --cmd "$decode_cmd" data/shadow.seg eval.seg exp/tri6b_nnet/decode_shadow.seg

./local/nist_eval/filter_data.sh --cmd "$decode_cmd" data/shadow.seg dev10h.seg exp/sgmm5_mmi_b0.1/decode_*shadow.seg*
./local/nist_eval/filter_data.sh --cmd "$decode_cmd" data/shadow.seg eval.seg exp/sgmm5_mmi_b0.1/decode_*shadow.seg*

./local/nist_eval/filter_data.sh --cmd "$decode_cmd" data/shadow.seg dev10h.seg exp_bnf/sgmm7_mmi_b0.1/decode_*shadow.seg*
./local/nist_eval/filter_data.sh --cmd "$decode_cmd" data/shadow.seg eval.seg exp_bnf/sgmm7_mmi_b0.1/decode_*shadow.seg*

./local/nist_eval/filter_data.sh --cmd "$decode_cmd" data/shadow.seg dev10h.seg exp_bnf_semisup/sgmm7_mmi_b0.1/decode_*shadow.seg*
./local/nist_eval/filter_data.sh --cmd "$decode_cmd" data/shadow.seg eval.seg exp_bnf_semisup/sgmm7_mmi_b0.1/decode_*shadow.seg*

#The following commands will actually do two things:
#a) by looking at the performance of the dataset given as --master <dataset>, they figure out the correct LMW
#b) they symlink the appropriate evaluation result file under the correct EXPID into the ./release directory
#Warning: it's a lot of files, so it's easy to get confused!
./local/nist_eval/make_release.sh --dryrun false --dir exp/sgmm5_mmi_b0.1 --data data/shadow.seg --master dev10h.seg lang.conf ./release
./local/nist_eval/make_release.sh --dryrun false --dir exp/tri6b_nnet --data data/shadow.seg --master dev10h.seg lang.conf ./release
./local/nist_eval/make_release.sh --dryrun false --dir exp_bnf/sgmm7_mmi_b0.1 --data data/shadow.seg --master dev10h.seg lang.conf ./release
./local/nist_eval/make_release.sh --dryrun false --dir exp_bnf_semisup/sgmm7_mmi_b0.1 --extrasys SEMISUPX --data data/shadow.seg --master dev10h.seg lang.conf ./release

#Combine results (what we call 4way-combo)

21 changes: 8 additions & 13 deletions egs/babel/s5b/conf/common.fullLP
@@ -52,9 +52,9 @@ if [[ `hostname` == *.tacc.utexas.edu ]] ; then
sgmm_mmi_extra_opts=(--cmd "local/lonestar.py -pe smp 2")
dnn_denlats_extra_opts=( --num-threads 2 )

-dnn_cpu_parallel_opts=(--minibatch-size 128 --max-change 10 --num-jobs-nnet 8 --num-threads 16 \
+dnn_cpu_parallel_opts=(--minibatch-size 128 --num-jobs-nnet 8 --num-threads 16 \
--parallel-opts "-pe smp 16" )
-dnn_gpu_parallel_opts=(--minibatch-size 512 --max-change 40 --num-jobs-nnet 8 --num-threads 1)
+dnn_gpu_parallel_opts=(--minibatch-size 512 --num-jobs-nnet 8 --num-threads 1)

dnn_gpu_mpe_parallel_opts=(--num-jobs-nnet 8 --num-threads 1)
dnn_gpu_mpe_parallel_opts=(--num-jobs-nnet 8 --num-threads 1)
@@ -67,9 +67,9 @@ else
sgmm_mmi_extra_opts=(--cmd "queue.pl -l arch=*64 -l mem_free=3.2G,ram_free=3.2G")
dnn_denlats_extra_opts=(--num-threads 4 --parallel-opts "-pe smp 4" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=0.8G")

-dnn_cpu_parallel_opts=(--minibatch-size 128 --max-change 10 --num-jobs-nnet 8 --num-threads 16 \
+dnn_cpu_parallel_opts=(--minibatch-size 128 --num-jobs-nnet 8 --num-threads 16 \
--parallel-opts "-pe smp 16" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=1G")
-dnn_gpu_parallel_opts=(--minibatch-size 512 --max-change 40 --num-jobs-nnet 8 --num-threads 1 \
+dnn_gpu_parallel_opts=(--minibatch-size 512 --num-jobs-nnet 8 --num-threads 1 \
--parallel-opts "-l gpu=1" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=1G")
dnn_parallel_opts="-l gpu=1"
dnn_gpu_mpe_parallel_opts=(--num-jobs-nnet 8 --num-threads 1 \
@@ -86,19 +86,14 @@ wip=0.5

phoneme_mapping=

extend_lexicon=true
unk_fraction_boost=1.0
num_sent_gen=12000000
num_prons=1000000

minimize=true

proxy_phone_beam=-1
proxy_phone_nbest=-1
proxy_beam=5
proxy_nbest=500

proxy_extlex_phone_beam=-1
proxy_extlex_phone_nbest=300
proxy_extlex_beam=-1
proxy_extlex_nbest=-1
extlex_proxy_phone_beam=5
extlex_proxy_phone_nbest=300
extlex_proxy_beam=-1
extlex_proxy_nbest=-1
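
These conf files are sourced by the recipe's run scripts (via lang.conf in the standard setup), and each *_opts variable is a bash array whose elements are expanded as separate arguments into the corresponding training command, so quoted values such as --parallel-opts "-l gpu=1" survive word splitting. A hypothetical illustration of the mechanism only; the script name and positional arguments below are placeholders, not taken from this diff:

. conf/common.fullLP   # defines dnn_gpu_parallel_opts, etc. (path is an assumption)

# "${array[@]}" passes each option string intact to the training script:
steps/nnet2/train_pnorm.sh "${dnn_gpu_parallel_opts[@]}" \
  data/train data/lang exp/tri5_ali exp/tri6b_nnet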
19 changes: 7 additions & 12 deletions egs/babel/s5b/conf/common.limitedLP
@@ -56,7 +56,7 @@ if [[ `hostname` == *.tacc.utexas.edu ]] ; then

dnn_cpu_parallel_opts=(--minibatch-size 128 --num-jobs-nnet 8 --num-threads 16 \
--parallel-opts "-pe smp 16" )
-dnn_gpu_parallel_opts=(--minibatch-size 512 --max-change 40 --num-jobs-nnet 4 --num-threads 1
+dnn_gpu_parallel_opts=(--minibatch-size 512 --num-jobs-nnet 4 --num-threads 1
--parallel-opts "-pe smp 16" )

dnn_gpu_mpe_parallel_opts=(--num-jobs-nnet 4 --num-threads 1)
@@ -70,9 +70,9 @@ else
sgmm_mmi_extra_opts=(--cmd "queue.pl -l arch=*64 -l mem_free=1.5G,ram_free=1.5G")
dnn_denlats_extra_opts=(--num-threads 4 --parallel-opts "-pe smp 4" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=0.8G")

-dnn_cpu_parallel_opts=(--minibatch-size 128 --max-change 10 --num-jobs-nnet 8 --num-threads 16 \
+dnn_cpu_parallel_opts=(--minibatch-size 128 --num-jobs-nnet 8 --num-threads 16 \
--parallel-opts "-pe smp 16" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=1G")
-dnn_gpu_parallel_opts=(--minibatch-size 512 --max-change 40 --num-jobs-nnet 4 --num-threads 1 \
+dnn_gpu_parallel_opts=(--minibatch-size 512 --num-jobs-nnet 4 --num-threads 1 \
--parallel-opts "-l gpu=1" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=1G")
dnn_parallel_opts="-l gpu=1"
dnn_gpu_mpe_parallel_opts=(--num-jobs-nnet 4 --num-threads 1 \
@@ -91,19 +91,14 @@ wip=0.5

phoneme_mapping=

extend_lexicon=true
unk_fraction_boost=1.0
num_sent_gen=12000000
num_prons=1000000

minimize=true

proxy_phone_beam=-1
proxy_phone_nbest=-1
proxy_beam=5
proxy_nbest=500

proxy_extlex_phone_beam=-1
proxy_extlex_phone_nbest=300
proxy_extlex_beam=-1
proxy_extlex_nbest=-1
extlex_proxy_phone_beam=5
extlex_proxy_phone_nbest=300
extlex_proxy_beam=-1
extlex_proxy_nbest=-1
@@ -47,7 +47,7 @@ eval_nj=64
#Shadow data files
shadow_data_dir=(
/export/babel/data/107-vietnamese/release-current/conversational/dev/
-/export/babel/data/IARPA-BABEL_OP1_dev_eval/BABEL_OP1_201/conversational/eval
+/export/babel/data/107-vietnamese/release-current/conversational/eval/
)
shadow_data_cmudb=/export/babel/data/splits/Vietnamese_Babel107/uem/conv-eval/db-v8-utt.dat
shadow_data_list=(
2 changes: 1 addition & 1 deletion egs/babel/s5b/local/augment_original_stm.pl
@@ -53,7 +53,7 @@
chop;
(my $filename, my $line, my $aggregated, my $seg_start, my $seg_end, my $text) = split(/\s+/, $_, 6);
#print "$filename, $seg_start, $seg_end, $text\n";
$line="A";
$line="1";
if (( $prev_filename ne $filename ) && ( ";;$prev_filename" ne $filename)){
my $_filename = $filename;
$_filename =~ s/^;;//g;