Skip to content

Commit

Permalink
Code changes for 30 ms frame-shift training and sMBR decoding
Browse files Browse the repository at this point in the history
  • Loading branch information
fmetze committed May 12, 2016
1 parent 2a6f224 commit bc80076
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 9 deletions.
2 changes: 1 addition & 1 deletion asr_egs/tedlium/v1/local/score_sclite.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ mkdir -p $dir/scoring/log

# We are not using lattice-align-words, which may result in minor degradation
if [ $stage -le 0 ]; then
if false; then
if true; then
# This leads to slightly lower WERs on some tasks
$cmd ACWT=$min_acwt:$max_acwt $dir/scoring/log/get_ctm.ACWT.log \
mkdir -p $dir/score_ACWT/ '&&' \
Expand Down
3 changes: 2 additions & 1 deletion asr_egs/wsj/steps/decode_ctc_lat.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ max_active=7000 # max-active
beam=15.0 # beam used
lattice_beam=8.0
max_mem=50000000 # approx. limit to memory consumption during minimization in bytes
model=final.nnet

skip_scoring=false # whether to skip WER scoring
scoring_opts="--min-acwt 5 --max-acwt 10 --acwt-factor 0.1"
Expand Down Expand Up @@ -79,7 +80,7 @@ $subsample_feats && feats="$feats subsample-feats --n=3 --offset=0 ark:- ark:- |

# Decode for each of the acoustic scales
$cmd JOB=1:$nj $dir/log/decode.JOB.log \
net-output-extract --class-frame-counts=$srcdir/label.counts --apply-log=true $srcdir/final.nnet "$feats" ark:- \| \
net-output-extract --class-frame-counts=$srcdir/label.counts --apply-log=true $srcdir/$model "$feats" ark:- \| \
latgen-faster --max-active=$max_active --max-mem=$max_mem --beam=$beam --lattice-beam=$lattice_beam \
--acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
$graphdir/TLG.fst ark:- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
Expand Down
42 changes: 35 additions & 7 deletions asr_egs/wsj/utils/model_topo.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,12 @@ def parse_arguments(arg_elements):
--fgate-bias-init : float
Initial value of the forget-gate bias. Not specifying this option means the forget-gate bias
will be initialized randomly, in the same way as the other parameters.
--input-dim : int
    Reduce the input features to a given dimensionality before passing them to the LSTM.
    Optional.
--projection-dim : int
Project the feature vector down to a given dimensionality between LSTM layers.
Optional.
"""

Expand All @@ -74,26 +80,48 @@ def parse_arguments(arg_elements):
if arguments.has_key('param_range'):
param_range = arguments['param_range']

actual_cell_dim = 2*lstm_cell_dim
model_type = '<BiLstmParallel>' # by default
if arguments.has_key('lstm_type') and arguments['lstm_type'] == 'uni':
actual_cell_dim = lstm_cell_dim
model_type = '<LstmParallel>'

print '<Nnet>'
lstm_comm = ' <ParamRange> ' + param_range + ' <LearnRateCoef> 1.0 <MaxGrad> 50.0'

# add the option to set the initial value of the forget-gate bias
lstm_comm = ' <ParamRange> ' + param_range + ' <LearnRateCoef> 1.0 <MaxGrad> 50.0'
if arguments.has_key('fgate_bias_init'):
lstm_comm = lstm_comm + ' <FgateBias> ' + arguments['fgate_bias_init']

actual_cell_dim = 2*lstm_cell_dim
if model_type == '<LstmParallel>':
actual_cell_dim = lstm_cell_dim
# add the option to specify projection layers
if arguments.has_key('projection_dim'):
proj_dim = arguments['projection_dim']
else:
proj_dim = 0

# add the option to reduce the dimensionality of the input features
if arguments.has_key('input_dim'):
input_dim = arguments['input_dim']
else:
input_dim = 0


# pre-amble
print '<Nnet>'

# optional dimensionality reduction layer
if input_dim > 0:
print '<AffineTransform> <InputDim> ' + str(input_feat_dim) + ' <OutputDim> ' + str(input_dim) + ' <ParamRange> ' + param_range
input_feat_dim = input_dim

# the first layer takes input features
print model_type + ' <InputDim> ' + str(input_feat_dim) + ' <CellDim> ' + str(actual_cell_dim) + lstm_comm
# the following bidirectional LSTM layers
for n in range(1, lstm_layer_num):
print model_type + ' <InputDim> ' + str(actual_cell_dim) + ' <CellDim> ' + str(actual_cell_dim) + lstm_comm
if proj_dim > 0:
print '<AffineTransform> <InputDim> ' + str(actual_cell_dim) + ' <OutputDim> ' + str(proj_dim) + ' <ParamRange> ' + param_range
print model_type + ' <InputDim> ' + str(proj_dim) + ' <CellDim> ' + str(actual_cell_dim) + lstm_comm
else:
print model_type + ' <InputDim> ' + str(actual_cell_dim) + ' <CellDim> ' + str(actual_cell_dim) + lstm_comm

# the final affine-transform and softmax layer
print '<AffineTransform> <InputDim> ' + str(actual_cell_dim) + ' <OutputDim> ' + str(target_num) + ' <ParamRange> ' + param_range
print '<Softmax> <InputDim> ' + str(target_num) + ' <OutputDim> ' + str(target_num)
Expand Down

0 comments on commit bc80076

Please sign in to comment.