Skip to content

Commit

Permalink
fixes to multi-gpu training (code and scripts)
Browse files Browse the repository at this point in the history
  • Loading branch information
Yajie Miao committed Oct 13, 2015
1 parent 154ff22 commit e37faeb
Show file tree
Hide file tree
Showing 7 changed files with 1,213 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@
.depend.mk

# /src/
/src/kaldi.mk
/src/config.mk
2 changes: 1 addition & 1 deletion asr_egs/swbd/v1/path.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
export KALDI_ROOT=`pwd`/../../..
-export PATH=$PWD/utils/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/featbin:$KALDI_ROOT/src/decoderbin:$KALDI_ROOT/src/fstbin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$PWD:$PATH
+export PATH=$PWD/utils/:$KALDI_ROOT/src/netbin:$KALDI_ROOT/src/featbin:$KALDI_ROOT/src/decoderbin:$KALDI_ROOT/src/fstbin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$PWD:$PATH
export LC_ALL=C

3 changes: 3 additions & 0 deletions asr_egs/wsj/cmd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,6 @@
export train_cmd=run.pl
export decode_cmd=run.pl
export cuda_cmd=run.pl
+# Comet cluster
+#export cuda_cmd="slurm_comet.pl -p gpu-shared -t 48:00:00 --gpu 1"
+
12 changes: 6 additions & 6 deletions asr_egs/wsj/steps/train_ctc_parallel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ dir=$3
mkdir -p $dir/log $dir/nnet

for f in $data_tr/feats.scp $data_cv/feats.scp $dir/labels.tr.gz $dir/labels.cv.gz $dir/nnet.proto; do
-[ ! -f $f ] && echo "decode.sh: no such file $f" && exit 1;
+[ ! -f $f ] && echo "train_ctc_parallel.sh: no such file $f" && exit 1;
done

## Read the training status for resuming
Expand Down Expand Up @@ -109,8 +109,8 @@ feats_cv="ark,s,cs:copy-feats scp:$dir/cv_local.scp ark:- |"
[ $clean_up == true ] && trap "echo \"Removing features tmpdir $tmpdir @ $(hostname)\"; ls $tmpdir; rm -r $tmpdir" EXIT

if [ ! -z "$nj" ]; then
-cat $dir/train_local.scp | myutils/distribute_scp.pl --mode utt $nj $dir/train_split
-cat $dir/cv_local.scp | myutils/distribute_scp.pl --mode utt $nj $dir/cv_split
+cat $dir/train_local.scp | utils/distribute_scp.pl --mode utt $nj $dir/train_split
+cat $dir/cv_local.scp | utils/distribute_scp.pl --mode utt $nj $dir/cv_split
feats_sub_tr="ark,s,cs:copy-feats scp:$dir/train_split.JOB.scp ark:- |"
feats_sub_cv="ark,s,cs:copy-feats scp:$dir/cv_split.JOB.scp ark:- |"
fi
Expand Down Expand Up @@ -154,12 +154,12 @@ for iter in $(seq $start_epoch_num $max_iters); do
>& $dir/log/tr.iter$iter.log || exit 1;
tracc=$(cat $dir/log/tr.iter${iter}.log | grep "TOKEN_ACCURACY" | tail -n 1 | awk '{ acc=$3; gsub("%","",acc); print acc; }')
else
-$cudall_cmd JOB=1:$nj $dir/log/tr.iter$iter.JOB.log \
+$cuda_cmd JOB=1:$nj $dir/log/tr.iter$iter.JOB.log \
$train_tool --report-step=$report_step --num-sequence=$num_sequence --frame-limit=$frame_num_limit \
--learn-rate=$learn_rate --momentum=$momentum --num-jobs=$nj --job-id=JOB \
--verbose=$verbose \
${utts_per_avg:+ --utts-per-avg=$utts_per_avg} \
-"$feats_sub_tr" "$labels_tr" $dir/nnet/nnet.iter$[iter-1] $dir/nnet/nnet.iter${iter} >& $dir/log/tr.iter$iter.log || exit 1
+"$feats_sub_tr" "$labels_tr" $dir/nnet/nnet.iter$[iter-1] $dir/nnet/nnet.iter${iter} || exit 1
tracc=$(cat $dir/log/tr.iter${iter}.1.log | grep "TOTAL TOKEN_ACCURACY" | tail -n 1 | awk '{ acc=$(NF-1); gsub("%","",acc); print acc; }')
fi

Expand All @@ -183,7 +183,7 @@ for iter in $(seq $start_epoch_num $max_iters); do
--cross-validate=true --num-jobs=$nj --job-id=JOB \
--learn-rate=$learn_rate \
--verbose=$verbose \
-"$feats_sub_cv" "$labels_cv" $dir/nnet/nnet.iter${iter} >& $dir/log/cv.iter$iter.log || exit 1;
+"$feats_sub_cv" "$labels_cv" $dir/nnet/nnet.iter${iter} || exit 1;
cvacc=$(cat $dir/log/cv.iter${iter}.1.log | grep "TOTAL TOKEN_ACCURACY" | tail -n 1 | awk '{ acc=$(NF-1); gsub("%","",acc); print acc; }')
fi

Expand Down
Loading

0 comments on commit e37faeb

Please sign in to comment.