From 98aa1d8ff7cf1ba680b10b3b332067d243850f35 Mon Sep 17 00:00:00 2001
From: DongjiGao
Date: Wed, 17 Jul 2019 16:14:19 -0400
Subject: [PATCH] [egs] fixed bug in egs/gale_arabic/s5c/local/prepare_dict_subword.sh that it may delete words matching '<*>' (#3465)

---
 egs/gale_arabic/s5c/local/prepare_dict_subword.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/egs/gale_arabic/s5c/local/prepare_dict_subword.sh b/egs/gale_arabic/s5c/local/prepare_dict_subword.sh
index 330de664349..e05846ec593 100755
--- a/egs/gale_arabic/s5c/local/prepare_dict_subword.sh
+++ b/egs/gale_arabic/s5c/local/prepare_dict_subword.sh
@@ -48,7 +48,7 @@ glossaries=" "
 if [ $stage -le 0 ]; then
   echo "$0: making subword lexicon... $(date)."
   # get pair_code file
-  cut -d ' ' -f2- data/train/text | sed 's/<[^>]*>//g' | utils/lang/bpe/learn_bpe.py -s $num_merges > data/local/pair_code.txt
+  cut -d ' ' -f2- data/train/text | sed 's///g;s///g' | utils/lang/bpe/learn_bpe.py -s $num_merges > data/local/pair_code.txt
   mv $dir/lexicon.txt $dir/lexicon_word.txt
   # get words
   cut -d ' ' -f1 $dir/lexicon_word.txt > $dir/words.txt