integrate EN pipeline, controlled with language id switches

lijp22 · Aug 19, 2022 · 979df4c · 979df4c
1 parent 65b2d6f
commit 979df4c
Show file tree

Hide file tree

Showing 6 changed files with 45 additions and 103 deletions.
diff --git a/utils/asr-score_v2 → utils/asr-score_en b/utils/asr-score_v2 → utils/asr-score_en
diff --git a/utils/asr-score → utils/asr-score_zh b/utils/asr-score → utils/asr-score_zh
diff --git a/utils/benchmark.sh b/utils/benchmark.sh
@@ -75,8 +75,8 @@ for x in $test_sets; do
             echo "$0: Error, unsupported language code ${language}"
             exit -1
         fi
-        
-        ${LEADERBOARD}/utils/asr-score \
+
+        ${LEADERBOARD}/utils/asr-score_${language} \
             --tokenizer ${tokenizer} \
             --ref $dir/ref.txt \
             --hyp $dir/rec_non_empty.txt \

diff --git a/utils/fillers.tsv → utils/interjections_en.csv b/utils/fillers.tsv → utils/interjections_en.csv
diff --git a/utils/textnorm_en.py b/utils/textnorm_en.py
@@ -3,6 +3,29 @@
 import sys, os, codecs
 import argparse
 import string
+from nemo_text_processing.text_normalization.normalize import Normalizer
+
+
+class interj:
+    """Strips the interjections listed in interjections_en.csv from text."""
+
+    def __init__(self):
+        # The list is read from the directory that holds this script.
+        here = os.path.dirname(os.path.abspath(__file__))
+        # Context manager so the file handle is closed after reading.
+        with open(os.path.join(here, 'interjections_en.csv')) as f:
+            words = [w.strip() for w in f]
+        # Keep each entry as-is and upper-cased, padded with spaces so that
+        # only whole, space-delimited words can match.
+        self.inj_list = [' ' + v + ' ' for w in words for v in (w, w.upper())]
+
+    def interjection(self, line):
+        """Return line with each listed interjection replaced by one space."""
+        # A match consumes both spaces, so adjacent hits need repeated calls.
+        for item in self.inj_list:
+            line = line.replace(item, ' ')
+        return line
+
 
 if __name__ == '__main__':
     p = argparse.ArgumentParser()
@@ -16,7 +39,8 @@
 
     ifile = codecs.open(args.ifile, 'r', 'utf8')
     ofile = codecs.open(args.ofile, 'w+', 'utf8')
-
+    nemo_tn = Normalizer(input_case='cased', lang='en')
+    ii = interj()
     n = 0
     for l in ifile:
         key = ''
@@ -40,24 +64,37 @@
         if args.to_lower:
             text = text.lower()
 
+        # remove space before and after signs
+        text = text.replace(' / ', '/')
+        text = text.replace(' - ', '-')
+
+
+        # text normalization
+        text = nemo_tn.normalize(text)
 
         # Punctuations removal
-        old_chars = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~' # string.punctuation except ' (e.g. in I'm, that's)
+        old_chars = '!"#%&()*/+,.:;<=>?@[]^_`{|}~'  # string.punctuation except ' (e.g. in I'm, that's)
         new_chars = ' ' * len(old_chars)
         del_chars = ''
         text = text.translate(str.maketrans(old_chars, new_chars, del_chars))
 
-        # 
+        # remove interjection
+        text = ' ' + text + ' '
+        for i in range(3):
+            text = ii.interjection(text)
+        text = text.strip()
+        text = text.upper()
+
         if args.has_key:
             ofile.write(key + '\t' + text + '\n')
         else:
             ofile.write(text + '\n')
-        
+
         n += 1
         if n % args.log_interval == 0:
             sys.stderr.write("text norm: {} lines done.\n".format(n))
-    
+
     sys.stderr.write("text norm: {} lines done in total.\n".format(n))
-    
+
     ifile.close()
     ofile.close()
diff --git a/utils/textnorm_en_v2.py b/utils/textnorm_en_v2.py