Skip to content

Commit

Permalink
TN bug fix (NVIDIA#3538)
Browse files Browse the repository at this point in the history
* fixes for "ve" (as in "I've") and Celsius ("c") handling

Signed-off-by: ekmb <[email protected]>

* fixes for "ve" (as in "I've") and Celsius ("c") handling

Signed-off-by: ekmb <[email protected]>

* add weight to the single-digit roman and single-digit cardinal graphs (non-deterministic mode)

Signed-off-by: ekmb <[email protected]>

* fix handling of "isn't"

Signed-off-by: ekmb <[email protected]>

Co-authored-by: Yang Zhang <[email protected]>
Co-authored-by: Eric Harper <[email protected]>
  • Loading branch information
3 people authored Jan 28, 2022
1 parent cca8991 commit 5484652
Show file tree
Hide file tree
Showing 6 changed files with 15 additions and 14 deletions.
8 changes: 4 additions & 4 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -111,12 +111,12 @@ pipeline {
parallel {
stage('En TN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py "1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/12-15'
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py "1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/01-27.1'
}
}
stage('En ITN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en "twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/12-15'
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en "twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/01-27.1'
}
}
stage('German ITN and non-deterministic TN') {
Expand All @@ -131,8 +131,8 @@ pipeline {
}
stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/12-15'
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/12-15'
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/01-27.1'
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/01-27.1'
}
}
stage('Run Ru ITN and non-deterministic TN & Run all Ru ITN tests') {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
f fahrenheit
°f fahrenheit
fahrenheit
c celsius
°c celsius
celsius
km kilometer
Expand Down
5 changes: 0 additions & 5 deletions nemo_text_processing/text_normalization/en/data/whitelist.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -6806,7 +6806,6 @@ ISL i s l
ISMF i s m f
ISMIR i s m i r
ISMNs i s m n's
isn i s n
ISN i s n
ISNTUC i s n t u c
ISOGG i s o g g
Expand Down Expand Up @@ -13105,10 +13104,6 @@ V.D. v d
VD v d
VDV v d v
V. D. W. v d w
ve v e
Ve v e
V. E. v e
VE v e
vez v e z
Vez v e z
VEZ v e z
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,12 +109,13 @@ def __init__(self, deterministic: bool = True):
leading_zeros + pynutil.insert(" ") + pynini.compose(pynini.closure(NEMO_DIGIT), self.graph)
)

# add small weight to non-default graphs to make sure the deterministic option is listed first
final_graph = (
self.graph
| serial_graph
| self.range_graph
| self.single_digits_graph
| get_hundreds_graph()
| pynutil.add_weight(self.single_digits_graph, 0.001)
| pynutil.add_weight(get_hundreds_graph(), 0.001)
| pynutil.add_weight(single_digits_graph_with_commas, 0.001)
| cardinal_with_leading_zeros
)
Expand Down
9 changes: 7 additions & 2 deletions nemo_text_processing/text_normalization/en/taggers/roman.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# limitations under the License.


from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, insert_space
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, insert_space
from nemo_text_processing.text_normalization.en.taggers.cardinal import CardinalFst
from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels

Expand Down Expand Up @@ -56,7 +56,12 @@ def _load_roman(file: str):
| (hundreds + pynini.closure(insert_space + ties, 0, 1) + pynini.closure(insert_space + digit_teen, 0, 1))
).optimize()

graph = graph + pynini.closure(pynutil.delete("."), 0, 1)
# add a higher weight when roman number consists of a single symbol
graph = pynini.compose(pynini.closure(NEMO_CHAR, 2), graph) | pynutil.add_weight(
pynini.compose(NEMO_CHAR, graph), 101
)
graph = graph.optimize() + pynini.closure(pynutil.delete("."), 0, 1)

graph = pynutil.insert("integer: \"") + graph + pynutil.insert("\"")
graph = self.add_tokens(graph)
self.fst = graph.optimize()
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ abbreviation USA,~abbreviation u s a,
April 29th’s meeting~april twenty ninth’s meeting
?,~?,
?,no~?,no
I've 20' and 14/ they're I'm 16c.~I've twenty' and fourteen/ they're I'm one six c.

0 comments on commit 5484652

Please sign in to comment.