Skip to content

Commit

Permalink
per language tokenize method
Browse files Browse the repository at this point in the history
  • Loading branch information
JarbasAl authored and forslund committed Dec 13, 2019
1 parent 608a46b commit d1b1c59
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 7 deletions.
18 changes: 11 additions & 7 deletions lingua_franca/lang/parse_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ class Normalizer:
def __init__(self, config=None):
    """Create a normalizer.

    Any falsy ``config`` (``None``, empty dict) falls back to the
    class-level ``_default_config``.
    """
    self.config = config if config else self._default_config

@staticmethod
def tokenize(utterance):
    """Split an utterance into tokens on whitespace.

    Language-specific subclasses may override this with their own rules.
    """
    words = utterance.split()
    return words

@property
def should_lowercase(self):
    """Whether the config requests lowercasing (defaults to False)."""
    cfg = self.config
    return cfg.get("lowercase", False)
Expand Down Expand Up @@ -94,37 +98,37 @@ def articles(self):
@property
def symbols(self):
    """Punctuation/symbol tokens to strip during normalization.

    Configurable via the "symbols" key; the default deliberately keeps
    sentence-level punctuation (",", "-", ".", ":", "+", "'") out of the
    list so per-language tokenizers can handle it.
    """
    return self.config.get("symbols",
                           [";", "_", "!", "?", "<", ">",
                            "|", "(", ")", "=", "[", "]", "{",
                            "}", "»", "«", "*", "~", "^", "`"])

def expand_contractions(self, utterance):
    """ Expand common contractions, e.g. "isn't" -> "is not" """
    # Use the (overridable) tokenizer rather than a bare split so that
    # language-specific tokenization rules apply here too.
    words = self.tokenize(utterance)
    for idx, w in enumerate(words):
        if w in self.contractions:
            words[idx] = self.contractions[w]
    utterance = " ".join(words)
    return utterance

def numbers_to_digits(self, utterance):
    """Replace spelled-out number words with their digit form.

    Mappings come from ``self.number_replacements``; tokens without a
    mapping pass through unchanged.
    """
    words = self.tokenize(utterance)
    for idx, w in enumerate(words):
        if w in self.number_replacements:
            words[idx] = self.number_replacements[w]
    utterance = " ".join(words)
    return utterance

def remove_articles(self, utterance):
    """Blank out article words listed in ``self.articles``.

    NOTE(review): removed words are replaced with "" before the join,
    so each removed article leaves a doubled space in the result —
    presumably squashed by a later normalization step; verify upstream.
    """
    words = self.tokenize(utterance)
    for idx, w in enumerate(words):
        if w in self.articles:
            words[idx] = ""
    utterance = " ".join(words)
    return utterance

def remove_stopwords(self, utterance):
words = utterance.split()
words = self.tokenize(utterance)
for idx, w in enumerate(words):
if w in self.stopwords:
words[idx] = ""
Expand All @@ -142,7 +146,7 @@ def remove_accents(self, utterance):
return utterance

def replace_words(self, utterance):
words = utterance.split()
words = self.tokenize(utterance)
for idx, w in enumerate(words):
if w in self.word_replacements:
words[idx] = self.word_replacements[w]
Expand Down
7 changes: 7 additions & 0 deletions lingua_franca/lang/parse_pt.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,13 @@ class PortugueseNormalizer(Normalizer):
with open(resolve_resource_file("text/pt-pt/normalize.json")) as f:
_default_config = json.load(f)

@staticmethod
def tokenize(utterance):
    """Tokenize on whitespace, additionally splitting hyphenated
    compounds (common in Portuguese, e.g. "guarda-chuva") into parts.
    """
    return [part
            for word in utterance.split()
            for part in word.split("-")]


def normalize_pt(text, remove_articles):
""" PT string normalization """
Expand Down

0 comments on commit d1b1c59

Please sign in to comment.