
Commit

Adjust unicode handling for python2

nikitakit committed Dec 31, 2018
1 parent 0f4439f commit 5fb30eb
Showing 3 changed files with 49 additions and 38 deletions.
67 changes: 36 additions & 31 deletions benepar/base_parser.py
@@ -13,37 +13,42 @@

 IS_PY2 = sys.version_info < (3,0)

+if IS_PY2:
+    STRING_TYPES = (str, unicode)
+else:
+    STRING_TYPES = (str,)
+
 ELMO_START_SENTENCE = 256
 ELMO_STOP_SENTENCE = 257
 ELMO_START_WORD = 258
 ELMO_STOP_WORD = 259
 ELMO_CHAR_PAD = 260

-PTB_TOKEN_ESCAPE = {"(": "-LRB-",
-                    ")": "-RRB-",
-                    "{": "-LCB-",
-                    "}": "-RCB-",
-                    "[": "-LSB-",
-                    "]": "-RSB-"}
-
-BERT_TOKEN_MAPPING = {"-LRB-": "(",
-                      "-RRB-": ")",
-                      "-LCB-": "{",
-                      "-RCB-": "}",
-                      "-LSB-": "[",
-                      "-RSB-": "]",
-                      "``": '"',
-                      "''": '"',
-                      "`": "'",
-                      "“": '"',
-                      "”": '"',
-                      "‘": "'",
-                      "’": "'",
-                      "«": '"',
-                      "»": '"',
-                      "„": '"',
-                      "‹": "'",
-                      "›": "'",
+PTB_TOKEN_ESCAPE = {u"(": u"-LRB-",
+                    u")": u"-RRB-",
+                    u"{": u"-LCB-",
+                    u"}": u"-RCB-",
+                    u"[": u"-LSB-",
+                    u"]": u"-RSB-"}
+
+BERT_TOKEN_MAPPING = {u"-LRB-": u"(",
+                      u"-RRB-": u")",
+                      u"-LCB-": u"{",
+                      u"-RCB-": u"}",
+                      u"-LSB-": u"[",
+                      u"-RSB-": u"]",
+                      u"``": u'"',
+                      u"''": u'"',
+                      u"`": u"'",
+                      u"\u201c": u'"',
+                      u"\u201d": u'"',
+                      u"\u2018": u"'",
+                      u"\u2019": u"'",
+                      u"\xab": u'"',
+                      u"\xbb": u'"',
+                      u"\u201e": u'"',
+                      u"\u2039": u"'",
+                      u"\u203a": u"'",
                       }

 # Label vocab is made immutable because it is potentially exposed to users
@@ -171,7 +176,7 @@ def __init__(self, name, batch_size=64):
         self._graph = tf.Graph()

         with self._graph.as_default():
-            if isinstance(name, str) and '/' not in name:
+            if isinstance(name, STRING_TYPES) and '/' not in name:
                 model = load_model(name)
             elif not os.path.exists(name):
                 raise Exception("Argument is neither a valid module name nor a path to an existing file/folder: {}".format(name))
@@ -267,17 +272,17 @@ def _make_feed_dict_bert(self, sentences):
            tokens = []
            word_end_mask = []

-            tokens.append("[CLS]")
+            tokens.append(u"[CLS]")
            word_end_mask.append(1)

            cleaned_words = []
            for word in sentence:
                word = BERT_TOKEN_MAPPING.get(word, word)
                # BERT is pre-trained with a tokenizer that doesn't split off
                # n't as its own token
-                if word == "n't" and cleaned_words:
-                    cleaned_words[-1] = cleaned_words[-1] + "n"
-                    word = "'t"
+                if word == u"n't" and cleaned_words:
+                    cleaned_words[-1] = cleaned_words[-1] + u"n"
+                    word = u"'t"
                cleaned_words.append(word)

            for word in cleaned_words:
@@ -286,7 +291,7 @@ def _make_feed_dict_bert(self, sentences):
                    word_end_mask.append(0)
                word_end_mask[-1] = 1
                tokens.extend(word_tokens)
-            tokens.append("[SEP]")
+            tokens.append(u"[SEP]")
            word_end_mask.append(1)

            input_ids = self._bert_tokenizer.convert_tokens_to_ids(tokens)
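Aside: the two hunks above are benepar's BERT input preparation. Below is a condensed standalone sketch of the same logic; the wrapper function is hypothetical and not part of this commit, BERT_TOKEN_MAPPING is the dict defined above, and `tokenizer` stands in for self._bert_tokenizer (a pytorch-pretrained-bert BertTokenizer).

def clean_and_tokenize(sentence, tokenizer):
    # Undo PTB escaping and re-join "n't", which BERT's pre-training
    # tokenizer does not split off as its own token.
    cleaned_words = []
    for word in sentence:
        word = BERT_TOKEN_MAPPING.get(word, word)
        if word == u"n't" and cleaned_words:
            cleaned_words[-1] = cleaned_words[-1] + u"n"
            word = u"'t"
        cleaned_words.append(word)

    # Wrap in [CLS]/[SEP] and mark only the last subword of each word,
    # so one output position per input word can be selected later.
    tokens, word_end_mask = [u"[CLS]"], [1]
    for word in cleaned_words:
        word_tokens = tokenizer.tokenize(word)
        word_end_mask.extend([0] * (len(word_tokens) - 1) + [1])
        tokens.extend(word_tokens)
    tokens.append(u"[SEP]")
    word_end_mask.append(1)
    return tokenizer.convert_tokens_to_ids(tokens), word_end_mask

For example, the tokens ["did", "n't"] become the words ["didn", "'t"] before subword tokenization, matching how BERT's own tokenizer would have split "didn't".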
12 changes: 9 additions & 3 deletions benepar/nltk_plugin.py
@@ -1,7 +1,7 @@
 import nltk
 from nltk import Tree

-from .base_parser import BaseParser, PTB_TOKEN_ESCAPE
+from .base_parser import BaseParser, IS_PY2, STRING_TYPES, PTB_TOKEN_ESCAPE

 TOKENIZER_LOOKUP = {
     'en': 'english',
@@ -80,14 +80,20 @@ def make_tree():

     def _nltk_process_sents(self, sents):
         for sentence in sents:
-            if isinstance(sentence, str):
+            if isinstance(sentence, STRING_TYPES):
                 if self._tokenizer_lang is None:
                     raise ValueError(
                         "No word tokenizer available for this language. "
                         "Please tokenize before calling the parser."
                     )
                 sentence = nltk.word_tokenize(sentence, self._tokenizer_lang)

+            if IS_PY2:
+                sentence = [
+                    word.encode('utf-8', 'ignore') if isinstance(word, str) else word
+                    for word in sentence
+                ]
+
             if not self._provides_tags:
                 sentence = nltk.pos_tag(sentence)
                 yield [word for word, tag in sentence], sentence
@@ -119,7 +125,7 @@ def parse_sents(self, sents):
         Returns: Iter[nltk.Tree]
         """
-        if isinstance(sents, str):
+        if isinstance(sents, STRING_TYPES):
             if self._tokenizer_lang is None:
                 raise ValueError(
                     "No tokenizer available for this language. "
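Aside: a minimal usage sketch of the NLTK interface after this change (assumes benepar and the "benepar_en" model are installed; the sentence is illustrative). Because the isinstance checks now use STRING_TYPES, both str and unicode input is accepted on Python 2 as well as Python 3:

>>> import benepar
>>> parser = benepar.Parser("benepar_en")
>>> tree = parser.parse(u"They didn\u2019t say \u201chello\u201d.")  # unicode input
>>> print(tree)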
8 changes: 4 additions & 4 deletions benepar/spacy_plugin.py
@@ -51,7 +51,7 @@ class BeneparComponent(BaseParser):
     Sample usage:
     >>> nlp = spacy.load('en')
     >>> nlp.add_pipe(BeneparComponent("benepar_en"))
-    >>> doc = nlp("The quick brown fox jumps over the lazy dog.")
+    >>> doc = nlp(u"The quick brown fox jumps over the lazy dog.")
     >>> sent = list(doc.sents)[0]
     >>> print(sent._.parse_string)
@@ -147,18 +147,18 @@ def make_str():
            label = label_vocab[label_idx]
            if (i + 1) >= j:
                token = doc[i]
-                s = "({} {})".format(token.tag_, PTB_TOKEN_ESCAPE.get(token.text, token.text))
+                s = u"({} {})".format(token.tag_, PTB_TOKEN_ESCAPE.get(token.text, token.text))
            else:
                children = []
                while ((idx_cell[0] + 1) < len(constituent_data.starts)
                        and i <= constituent_data.starts[idx_cell[0] + 1]
                        and constituent_data.ends[idx_cell[0] + 1] <= j):
                    children.append(make_str())

-                s = " ".join(children)
+                s = u" ".join(children)

            for sublabel in reversed(label):
-                s = "({} {})".format(sublabel, s)
+                s = u"({} {})".format(sublabel, s)
            return s

        return make_str()
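Aside: the reversed(label) loop above is what nests a collapsed unary chain around a span string. A tiny standalone sketch (hypothetical helper, not part of this commit):

def wrap_labels(label, s):
    # label is a tuple of sublabels for a unary chain, e.g. ("S", "VP")
    for sublabel in reversed(label):
        s = u"({} {})".format(sublabel, s)
    return s

>>> print(wrap_labels(("S", "VP"), u"(VB Go)"))
(S (VP (VB Go)))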
