
Commit

Adjust unicode handling for python2

nikitakit committed Dec 31, 2018
1 parent 0f4439f commit 5fb30eb
Showing 3 changed files with 49 additions and 38 deletions.
67 changes: 36 additions & 31 deletions benepar/base_parser.py
@@ -13,37 +13,42 @@

 IS_PY2 = sys.version_info < (3,0)

+if IS_PY2:
+    STRING_TYPES = (str, unicode)
+else:
+    STRING_TYPES = (str,)
+
 ELMO_START_SENTENCE = 256
 ELMO_STOP_SENTENCE = 257
 ELMO_START_WORD = 258
 ELMO_STOP_WORD = 259
 ELMO_CHAR_PAD = 260

-PTB_TOKEN_ESCAPE = {"(": "-LRB-",
-                    ")": "-RRB-",
-                    "{": "-LCB-",
-                    "}": "-RCB-",
-                    "[": "-LSB-",
-                    "]": "-RSB-"}
-
-BERT_TOKEN_MAPPING = {"-LRB-": "(",
-                      "-RRB-": ")",
-                      "-LCB-": "{",
-                      "-RCB-": "}",
-                      "-LSB-": "[",
-                      "-RSB-": "]",
-                      "``": '"',
-                      "''": '"',
-                      "`": "'",
-                      "“": '"',
-                      "”": '"',
-                      "‘": "'",
-                      "’": "'",
-                      "«": '"',
-                      "»": '"',
-                      "„": '"',
-                      "‹": "'",
-                      "›": "'",
+PTB_TOKEN_ESCAPE = {u"(": u"-LRB-",
+                    u")": u"-RRB-",
+                    u"{": u"-LCB-",
+                    u"}": u"-RCB-",
+                    u"[": u"-LSB-",
+                    u"]": u"-RSB-"}
+
+BERT_TOKEN_MAPPING = {u"-LRB-": u"(",
+                      u"-RRB-": u")",
+                      u"-LCB-": u"{",
+                      u"-RCB-": u"}",
+                      u"-LSB-": u"[",
+                      u"-RSB-": u"]",
+                      u"``": u'"',
+                      u"''": u'"',
+                      u"`": u"'",
+                      u"\u201c": u'"',
+                      u"\u201d": u'"',
+                      u"\u2018": u"'",
+                      u"\u2019": u"'",
+                      u"\xab": u'"',
+                      u"\xbb": u'"',
+                      u"\u201e": u'"',
+                      u"\u2039": u"'",
+                      u"\u203a": u"'",
                       }

 # Label vocab is made immutable because it is potentially exposed to users
@@ -171,7 +176,7 @@ def __init__(self, name, batch_size=64):
         self._graph = tf.Graph()

         with self._graph.as_default():
-            if isinstance(name, str) and '/' not in name:
+            if isinstance(name, STRING_TYPES) and '/' not in name:
                 model = load_model(name)
             elif not os.path.exists(name):
                 raise Exception("Argument is neither a valid module name nor a path to an existing file/folder: {}".format(name))
@@ -267,17 +272,17 @@ def _make_feed_dict_bert(self, sentences):
            tokens = []
            word_end_mask = []

-            tokens.append("[CLS]")
+            tokens.append(u"[CLS]")
            word_end_mask.append(1)

            cleaned_words = []
            for word in sentence:
                word = BERT_TOKEN_MAPPING.get(word, word)
                # BERT is pre-trained with a tokenizer that doesn't split off
                # n't as its own token
-                if word == "n't" and cleaned_words:
-                    cleaned_words[-1] = cleaned_words[-1] + "n"
-                    word = "'t"
+                if word == u"n't" and cleaned_words:
+                    cleaned_words[-1] = cleaned_words[-1] + u"n"
+                    word = u"'t"
                cleaned_words.append(word)

            for word in cleaned_words:
@@ -286,7 +291,7 @@ def _make_feed_dict_bert(self, sentences):
                    word_end_mask.append(0)
                word_end_mask[-1] = 1
                tokens.extend(word_tokens)
-            tokens.append("[SEP]")
+            tokens.append(u"[SEP]")
            word_end_mask.append(1)

            input_ids = self._bert_tokenizer.convert_tokens_to_ids(tokens)
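Aside: the two hunks above are benepar's BERT input preparation. Below is a condensed standalone sketch of the same logic; the wrapper function is hypothetical and not part of this commit, BERT_TOKEN_MAPPING is the dict defined above, and `tokenizer` stands in for self._bert_tokenizer (a pytorch-pretrained-bert BertTokenizer).

def clean_and_tokenize(sentence, tokenizer):
    # Undo PTB escaping and re-join "n't", which BERT's pre-training
    # tokenizer does not split off as its own token.
    cleaned_words = []
    for word in sentence:
        word = BERT_TOKEN_MAPPING.get(word, word)
        if word == u"n't" and cleaned_words:
            cleaned_words[-1] = cleaned_words[-1] + u"n"
            word = u"'t"
        cleaned_words.append(word)

    # Wrap in [CLS]/[SEP] and mark only the last subword of each word,
    # so one output position per input word can be selected later.
    tokens, word_end_mask = [u"[CLS]"], [1]
    for word in cleaned_words:
        word_tokens = tokenizer.tokenize(word)
        word_end_mask.extend([0] * (len(word_tokens) - 1) + [1])
        tokens.extend(word_tokens)
    tokens.append(u"[SEP]")
    word_end_mask.append(1)
    return tokenizer.convert_tokens_to_ids(tokens), word_end_mask

For example, the tokens ["did", "n't"] become the words ["didn", "'t"] before subword tokenization, matching how BERT's own tokenizer would have split "didn't".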
12 changes: 9 additions & 3 deletions benepar/nltk_plugin.py
@@ -1,7 +1,7 @@
 import nltk
 from nltk import Tree

-from .base_parser import BaseParser, PTB_TOKEN_ESCAPE
+from .base_parser import BaseParser, IS_PY2, STRING_TYPES, PTB_TOKEN_ESCAPE

 TOKENIZER_LOOKUP = {
     'en': 'english',
@@ -80,14 +80,20 @@ def make_tree():

     def _nltk_process_sents(self, sents):
         for sentence in sents:
-            if isinstance(sentence, str):
+            if isinstance(sentence, STRING_TYPES):
                 if self._tokenizer_lang is None:
                     raise ValueError(
                         "No word tokenizer available for this language. "
                         "Please tokenize before calling the parser."
                     )
                 sentence = nltk.word_tokenize(sentence, self._tokenizer_lang)

+            if IS_PY2:
+                sentence = [
+                    word.encode('utf-8', 'ignore') if isinstance(word, str) else word
+                    for word in sentence
+                ]
+
             if not self._provides_tags:
                 sentence = nltk.pos_tag(sentence)
                 yield [word for word, tag in sentence], sentence
@@ -119,7 +125,7 @@ def parse_sents(self, sents):
         Returns: Iter[nltk.Tree]
         """
-        if isinstance(sents, str):
+        if isinstance(sents, STRING_TYPES):
             if self._tokenizer_lang is None:
                 raise ValueError(
                     "No tokenizer available for this language. "
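Aside: a minimal usage sketch of the NLTK interface after this change (assumes benepar and the "benepar_en" model are installed; the sentence is illustrative). Because the isinstance checks now use STRING_TYPES, both str and unicode input is accepted on Python 2 as well as Python 3:

>>> import benepar
>>> parser = benepar.Parser("benepar_en")
>>> tree = parser.parse(u"They didn\u2019t say \u201chello\u201d.")  # unicode input
>>> print(tree)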
8 changes: 4 additions & 4 deletions benepar/spacy_plugin.py
@@ -51,7 +51,7 @@ class BeneparComponent(BaseParser):
     Sample usage:
     >>> nlp = spacy.load('en')
     >>> nlp.add_pipe(BeneparComponent("benepar_en"))
-    >>> doc = nlp("The quick brown fox jumps over the lazy dog.")
+    >>> doc = nlp(u"The quick brown fox jumps over the lazy dog.")
     >>> sent = list(doc.sents)[0]
     >>> print(sent._.parse_string)
@@ -147,18 +147,18 @@ def make_str():
            label = label_vocab[label_idx]
            if (i + 1) >= j:
                token = doc[i]
-                s = "({} {})".format(token.tag_, PTB_TOKEN_ESCAPE.get(token.text, token.text))
+                s = u"({} {})".format(token.tag_, PTB_TOKEN_ESCAPE.get(token.text, token.text))
            else:
                children = []
                while ((idx_cell[0] + 1) < len(constituent_data.starts)
                        and i <= constituent_data.starts[idx_cell[0] + 1]
                        and constituent_data.ends[idx_cell[0] + 1] <= j):
                    children.append(make_str())

-                s = " ".join(children)
+                s = u" ".join(children)

            for sublabel in reversed(label):
-                s = "({} {})".format(sublabel, s)
+                s = u"({} {})".format(sublabel, s)
            return s

        return make_str()
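Aside: the reversed(label) loop above is what nests a collapsed unary chain around a span string. A tiny standalone sketch (hypothetical helper, not part of this commit):

def wrap_labels(label, s):
    # label is a tuple of sublabels for a unary chain, e.g. ("S", "VP")
    for sublabel in reversed(label):
        s = u"({} {})".format(sublabel, s)
    return s

>>> print(wrap_labels(("S", "VP"), u"(VB Go)"))
(S (VP (VB Go)))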
