Skip to content

Commit

Permalink
TxtBMESFormat supports non-gold input
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Jan 12, 2020
1 parent 6defd79 commit 600cb6d
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 4 deletions.
4 changes: 3 additions & 1 deletion hanlp/transform/txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,8 @@ def input_is_single_sample(self, input: Union[List[str], List[List[str]]]) -> bo
         return isinstance(input, str)

     def inputs_to_samples(self, inputs, gold=False):
-        for chars, tags in inputs:
+        for chars, tags in (inputs if gold else zip(inputs, [None] * len(inputs))):
+            if not gold:
+                tags = [self.tag_vocab.safe_pad_token] * len(chars)
             chars = CharTable.normalize_chars(chars)
             yield chars, tags
7 changes: 4 additions & 3 deletions tests/demo/zh/demo_cws.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ def split_by_dic(text: str):
     p = re.compile('(' + '|'.join(dic.keys()) + ')')
     sents, offset, words = [], 0, []
     for m in p.finditer(text):
-        sents.append(text[offset: m.start()])
-        words.append((m.group(), dic[m.group()]))
-        offset = m.end()
+        if offset < m.start():
+            sents.append(text[offset: m.start()])
+            words.append((m.group(), dic[m.group()]))
+            offset = m.end()
     if offset < len(text):
         sents.append(text[offset:])
         words.append((None, None))
Expand Down

0 comments on commit 600cb6d

Please sign in to comment.