Skip to content

Commit

Permalink
1. 修复了 pygtrie 报错的问题
Browse files
2. 修复了 自定义词典 不起效果的问题
  • Loading branch information
AlongWY committed Jul 21, 2020
1 parent 50e4b25 commit 02dbe1f
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 24 deletions.
11 changes: 6 additions & 5 deletions ltp/ltp.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import regex as re
from typing import List

from transformers import AutoTokenizer, cached_path, TensorType
from transformers import AutoTokenizer, cached_path, TensorType, BatchEncoding
from transformers.file_utils import is_remote_url

from ltp.models import Model
Expand Down Expand Up @@ -143,11 +143,12 @@ def sent_split(inputs: List[str], flag: str = "all", limit: int = 510):
inputs = list(itertools.chain(*inputs))
return inputs

def seg_with_dict(self, inputs: List[str]):
def seg_with_dict(self, inputs: List[str], tokenized: BatchEncoding):
# 进行正向字典匹配
matching = []
for line in inputs:
matching_pos = self.trie.maximum_forward_matching(line)
for source_text, encoding in zip(inputs, tokenized.encodings):
text = [source_text[start:end] for start, end in encoding.offsets[1:-1] if end != 0]
matching_pos = self.trie.maximum_forward_matching(text)
matching.append(matching_pos)
return matching

Expand Down Expand Up @@ -177,7 +178,7 @@ def seg(self, inputs: List[str]):

# merge segments with maximum forward matching
if self.trie.is_init:
matching = self.seg_with_dict(inputs)
matching = self.seg_with_dict(inputs, tokenizerd)
for ids, seg_out in zip(matching, seg):
for ids_iter in ids:
seg_out[ids_iter[0]] = 0
Expand Down
2 changes: 1 addition & 1 deletion ltp/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def length(sequence):
USE_PLUGIN = False


def segment_decode():
def segment_decode(inputs, segment_output, offsets, words):
pass


Expand Down
29 changes: 11 additions & 18 deletions ltp/utils/ltp_trie.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Author: jeffrey
# Author: Yunlong Feng <[email protected]>

import os
from typing import List

import pygtrie


Expand Down Expand Up @@ -43,24 +46,14 @@ def add_words(self, words):
for word in words:
self[word] = True

def maximum_forward_matching(self, text: str):
def maximum_forward_matching(self, text: List[str]):
maximum_matching_pos = []
start = 0
text_len = len(text.strip())
while start < text_len:
text_len = len(text)
for start in range(text_len - 1):
candidate = None
for end in range(1, self.max_window + 1):
if start + end - 1 < text_len and self[text[start:end]]:
candidate = (start, start + end)
if end == self.max_window:
if candidate:
maximum_matching_pos.append(candidate)
start = candidate[1] - 1
break
elif start + end - 1 >= text_len:
if candidate:
maximum_matching_pos.append(candidate)
start = candidate[1] - 1
break
start = start + 1
for end in range(start + 1, min(text_len, start + self.max_window + 1)):
if self.get("".join(text[start:end]), False):
candidate = (start, end)
if candidate:
maximum_matching_pos.append(candidate)
return maximum_matching_pos

0 comments on commit 02dbe1f

Please sign in to comment.