diff --git a/MODELS.md b/MODELS.md index d3c1b47a..4a797b5e 100644 --- a/MODELS.md +++ b/MODELS.md @@ -1,6 +1,6 @@ # 模型下载 -| 模型 | 大小 | sha256 | +| 模型 | 大小 | SHA256 | | :-------: | :---------------------------------------------: | :--------------------------------------------------------------: | | Base(v3) | [491.9MB](http://39.96.43.154/ltp/v3/base.tgz) | 777a97d6770285e5ab3b0720923bc86781e3279508a72a30c2dd9140b09e5ec8 | | Small(v3) | [156.8MB](http://39.96.43.154/ltp/v3/small.tgz) | 0992d5037cd1c62779a3b5c6d45b883a46e4782c6bcc5850117faf69a9ee6c56 | @@ -39,4 +39,17 @@ + Centos 3.10.0-1062.9.1.el7.x86_64 + Intel(R) Xeon(R) CPU E5-2640 v4 @ 2.40GHz -**备注**: 速度数据在人民日报命名实体测试数据上获得,速度计算方式均为所有任务顺序执行的结果。另外,语义角色标注与语义依存新旧版采用的语料不相同,因此无法直接比较(新版语义依存使用Semeval 2016语料,语义角色标注使用CPB3.0语料)。 +**备注**: 速度数据在人民日报命名实体测试数据上获得,速度计算方式均为所有任务顺序执行的结果。另外,语义角色标注与语义依存新旧版采用的语料不相同,因此无法直接比较(新版语义依存使用Semeval +2016语料,语义角色标注使用CPB3.0语料)。 + +# 其他模型 + +## [Universal Dependencies](https://universaldependencies.org/) + +| 模型 | 分词 | 词性(XPOS) | 命名实体 | 依存句法 | SHA256 | +| :--------------------------------------------------------------------------: | :---: | :--------: | :------: | :------: | :--------------------------------------------------------------: | +| GSD + OntoNotes ([GSD](http://39.96.43.154/ltp/ud/gsd.tgz)) | 98.4 | 96.85 | 78.56 | 87.24 | e4fd41c6f2c6d84d6df2657f1e47078cb98364366d91e852f0980102c755592a | +| GSD + OntoNotes ([GSD+CRF](http://39.96.43.154/ltp/ud/gsd_crf.tgz)) | 98.26 | 96.38 | 79.77 | 86.57 | 0264b4a92e34bb97054ff06f99068b884c54908d1ad265926b0983f2594e1e6a | +| GSDSimp + OntoNotes ([GSDSimp](http://39.96.43.154/ltp/ud/gsds.tgz)) | 98.44 | 96.84 | 78.06 | 87.58 | d51b8508f290ab82d1c3844541eb774506213c1f6cf7d2b86fe7d69358d0d52a | +| GSDSimp + OntoNotes ([GSDSimp+CRF](http://39.96.43.154/ltp/ud/gsds_crf.tgz)) | 98.4 | 96.47 | 79.69 | 86.39 | 1e9ea20cfbc1837bf5736f8b8502aaecb9343590e98b04bba18e15724d3092b2 | + diff --git a/ltp/algorithms/maximum_forward_matching.py 
b/ltp/algorithms/maximum_forward_matching.py index 0a72609f..69509c89 100644 --- a/ltp/algorithms/maximum_forward_matching.py +++ b/ltp/algorithms/maximum_forward_matching.py @@ -88,5 +88,7 @@ def maximum_forward_matching(self, text: List[str], preffix: List[bool]): maximum_matching_pos.append(candidate) else: start += 1 + else: + start += 1 return maximum_matching_pos diff --git a/ltp/frontend.py b/ltp/frontend.py index 82df7fe3..eb8a53c7 100644 --- a/ltp/frontend.py +++ b/ltp/frontend.py @@ -36,7 +36,11 @@ model_map = { 'base': 'http://39.96.43.154/ltp/v3/base.tgz', 'small': 'http://39.96.43.154/ltp/v3/small.tgz', - 'tiny': 'http://39.96.43.154/ltp/v3/tiny.tgz' + 'tiny': 'http://39.96.43.154/ltp/v3/tiny.tgz', + 'GSD': 'http://39.96.43.154/ltp/ud/gsd.tgz', + 'GSD+CRF': 'http://39.96.43.154/ltp/ud/gsd_crf.tgz', + 'GSDSimp': 'http://39.96.43.154/ltp/ud/gsds.tgz', + 'GSDSimp+CRF': 'http://39.96.43.154/ltp/ud/gsds_crf.tgz', } @@ -129,20 +133,22 @@ def __init__(self, path: str = 'small', device=None, **kwargs): patch_4_1_3(ckpt) self.cache_dir = path - config = AutoConfig.for_model(**ckpt['transformer_config']) + transformer_config = ckpt['transformer_config'] + transformer_config['torchscript'] = True + config = AutoConfig.for_model(**transformer_config) self.model = Model(ckpt['model_config'], config=config).to(self.device) self.model.load_state_dict(ckpt['model'], strict=False) self.model.eval() - self.max_length = self.model.transformer.config.max_position_embeddings + self.seg_vocab = ckpt.get('seg', [WORD_MIDDLE, WORD_START]) self.pos_vocab = ckpt.get('pos', []) self.ner_vocab = ckpt.get('ner', []) self.dep_vocab = ckpt.get('dep', []) self.sdp_vocab = ckpt.get('sdp', []) self.srl_vocab = [re.sub(r'ARG(\d)', r'A\1', tag.lstrip('ARGM-')) for tag in ckpt.get('srl', [])] - self.model_version = ckpt.get('version', 'unknown') self.tokenizer = AutoTokenizer.from_pretrained(path, config=self.model.transformer.config, use_fast=True) self.trie = Trie() + self._model_version 
= ckpt.get('version', None) def __str__(self): return f"LTP {self.version} on {self.device} (model version: {self.model_version}) " @@ -151,7 +157,7 @@ def __repr__(self): return f"LTP {self.version} on {self.device} (model version: {self.model_version}) " @property - def avaliable(self): + def avaliable_models(self): return model_map.keys() @property @@ -159,6 +165,14 @@ def version(self): from ltp import __version__ as version return version + @property + def model_version(self): + return self._model_version or 'unknown' + + @property + def max_length(self): + return self.model.transformer.config.max_position_embeddings + def init_dict(self, path, max_window=None): self.trie.init(path, max_window)