1. Fix the forward-matching loop HIT-SCIR#457
2. Update 4 UD models
AlongWY committed Dec 28, 2020
1 parent 169a016 commit 0ddd045
Showing 3 changed files with 36 additions and 7 deletions.
17 changes: 15 additions & 2 deletions MODELS.md
@@ -1,6 +1,6 @@
# Model Downloads

| Model | Size | sha256 |
| Model | Size | SHA256 |
| :-------: | :---------------------------------------------: | :--------------------------------------------------------------: |
| Base(v3) | [491.9MB](http://39.96.43.154/ltp/v3/base.tgz) | 777a97d6770285e5ab3b0720923bc86781e3279508a72a30c2dd9140b09e5ec8 |
| Small(v3) | [156.8MB](http://39.96.43.154/ltp/v3/small.tgz) | 0992d5037cd1c62779a3b5c6d45b883a46e4782c6bcc5850117faf69a9ee6c56 |
@@ -39,4 +39,17 @@
+ Centos 3.10.0-1062.9.1.el7.x86_64
+ Intel(R) Xeon(R) CPU E5-2640 v4 @ 2.40GHz

**Note**: The speed figures were obtained on the People's Daily named-entity test data, with all tasks executed sequentially. Also, the old and new versions of semantic role labeling and semantic dependency parsing use different corpora, so their scores cannot be compared directly (the new semantic dependency parser uses the SemEval 2016 corpus, while semantic role labeling uses the CPB3.0 corpus).

# Other Models

## [Universal Dependencies](https://universaldependencies.org/)

| Model | Word Segmentation | POS (XPOS) | NER | Dependency Parsing | SHA256 |
| :--------------------------------------------------------------------------: | :---: | :--------: | :------: | :------: | :--------------------------------------------------------------: |
| GSD + OntoNotes ([GSD](http://39.96.43.154/ltp/ud/gsd.tgz)) | 98.4 | 96.85 | 78.56 | 87.24 | e4fd41c6f2c6d84d6df2657f1e47078cb98364366d91e852f0980102c755592a |
| GSD + OntoNotes ([GSD+CRF](http://39.96.43.154/ltp/ud/gsd_crf.tgz)) | 98.26 | 96.38 | 79.77 | 86.57 | 0264b4a92e34bb97054ff06f99068b884c54908d1ad265926b0983f2594e1e6a |
| GSDSimp + OntoNotes ([GSDSimp](http://39.96.43.154/ltp/ud/gsds.tgz)) | 98.44 | 96.84 | 78.06 | 87.58 | d51b8508f290ab82d1c3844541eb774506213c1f6cf7d2b86fe7d69358d0d52a |
| GSDSimp + OntoNotes ([GSDSimp+CRF](http://39.96.43.154/ltp/ud/gsds_crf.tgz)) | 98.4 | 96.47 | 79.69 | 86.39 | 1e9ea20cfbc1837bf5736f8b8502aaecb9343590e98b04bba18e15724d3092b2 |
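
The SHA256 column can be used to verify a downloaded archive before unpacking it. A minimal verification sketch in Python; the local filename `gsd.tgz` is only a placeholder, and the expected digest is copied from the GSD row above:

```python
import hashlib

def sha256sum(path: str, chunk_size: int = 1 << 20) -> str:
    """Compute the SHA256 digest of a file, reading it in chunks."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Expected digest taken from the GSD row of the table above.
expected = "e4fd41c6f2c6d84d6df2657f1e47078cb98364366d91e852f0980102c755592a"
assert sha256sum("gsd.tgz") == expected, "checksum mismatch: re-download the archive"
```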

2 changes: 2 additions & 0 deletions ltp/algorithms/maximum_forward_matching.py
@@ -88,5 +88,7 @@ def maximum_forward_matching(self, text: List[str], preffix: List[bool]):
                    maximum_matching_pos.append(candidate)
                else:
                    start += 1
            else:
                start += 1

        return maximum_matching_pos
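
The added `else: start += 1` branch makes the scan position advance even when no dictionary word can begin at the current position, which is presumably what previously kept the forward-matching loop from terminating (HIT-SCIR#457). For illustration only, a self-contained maximum forward matching sketch over a plain word set (not LTP's trie-based implementation), with the same escape branch:

```python
from typing import List, Set, Tuple

def max_forward_match(text: str, words: Set[str], max_len: int = 8) -> List[Tuple[int, int]]:
    """Greedy left-to-right matching: at each position take the longest dictionary word."""
    spans: List[Tuple[int, int]] = []
    start = 0
    while start < len(text):
        match_end = None
        # Try the longest window first and shrink until a dictionary word is found.
        for end in range(min(len(text), start + max_len), start, -1):
            if text[start:end] in words:
                match_end = end
                break
        if match_end is not None:
            spans.append((start, match_end))
            start = match_end
        else:
            start += 1  # nothing starts here: advance anyway so the loop terminates
    return spans

# max_forward_match("研究生命科学", {"研究生", "生命", "科学"}) -> [(0, 3), (4, 6)]
# "研究生" and "科学" are matched; the unmatched "命" at index 3 is skipped by the else branch.
```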
24 changes: 19 additions & 5 deletions ltp/frontend.py
@@ -36,7 +36,11 @@
model_map = {
    'base': 'http://39.96.43.154/ltp/v3/base.tgz',
    'small': 'http://39.96.43.154/ltp/v3/small.tgz',
    'tiny': 'http://39.96.43.154/ltp/v3/tiny.tgz'
    'tiny': 'http://39.96.43.154/ltp/v3/tiny.tgz',
    'GSD': 'http://39.96.43.154/ltp/ud/gsd.tgz',
    'GSD+CRF': 'http://39.96.43.154/ltp/ud/gsd_crf.tgz',
    'GSDSimp': 'http://39.96.43.154/ltp/ud/gsd.tgz',
    'GSDSimp+CRF': 'http://39.96.43.154/ltp/ud/gsd_crf.tgz',
}


@@ -129,20 +133,22 @@ def __init__(self, path: str = 'small', device=None, **kwargs):
        patch_4_1_3(ckpt)

        self.cache_dir = path
        config = AutoConfig.for_model(**ckpt['transformer_config'])
        transformer_config = ckpt['transformer_config']
        transformer_config['torchscript'] = True
        config = AutoConfig.for_model(**transformer_config)
        self.model = Model(ckpt['model_config'], config=config).to(self.device)
        self.model.load_state_dict(ckpt['model'], strict=False)
        self.model.eval()
        self.max_length = self.model.transformer.config.max_position_embeddings

        self.seg_vocab = ckpt.get('seg', [WORD_MIDDLE, WORD_START])
        self.pos_vocab = ckpt.get('pos', [])
        self.ner_vocab = ckpt.get('ner', [])
        self.dep_vocab = ckpt.get('dep', [])
        self.sdp_vocab = ckpt.get('sdp', [])
        self.srl_vocab = [re.sub(r'ARG(\d)', r'A\1', tag.lstrip('ARGM-')) for tag in ckpt.get('srl', [])]
        self.model_version = ckpt.get('version', 'unknown')
        self.tokenizer = AutoTokenizer.from_pretrained(path, config=self.model.transformer.config, use_fast=True)
        self.trie = Trie()
        self._model_version = ckpt.get('version', None)

    def __str__(self):
        return f"LTP {self.version} on {self.device} (model version: {self.model_version}) "
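
For context, `torchscript` is a standard Hugging Face `transformers` config flag: with it enabled, the encoder returns plain tuples instead of `ModelOutput` objects, which is the form TorchScript tracing expects. A minimal sketch; `"bert"` is only a placeholder model type, whereas the code above unpacks the model type and sizes from the checkpoint's saved `transformer_config`:

```python
import torch
from transformers import AutoConfig, AutoModel

# Build a config with TorchScript mode on, mirroring the torchscript=True line above.
config = AutoConfig.for_model("bert", torchscript=True)
model = AutoModel.from_config(config).eval()

with torch.no_grad():
    outputs = model(torch.ones(1, 8, dtype=torch.long))

print(type(outputs))  # a plain tuple rather than a ModelOutput object
```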
@@ -151,14 +157,22 @@ def __repr__(self):
        return f"LTP {self.version} on {self.device} (model version: {self.model_version}) "

    @property
    def avaliable(self):
    def avaliable_models(self):
        return model_map.keys()

    @property
    def version(self):
        from ltp import __version__ as version
        return version

    @property
    def model_version(self):
        return self._model_version or 'unknown'

    @property
    def max_length(self):
        return self.model.transformer.config.max_position_embeddings

    def init_dict(self, path, max_window=None):
        self.trie.init(path, max_window)
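
With these entries in `model_map`, the new UD models should be loadable by name in the same way as `'base'`, `'small'`, and `'tiny'`. A usage sketch, assuming the usual LTP 4.x pipeline calls (`seg`, `pos`, `ner`, `dep`) apply unchanged to the UD models:

```python
from ltp import LTP

ltp = LTP('GSD')  # one of the keys added to model_map above

# Optionally load a user dictionary before segmenting ("user_dict.txt" is a placeholder path);
# its entries are matched by the forward-matching trie fixed above.
# ltp.init_dict("user_dict.txt")

segment, hidden = ltp.seg(["他叫汤姆去拿外衣。"])  # word segmentation + encoder states
pos = ltp.pos(hidden)                              # XPOS tags
ner = ltp.ner(hidden)                              # named entities
dep = ltp.dep(hidden)                              # dependency arcs

print(segment, pos, ner, dep, sep="\n")
```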

