Commit

Update Base metrics
AlongWY committed Jul 1, 2020
1 parent 94264cf commit 110cea5
Showing 2 changed files with 5 additions and 4 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -30,7 +30,7 @@ sdp = ltp.sdp(hidden)

| Model     | Size                                           |
| :-------: | :--------------------------------------------: |
-| Base(v2)  | [UNKNOWN](http://39.96.43.154/ltp/v2/base.tgz) |
+| Base(v2)  | [531MB](http://39.96.43.154/ltp/v2/base.tgz)   |
| Small(v2) | [170MB](http://39.96.43.154/ltp/v2/small.tgz)  |
| Tiny(v2)  | [34MB](http://39.96.43.154/ltp/v2/tiny.tgz)    |
| Small(v1) | [170MB](http://39.96.43.154/ltp/v1/small.tgz)  |
@@ -42,7 +42,7 @@ sdp = ltp.sdp(hidden)

| Model           | Seg  | POS  | NER  | SRL  | DEP  | SDP  | Speed (sent/s) |
| :-------------: | :--: | :--: | :--: | :--: | :--: | :--: | :------------: |
-| LTP 4.0 (Base)  |      |      |      |      |      |      |                |
+| LTP 4.0 (Base)  | 98.7 | 98.4 | 96.4 | 80.0 | 90.0 | 76.5 |                |
| LTP 4.0 (Small) | 98.4 | 98.2 | 94.3 | 78.4 | 88.3 | 74.7 |     12.58      |
| LTP 4.0 (Tiny)  | 96.8 | 97.1 | 91.6 | 70.9 | 83.8 | 70.1 |     29.53      |

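For context, the newly sized Base(v2) archive is loaded through the same `LTP` class that the second file of this commit touches. A minimal usage sketch, assuming the archive has been downloaded and extracted locally; the `./base` path and the two-value return of `seg` follow the LTP 4 README conventions and are not part of this diff:

```python
from ltp import LTP

# Point `path` at the extracted base.tgz directory (illustrative path).
ltp = LTP(path='./base')

# seg returns the segmented words plus hidden states that the other task
# heads consume, e.g. sdp = ltp.sdp(hidden) as in the hunk context above.
seg, hidden = ltp.seg(['他叫汤姆去拿外衣。'])
sdp = ltp.sdp(hidden)
print(seg)
print(sdp)
```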
5 changes: 3 additions & 2 deletions ltp/ltp.py
@@ -4,6 +4,7 @@
import os
import torch
import itertools
+import regex as re
from typing import List

from transformers import AutoTokenizer, cached_path
@@ -91,7 +92,7 @@ def __init__(self, path: str = 'small', batch_size: int = 10, device=None, vocab
        self.ner_vocab = ckpt['ner']
        self.dep_vocab = ckpt['dep']
        self.sdp_vocab = ckpt['sdp']
-       self.srl_vocab = ckpt['srl']
+       self.srl_vocab = [re.sub(r'ARG(\d)', r'A\1', tag) for tag in ckpt['srl']]
        self.split = lambda a: map(lambda b: a[b:b + batch_size], range(0, len(a), batch_size))
        self.tokenizer = AutoTokenizer.from_pretrained(path, config=self.model.pretrained.config, use_fast=True)
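The `srl_vocab` change rewrites PropBank-style numbered argument labels (`ARG0`, `ARG1`, …) to the short `A0`, `A1`, … form; modifier labels such as `ARGM-TMP` are untouched because the pattern requires a digit after `ARG`. A small sketch of the substitution in isolation — the tag list is a hypothetical stand-in for `ckpt['srl']`, and the standard-library `re` module behaves the same as the third-party `regex` package for this pattern:

```python
import re  # the commit imports the third-party `regex` package under the same alias

# Hypothetical stand-in for ckpt['srl']
srl_tags = ['O', 'ARG0', 'ARG1', 'ARGM-TMP', 'ARGM-LOC']

normalized = [re.sub(r'ARG(\d)', r'A\1', tag) for tag in srl_tags]
print(normalized)  # ['O', 'A0', 'A1', 'ARGM-TMP', 'ARGM-LOC']
```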

@@ -129,7 +130,7 @@ def sent_split(self, inputs: List[str], flag: str = "all", limit: int = 510):
    @no_gard
    def seg(self, inputs: List[str]):
        length = torch.as_tensor([len(text) for text in inputs], device=self.device)
-       tokenizerd = self.tokenizer.batch_encode_plus(inputs, return_tensors='pt')
+       tokenizerd = self.tokenizer.batch_encode_plus(inputs, return_tensors='pt', padding=True)
        pretrained_output, *_ = self.model.pretrained(
            input_ids=tokenizerd['input_ids'].to(self.device),
            attention_mask=tokenizerd['attention_mask'].to(self.device),
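The `seg` change adds `padding=True` so that `batch_encode_plus` can return rectangular tensors for sentences of different lengths; with `return_tensors='pt'` and no padding, unequal-length inputs cannot be stacked into a single tensor. A rough sketch of the behavior with a generic Hugging Face tokenizer — the `bert-base-chinese` checkpoint is chosen only for illustration; LTP loads its own tokenizer from `path`:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese', use_fast=True)

batch = ['他叫汤姆去拿外衣。', '汤姆生病了。']
encoded = tokenizer.batch_encode_plus(batch, return_tensors='pt', padding=True)

# Both sequences are padded to the length of the longest one; the attention
# mask is 0 on padded positions so the pretrained encoder ignores them.
print(encoded['input_ids'].shape)
print(encoded['attention_mask'])
```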
