Skip to content

Commit

Permalink
补充 MSRA NER语料库示例
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Mar 16, 2020
1 parent 4f2ca6a commit d1629e9
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 0 deletions.
40 changes: 40 additions & 0 deletions tests/book/ch08/demo_msra_ner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# -*- coding:utf-8 -*-
# Author:hankcs
# Date: 2018-07-29 15:06
# 《自然语言处理入门》8.5.3 基于感知机序列标注的命名实体识别
# 配套书籍:http://nlp.hankcs.com/book.php
# 讨论答疑:https://bbs.hankcs.com/
from pyhanlp import *
from tests.book.ch07.demo_perceptron_pos import POSTrainer
from tests.book.ch08.demo_hmm_ner import PerceptronSegmenter, PerceptronPOSTagger
from tests.book.ch08.demo_sp_ner import PerceptronNERecognizer, NERTrainer
from tests.book.ch08.msra_ner import MSRA_NER_TRAIN


def train_ner(corpus):
    """Return a perceptron NER recognizer for ``corpus``, training it if needed.

    The model file is ``ner.bin`` in the same directory as ``corpus``. When
    that file already exists it is loaded directly; otherwise a new model is
    trained on ``corpus``, written to that path, and wrapped in a recognizer.

    :param corpus: path to the training corpus file
    :return: a ``PerceptronNERecognizer`` built from the cached or newly
        trained model
    """
    model_path = os.path.join(os.path.dirname(corpus), 'ner.bin')
    if not os.path.isfile(model_path):
        ner_trainer = NERTrainer()
        label_set = ner_trainer.tagSet.nerLabels
        label_set.clear()  # do not recognize nr, ns, nt
        # NOTE(review): "CAPACTITY" is kept exactly as written; presumably it
        # matches the tag spelling used in the corpus -- confirm before changing.
        label_set.addAll([
            "AGE", "ANGLE", "AREA", "CAPACTITY", "DATE", "DECIMAL", "DURATION",
            "FRACTION", "FREQUENCY", "INTEGER", "LENGTH", "LOCATION", "MEASURE",
            "MONEY", "ORDINAL", "ORGANIZATION", "PERCENT", "PERSON", "PHONE",
            "POSTALCODE", "RATE", "SPEED", "TEMPERATURE", "TIME", "WEIGHT", "WWW",
        ])
        training_result = ner_trainer.train(corpus, model_path)
        return PerceptronNERecognizer(training_result.getModel())
    # A previously trained model is on disk: reuse it instead of retraining.
    return PerceptronNERecognizer(model_path)


def train_pos(corpus):
    """Return a perceptron POS tagger for ``corpus``, training it if needed.

    The model file is ``pos.bin`` in the same directory as ``corpus``. When
    that file already exists it is loaded directly; otherwise a new model is
    trained on ``corpus``, written to that path, and wrapped in a tagger.

    :param corpus: path to the training corpus file
    :return: a ``PerceptronPOSTagger`` built from the cached or newly trained
        model
    """
    model_path = os.path.join(os.path.dirname(corpus), 'pos.bin')
    if not os.path.isfile(model_path):
        training_result = POSTrainer().train(corpus, model_path)
        return PerceptronPOSTagger(training_result.getModel())
    # A previously trained model is on disk: reuse it instead of retraining.
    return PerceptronPOSTagger(model_path)


if __name__ == '__main__':
    # Build (or load) the POS tagger first, then the NER recognizer, both
    # from the MSRA NER training corpus.
    pos_tagger = train_pos(MSRA_NER_TRAIN)
    ner_recognizer = train_ner(MSRA_NER_TRAIN)
    # Combine segmenter, tagger and recognizer into one lexical analyzer.
    lexical_analyzer = PerceptronLexicalAnalyzer(
        PerceptronSegmenter(), pos_tagger, ner_recognizer)
    lexical_analyzer.enableCustomDictionary(False)
    demo_sentence = '2008年5月20日山东大连气温30多摄氏度,王莲香首场赢下李钊颖,中国女队有机会赢下韩国队'
    print(lexical_analyzer.analyze(demo_sentence))
14 changes: 14 additions & 0 deletions tests/book/ch08/msra_ner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# -*- coding:utf-8 -*-
# Author:hankcs
# Date: 2018-07-04 17:41
# 《自然语言处理入门》8.5.3 基于感知机序列标注的命名实体识别
# 配套书籍:http://nlp.hankcs.com/book.php
# 讨论答疑:https://bbs.hankcs.com/
import os

from tests.test_utility import ensure_data

# Base location of the MSRA NER corpus, as returned by ensure_data for the
# "msra-ne" archive; it is used below as the directory holding the corpus files.
MSRA_NER = ensure_data("msra-ne", "http://file.hankcs.com/corpus/msra-ne.zip")
# Paths of the training file, the test file and the model file inside MSRA_NER.
MSRA_NER_TRAIN, MSRA_NER_TEST, MSRA_NER_MODEL = (
    os.path.join(MSRA_NER, file_name)
    for file_name in ('train.txt', 'test.txt', 'model.bin')
)

0 comments on commit d1629e9

Please sign in to comment.