forked from shibing624/pycorrector
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenizer.py
67 lines (61 loc) · 1.99 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: 配置切词器
"""
import logging
import os
import jieba
from jieba import posseg
def segment(sentence, cut_type='word', pos=False):
    """
    Segment a sentence into tokens.

    :param sentence: input text (str)
    :param cut_type: 'word' -> jieba word segmentation; 'char' -> split into characters
    :param pos: if True, also return part-of-speech tags alongside the tokens
    :return: list of tokens when pos=False; (tokens, pos_tags) tuple when pos=True
    :raises ValueError: if cut_type is neither 'word' nor 'char'
    """
    # Fail fast on an unknown cut_type: the original fell through and
    # implicitly returned None, which broke callers far from the cause.
    if cut_type not in ('word', 'char'):
        raise ValueError("cut_type must be 'word' or 'char', got %r" % (cut_type,))
    if not pos:
        return jieba.lcut(sentence) if cut_type == 'word' else list(sentence)
    if cut_type == 'word':
        word_seq, pos_seq = [], []
        for word, flag in posseg.lcut(sentence):
            word_seq.append(word)
            pos_seq.append(flag)
        return word_seq, pos_seq
    # char mode with POS: tag each character individually
    word_seq = list(sentence)
    pos_seq = [posseg.lcut(ch)[0].flag for ch in word_seq]
    return word_seq, pos_seq
class Tokenizer(object):
    """Thin wrapper that configures jieba with custom dictionaries.

    NOTE(review): jieba keeps module-level global state, so words added
    here are visible to every other user of jieba in this process.
    """

    def __init__(self, dict_path='', custom_word_freq_dict=None, custom_confusion_dict=None):
        self.model = jieba
        self.model.default_logger.setLevel(logging.ERROR)
        # Swap in the main dictionary only when the path actually exists
        if os.path.exists(dict_path):
            self.model.set_dictionary(dict_path)
        # Register user-supplied words together with their frequencies
        for word, freq in (custom_word_freq_dict or {}).items():
            self.model.add_word(word, freq=freq)
        # Register both sides of each confusion pair so the segmenter
        # treats them as known words
        for variant, canonical in (custom_confusion_dict or {}).items():
            self.model.add_word(variant)
            self.model.add_word(canonical)

    def tokenize(self, sentence):
        """
        Segment a sentence with position information.

        :param sentence: input text (str)
        :return: list of (word, start_index, end_index) tuples
        """
        return list(self.model.tokenize(sentence))