Merge pull request fxsjy#309 from gumblex/master
Load the default dictionary with pkg_resources
fxsjy committed Nov 13, 2015
2 parents 70f019b + 8814e08 commit f73a218
Showing 11 changed files with 55 additions and 63 deletions.
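The gist of the change: stop deriving data-file locations from __file__ and instead ask pkg_resources for a stream, so resources resolve even when jieba is installed as a zipped egg. A minimal standalone sketch of the two approaches (not code from this commit; the resource name mirrors the repo's dict.txt):

import os
import pkg_resources

# Old style: an absolute path computed from this module's __file__.
# This breaks when the package lives inside a zip archive (egg).
dict_path = os.path.normpath(os.path.join(
    os.getcwd(), os.path.dirname(__file__), "dict.txt"))

# New style: a binary stream for a resource packaged with the module.
# pkg_resources handles zipped installs transparently.
dict_stream = pkg_resources.resource_stream(__name__, "dict.txt")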
36 changes: 23 additions & 13 deletions jieba/__init__.py
@@ -20,11 +20,10 @@
 else:
     _replace_file = os.rename
 
-_get_module_path = lambda path: os.path.normpath(os.path.join(os.getcwd(),
-                                                 os.path.dirname(__file__), path))
 _get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path))
 
-DEFAULT_DICT = _get_module_path("dict.txt")
+DEFAULT_DICT = None
+DEFAULT_DICT_NAME = "dict.txt"
 
 log_console = logging.StreamHandler(sys.stderr)
 default_logger = logging.getLogger(__name__)
@@ -54,7 +53,10 @@ class Tokenizer(object):
 
     def __init__(self, dictionary=DEFAULT_DICT):
         self.lock = threading.RLock()
-        self.dictionary = _get_abs_path(dictionary)
+        if dictionary == DEFAULT_DICT:
+            self.dictionary = dictionary
+        else:
+            self.dictionary = _get_abs_path(dictionary)
         self.FREQ = {}
         self.total = 0
         self.user_word_tag_tab = {}
@@ -65,10 +67,11 @@ def __init__(self, dictionary=DEFAULT_DICT):
     def __repr__(self):
         return '<Tokenizer dictionary=%r>' % self.dictionary
 
-    def gen_pfdict(self, f_name):
+    def gen_pfdict(self, f):
         lfreq = {}
         ltotal = 0
-        with open(f_name, 'rb') as f:
+        f_name = resolve_filename(f)
+        with f:
             for lineno, line in enumerate(f, 1):
                 try:
                     line = line.strip().decode('utf-8')
@@ -105,7 +108,7 @@ def initialize(self, dictionary=None):
             if self.initialized:
                 return
 
-            default_logger.debug("Building prefix dict from %s ..." % abs_path)
+            default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
             t1 = time.time()
             if self.cache_file:
                 cache_file = self.cache_file
@@ -122,7 +125,8 @@
                 tmpdir = os.path.dirname(cache_file)
 
             load_from_cache_fail = True
-            if os.path.isfile(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
+            if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or
+                os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
                 default_logger.debug(
                     "Loading model from cache %s" % cache_file)
                 try:
@@ -136,7 +140,7 @@
                 wlock = DICT_WRITING.get(abs_path, threading.RLock())
                 DICT_WRITING[abs_path] = wlock
                 with wlock:
-                    self.FREQ, self.total = self.gen_pfdict(abs_path)
+                    self.FREQ, self.total = self.gen_pfdict(self.get_dict_file())
                     default_logger.debug(
                         "Dumping model to file cache %s" % cache_file)
                     try:
Expand Down Expand Up @@ -343,8 +347,11 @@ def _lcut_all(self, sentence):
def _lcut_for_search_no_hmm(self, sentence):
return self.lcut_for_search(sentence, False)

def get_abs_path_dict(self):
return _get_abs_path(self.dictionary)
def get_dict_file(self):
if self.dictionary == DEFAULT_DICT:
return get_module_res(DEFAULT_DICT_NAME)
else:
return open(self.dictionary, 'rb')

def load_userdict(self, f):
'''
@@ -363,14 +370,17 @@ def load_userdict(self, f):
         '''
         self.check_initialized()
         if isinstance(f, string_types):
+            f_name = f
             f = open(f, 'rb')
+        else:
+            f_name = resolve_filename(f)
         for lineno, ln in enumerate(f, 1):
             line = ln.strip()
             if not isinstance(line, text_type):
                 try:
                     line = line.decode('utf-8').lstrip('\ufeff')
                 except UnicodeDecodeError:
-                    raise ValueError('dictionary file %s must be utf-8' % f.name)
+                    raise ValueError('dictionary file %s must be utf-8' % f_name)
             if not line:
                 continue
             # match won't be None because there's at least one character
@@ -494,7 +504,7 @@ def set_dictionary(self, dictionary_path):
 lcut_for_search = dt.lcut_for_search
 del_word = dt.del_word
 get_DAG = dt.get_DAG
-get_abs_path_dict = dt.get_abs_path_dict
+get_dict_file = dt.get_dict_file
 initialize = dt.initialize
 load_userdict = dt.load_userdict
 set_dictionary = dt.set_dictionary
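After this change, callers that previously took get_abs_path_dict() for a path should use get_dict_file(), which returns an open binary stream. A hedged usage sketch against a build containing this commit (output details are illustrative):

import jieba

jieba.initialize()  # builds the prefix dict from the bundled dict.txt

# get_dict_file() returns a pkg_resources stream for the default
# dictionary, or open(path, 'rb') for one set via set_dictionary().
with jieba.get_dict_file() as f:
    print(f.readline().decode('utf-8'))  # first "word freq tag" entry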
15 changes: 15 additions & 0 deletions jieba/_compat.py
@@ -1,6 +1,15 @@
 # -*- coding: utf-8 -*-
+import os
 import sys
 
+try:
+    import pkg_resources
+    get_module_res = lambda *res: pkg_resources.resource_stream(__name__,
+                                                                os.path.join(*res))
+except ImportError:
+    get_module_res = lambda *res: open(os.path.normpath(os.path.join(
+        os.getcwd(), os.path.dirname(__file__), *res)), 'rb')
+
 PY2 = sys.version_info[0] == 2
 
 default_encoding = sys.getfilesystemencoding()
@@ -29,3 +38,9 @@ def strdecode(sentence):
         except UnicodeDecodeError:
             sentence = sentence.decode('gbk', 'ignore')
     return sentence
+
+def resolve_filename(f):
+    try:
+        return f.name
+    except AttributeError:
+        return repr(f)
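Both helpers can be exercised directly; a quick sketch of their contracts (paths and output are illustrative):

import io
from jieba._compat import get_module_res, resolve_filename

f = get_module_res("dict.txt")  # binary stream, via pkg_resources when available
print(resolve_filename(f))      # the stream's .name if it has one...

buf = io.BytesIO(b"word 3 n\n")
print(resolve_filename(buf))    # ...otherwise repr(), e.g. for BytesIO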
23 changes: 4 additions & 19 deletions jieba/finalseg/__init__.py
@@ -1,8 +1,8 @@
 from __future__ import absolute_import, unicode_literals
 import re
 import os
-import marshal
 import sys
+import pickle
 from .._compat import *
 
 MIN_FLOAT = -3.14e100
@@ -21,24 +21,9 @@
 
 
 def load_model():
-    _curpath = os.path.normpath(
-        os.path.join(os.getcwd(), os.path.dirname(__file__)))
-
-    start_p = {}
-    abs_path = os.path.join(_curpath, PROB_START_P)
-    with open(abs_path, 'rb') as f:
-        start_p = marshal.load(f)
-
-    trans_p = {}
-    abs_path = os.path.join(_curpath, PROB_TRANS_P)
-    with open(abs_path, 'rb') as f:
-        trans_p = marshal.load(f)
-
-    emit_p = {}
-    abs_path = os.path.join(_curpath, PROB_EMIT_P)
-    with open(abs_path, 'rb') as f:
-        emit_p = marshal.load(f)
-
+    start_p = pickle.load(get_module_res("finalseg", PROB_START_P))
+    trans_p = pickle.load(get_module_res("finalseg", PROB_TRANS_P))
+    emit_p = pickle.load(get_module_res("finalseg", PROB_EMIT_P))
     return start_p, trans_p, emit_p
 
 if sys.platform.startswith("java"):
Binary file modified jieba/finalseg/prob_emit.p
Binary file modified jieba/finalseg/prob_start.p
Binary file modified jieba/finalseg/prob_trans.p
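The binary model files change alongside the code because marshal and pickle are incompatible formats: pickle.load cannot read a marshal-written file. A hedged sketch of the one-off conversion this implies (not part of the commit; run with the old files in the current directory):

import marshal
import pickle

# Re-serialize the marshal-format models as pickles; protocol 2 is
# readable by both Python 2 and Python 3, unlike marshal's format,
# which is not guaranteed stable across interpreter versions.
for name in ("prob_start.p", "prob_trans.p", "prob_emit.p"):
    with open(name, "rb") as old:
        data = marshal.load(old)
    with open(name, "wb") as new:
        pickle.dump(data, new, protocol=2)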
44 changes: 13 additions & 31 deletions jieba/posseg/__init__.py
@@ -3,7 +3,7 @@
 import re
 import sys
 import jieba
-import marshal
+import pickle
 from .._compat import *
 from .viterbi import viterbi
 
@@ -23,36 +23,17 @@
 re_eng1 = re.compile('^[a-zA-Z0-9]$', re.U)
 
 
-def load_model(f_name):
-    _curpath = os.path.normpath(
-        os.path.join(os.getcwd(), os.path.dirname(__file__)))
+def load_model():
     # For Jython
-    start_p = {}
-    abs_path = os.path.join(_curpath, PROB_START_P)
-    with open(abs_path, 'rb') as f:
-        start_p = marshal.load(f)
-
-    trans_p = {}
-    abs_path = os.path.join(_curpath, PROB_TRANS_P)
-    with open(abs_path, 'rb') as f:
-        trans_p = marshal.load(f)
-
-    emit_p = {}
-    abs_path = os.path.join(_curpath, PROB_EMIT_P)
-    with open(abs_path, 'rb') as f:
-        emit_p = marshal.load(f)
-
-    state = {}
-    abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
-    with open(abs_path, 'rb') as f:
-        state = marshal.load(f)
-    f.closed
-
-    return state, start_p, trans_p, emit_p, result
+    start_p = pickle.load(get_module_res("posseg", PROB_START_P))
+    trans_p = pickle.load(get_module_res("posseg", PROB_TRANS_P))
+    emit_p = pickle.load(get_module_res("posseg", PROB_EMIT_P))
+    state = pickle.load(get_module_res("posseg", CHAR_STATE_TAB_P))
+    return state, start_p, trans_p, emit_p
 
 
 if sys.platform.startswith("java"):
-    char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model()
+    char_state_tab_P, start_P, trans_P, emit_P = load_model()
 else:
     from .char_state_tab import P as char_state_tab_P
     from .prob_start import P as start_P
@@ -89,7 +70,7 @@ class POSTokenizer(object):
 
     def __init__(self, tokenizer=None):
         self.tokenizer = tokenizer or jieba.Tokenizer()
-        self.load_word_tag(self.tokenizer.get_abs_path_dict())
+        self.load_word_tag(self.tokenizer.get_dict_file())
 
     def __repr__(self):
         return '<POSTokenizer tokenizer=%r>' % self.tokenizer
@@ -102,11 +83,12 @@ def __getattr__(self, name):
 
     def initialize(self, dictionary=None):
         self.tokenizer.initialize(dictionary)
-        self.load_word_tag(self.tokenizer.get_abs_path_dict())
+        self.load_word_tag(self.tokenizer.get_dict_file())
 
-    def load_word_tag(self, f_name):
+    def load_word_tag(self, f):
         self.word_tag_tab = {}
-        with open(f_name, "rb") as f:
+        f_name = resolve_filename(f)
+        with f:
             for lineno, line in enumerate(f, 1):
                 try:
                     line = line.strip().decode("utf-8")
Binary file modified jieba/posseg/char_state_tab.p
Binary file modified jieba/posseg/prob_emit.p
Binary file modified jieba/posseg/prob_start.p
Binary file modified jieba/posseg/prob_trans.p
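posseg follows the same pattern as finalseg, with POSTokenizer now reading its word/tag table from the stream returned by get_dict_file(). A quick smoke test (standard jieba usage; the printed tags are illustrative):

import jieba.posseg as pseg

# The word/tag table behind this call is now loaded from a file
# object (pkg_resources stream or open file), not an absolute path.
for word, flag in pseg.cut("我爱北京天安门"):
    print('%s %s' % (word, flag))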
