Skip to content

Commit

Permalink
对用户传入的已进行分词处理的数据进行二次分词以便提高准确性
Browse files Browse the repository at this point in the history
因为用户的分词结果不一定有对应的词组数据,二次分词后可能有对应的词组数据。

比如:`你要重新考虑`

用户分词结果: `['你', '要', '重新考虑']`
二次分词结果: `['你', '要', '重新', '考虑']`

没有 `重新考虑` 这个词组的拼音数据,但是有 `重新` 这个词组的拼音数据
  • Loading branch information
mozillazg committed Apr 21, 2018
1 parent 4072b88 commit 717ce93
Show file tree
Hide file tree
Showing 7 changed files with 23 additions and 16 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ Changelog
`0.30.1`_ (未发布)
++++++++++++++++++++++++

* **[Improved]** 对用户传入的已进行分词处理的数据进行二次分词以便提高准确性。
* **[Improved]** 使用 `pinyin-data`_ v0.5.1 的拼音数据。


Expand Down
1 change: 1 addition & 0 deletions pypinyin/contrib/mmseg.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def __contains__(self, key):
#: >>> text = '你好,我是中国人,我爱我的祖国'
#: >>> seg.cut(text)
#: <generator object Seg.cut at 0x10b2df2b0>
#:
#: >>> list(seg.cut(text))
#: ['你好', ',', '我', '是', '中国人', ',', '我', '爱',
#: '我的', '祖', '国']
Expand Down
24 changes: 12 additions & 12 deletions pypinyin/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
)
from pypinyin.contrib import mmseg
from pypinyin.utils import simple_seg, _replace_tone2_style_dict_to_default
from pypinyin.style import auto_discover, convert
from pypinyin.style import auto_discover, convert as convert_style

auto_discover()

Expand Down Expand Up @@ -77,15 +77,7 @@ def to_fixed(pinyin, style, strict=True):
:return: 根据拼音风格格式化后的拼音字符串
:rtype: unicode
"""
return convert(pinyin, style=style, strict=strict, default=pinyin)

# # 韵母
# elif style in [FINALS, FINALS_TONE, FINALS_TONE2, FINALS_TONE3]:
# # 不处理鼻音: 'ḿ', 'ń', 'ň', 'ǹ'
# if pinyin and pinyin[0] not in [
# '\u1e3f', '\u0144', '\u0148', '\u01f9'
# ]:
# py = final(py, strict=strict)
return convert_style(pinyin, style=style, strict=strict, default=pinyin)


def _handle_nopinyin_char(chars, errors='default'):
Expand Down Expand Up @@ -217,6 +209,8 @@ def foobar(char):
:return: 拼音列表
:rtype: list
:raise AssertionError: 当传入的字符串不是 unicode 字符时会抛出这个异常
Usage::
>>> from pypinyin import pinyin, Style
Expand All @@ -234,9 +228,11 @@ def foobar(char):
"""
# 对字符串进行分词处理
if isinstance(hans, text_type):
hans = seg(hans)
han_list = seg(hans)
else:
han_list = chain(*(seg(x) for x in hans))
pys = []
for words in hans:
for words in han_list:
pys.extend(_pinyin(words, style, heteronym, errors, strict=strict))
return pys

Expand All @@ -256,6 +252,8 @@ def slug(hans, style=Style.NORMAL, heteronym=False, separator='-',
:param strict: 是否严格遵照《汉语拼音方案》来处理声母和韵母,详见 :ref:`strict`
:return: slug 字符串.
:raise AssertionError: 当传入的字符串不是 unicode 字符时会抛出这个异常
::
>>> import pypinyin
Expand Down Expand Up @@ -290,6 +288,8 @@ def lazy_pinyin(hans, style=Style.NORMAL, errors='default', strict=True):
:return: 拼音列表(e.g. ``['zhong', 'guo', 'ren']``)
:rtype: list
:raise AssertionError: 当传入的字符串不是 unicode 字符时会抛出这个异常
Usage::
>>> from pypinyin import lazy_pinyin, Style
Expand Down
2 changes: 1 addition & 1 deletion pypinyin/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def _seg(chars):


def simple_seg(hans):
"""将传入的字符串按是否有拼音来分割"""
"""将传入的字符串按是否是汉字来分割"""
assert not isinstance(hans, bytes_type), \
'must be unicode string or [unicode, ...] list'

Expand Down
2 changes: 1 addition & 1 deletion pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
python_files = test_*.py
python_classes = Test
python_functions = test
addopts = -slv --tb=short --pdb
addopts = -slv --cov-report term-missing --tb=short --pdb
norecursedirs = .git __pycache__
5 changes: 5 additions & 0 deletions tests/test_pinyin.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,11 @@ def test_simple_seg():
['侵略', {'style': BOPOMOFO}, ['ㄑㄧㄣ', 'ㄌㄩㄝˋ']],
['侵略', {'style': CYRILLIC}, ['цинь1', 'люэ4']],
['〇', {'style': TONE}, ['líng']],
# 二次分词
['你要重新考虑', {'style': TONE}, [
'nǐ', 'yào', 'zhòng', 'xīn', 'kǎo', 'lǜ']],
[['你要', '重新考虑'], {'style': TONE}, [
'nǐ', 'yào', 'chóng', 'xīn', 'kǎo', 'lǜ']],
]


Expand Down
4 changes: 2 additions & 2 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ envlist = py26, py27, py33, py34, py35, py36, pypy, py3_env
deps =
pytest
pytest-cov
pytest-random
pytest-random-order

[testenv]
deps = {[base]deps}
Expand All @@ -26,7 +26,7 @@ commands = py.test --cov pypinyin tests/_test_env.py
deps =
pytest<3.3.0
pytest-cov
pytest-random
pytest-random-order

[testenv:py26]
deps = {[oldpy]deps}
Expand Down

0 comments on commit 717ce93

Please sign in to comment.