Skip to content

Commit

Permalink
update rules for clean_text
Browse files Browse the repository at this point in the history
update rules for clean_text
  • Loading branch information
dongrixinyu committed Aug 18, 2021
1 parent e1742cc commit 42761e2
Show file tree
Hide file tree
Showing 6 changed files with 10 additions and 9 deletions.
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
<a alt="Downloads">
<img src="https://img.shields.io/badge/downloads-4k-yellow" /></a>
<a alt="Version">
<img src="https://img.shields.io/badge/version-1.3.27-green" /></a>
<img src="https://img.shields.io/badge/version-1.3.28-green" /></a>
<a href="https://github.com/dongrixinyu/JioNLP/pulse" alt="Activity">
<img src="https://img.shields.io/github/commit-activity/m/dongrixinyu/JioNLP?color=blue" /></a>
</p>
Expand Down Expand Up @@ -95,6 +95,7 @@ $ pip install .
$ pip install jionlp
```


## 使用 Features

- 导入工具包,查看工具包的主要功能与函数注释
Expand Down Expand Up @@ -230,7 +231,7 @@ $ jio_help
- 如感兴趣合作完善本工具包,请参考 **TODO.txt** 文件进行功能添加。

### 做 NLP 不易,欢迎加入自然语言处理 WeChat 交流群
### 请先添加vx:dongrixinyu89
### 如以下二维码失效,请先添加 vx:dongrixinyu89
![image](../../blob/master/image/qr_code_for_collection.png)


Binary file modified image/qr_code_for_collection.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion jionlp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
# description: Preprocessing tool for Chinese NLP
"""

__version__ = '1.3.27'
__version__ = '1.3.28'

import os

Expand Down
4 changes: 2 additions & 2 deletions jionlp/gadget/time_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4473,14 +4473,14 @@ def string2handler(datetime_obj):
29, leap_month) # 当农历月无30 天时,按 29天计算

return string2handler(first_solar_time_handler),\
string2handler(second_solar_time_handler)
string2handler(second_solar_time_handler)

else:
solar_time_handler = self.lunar2solar(
lunar_time_handler[0], lunar_time_handler[1],
lunar_time_handler[2], leap_month)
return string2handler(solar_time_handler),\
string2handler(solar_time_handler)
string2handler(solar_time_handler)

def _parse_solar_terms(self, year, solar_term):
"""解析24节气
Expand Down
6 changes: 3 additions & 3 deletions jionlp/rule/rule_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
# 以及相应的常用符号,单字节符号、标点符号等。而日文、俄文、拉丁、希腊、数学公式、
# 物理单位等符号 绝大多数不常用的都被丢弃。其中 㐀-䶵 指的是另一个汉字字符集
# 仅保留了常用符号,数字标识,如 ① 等
UNICODE_EXCEPTION_PATTERN = '[^‐-”•…‰※℃℉Ⅰ-ⅹ①-⒛\u3000-】〔-〞㈠-㈩一-龥﹐-﹫!-~¢£¥]'
UNICODE_EXCEPTION_PATTERN = '[^‐-”•·・…‰※℃℉Ⅰ-ⅹ①-⒛\u3000-】〔-〞㈠-㈩一-龥﹐-﹫!-~¢£¥]'
EXCEPTION_PATTERN = ASCII_EXCEPTION_PATTERN[:-1] + UNICODE_EXCEPTION_PATTERN[2:]

# ---------------------------------------------------------------------
Expand Down Expand Up @@ -155,7 +155,7 @@
# ---------------------------------------------------------------------
# 冗余字符处理
# 文本中有连续的 “哈哈哈哈哈” 等字符串,需要删除冗余字符串,返回为 “哈”
REDUNDANT_PATTERN = ' -\t\n啊哈呀~\u3000\xa0'
REDUNDANT_PATTERN = ' -\t\n啊哈呀~\u3000\xa0•·・'

# ---------------------------------------------------------------------
# 纯数字格式,用于过滤停用词时,过滤掉纯数字(包括汉字数字)
Expand Down Expand Up @@ -305,7 +305,7 @@
# 固定公历节日
FIXED_SOLAR_FESTIVAL = r'((元旦|十一)|(三八|五一|六一|七一|八一|国庆|圣诞)(节)?|'\
r'((三八)?妇女|女神|植树|(五一)?劳动|(五四)?青年|(六一)?儿童|(七一)?建党|(八一)?建军|教师|情人|愚人|万圣|护士)节|'\
r'地球日|三[\.·]?一五|双(十一|11)|(.{1,4})?消费者权益日)'
r'地球日|三[\.•·・]?一五|双(十一|11)|(.{1,4})?消费者权益日)'
# 固定农历节日
FIXED_LUNAR_FESTIVAL = r'((春|填仓|上巳|寒食|清明|浴佛|姑姑|财神|下元|寒衣)节|'\
r'(龙抬头|除夕)|'\
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
__maintainer__ = "dongrixinyu"
__email__ = "[email protected]"
__url__ = 'https://github.com/dongrixinyu/JioNLP'
__description__ = 'Preprocessing tool for Chinese NLP'
__description__ = 'Chinese NLPreprocessing & Parsing'


with open(os.path.join(DIR_PATH, 'requirements.txt'),
Expand Down

0 comments on commit 42761e2

Please sign in to comment.