Skip to content

Commit

Permalink
update time_parser
Browse files Browse the repository at this point in the history
  • Loading branch information
dongrixinyu committed Oct 11, 2021
1 parent 36d8d0f commit ba9714a
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 31 deletions.
28 changes: 5 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
<a alt="Downloads">
<img src="https://img.shields.io/badge/downloads-5k-yellow" /></a>
<a alt="Version">
<img src="https://img.shields.io/badge/version-1.3.36-green" /></a>
<img src="https://img.shields.io/badge/version-1.3.37-green" /></a>
<a href="https://github.com/dongrixinyu/JioNLP/pulse" alt="Activity">
<img src="https://img.shields.io/github/commit-activity/m/dongrixinyu/JioNLP?color=blue" /></a>
</p>
Expand Down Expand Up @@ -47,7 +47,7 @@ print(res)

```

#### Update 2021-09-20
#### Update 2021-10-11
## 新增 [时间语义解析](../../wiki/时间语义解析-说明文档#user-content-时间语义解析)

#### jio.parse_time 给定时间字符串,解析其为时间戳、时长等。
Expand Down Expand Up @@ -80,26 +80,8 @@ print(res)
- 目前支持的所有 [测试用例](../../blob/master/test/test_time_parser.py)

#### 完整示例
![image](../../blob/master/image/time_entity.png)
- 一般首先对文本进行时间类型的实体识别,得到如上图的时间实体。然后选定其中的 time_base(选定新闻发布时间2021-07-15 09:03:47),即可采用本工具处理,得到结果如下:
```
7月15日 time_point ['2021-07-15 00:00:00', '2021-07-15 23:59:59']
今年上半年 time_span ['2021-01-01 00:00:00', '2021-06-30 23:59:59']
两年 time_delta {'year': 2.0}
一季度 time_span ['2021-01-01 00:00:00', '2021-03-31 23:59:59']
二季度 time_span ['2021-04-01 00:00:00', '2021-06-30 23:59:59']
上半年 time_span ['2021-01-01 00:00:00', '2021-06-30 23:59:59']
春节 time_point ['2021-02-12 00:00:00', '2021-02-12 23:59:59']
五一 time_point ['2021-05-01 00:00:00', '2021-05-01 23:59:59']
端午 time_point ['2021-06-14 00:00:00', '2021-06-14 23:59:59']
6月份 time_point ['2021-06-01 00:00:00', '2021-06-30 23:59:59']
16个月 time_delta {'month': 16.0}
从2018年至今 time_span ['2018-01-01 00:00:00', '2021-07-15 09:03:47']
每年 time_period {'delta': {'year': 1}, 'point': None}
1-5月份 time_span ['2021-01-01 00:00:00', '2021-05-31 23:59:59']
2020年1-5月份 time_span ['2020-01-01 00:00:00', '2020-05-31 23:59:59']
2021-07-15 09:03:47 time_point ['2021-07-15 09:03:47', '2021-07-15 09:03:47']
```
[JioNLP在线版-时间抽取与解析](http://182.92.160.94:16666/#/extract_time)


## 安装 Installation

Expand Down Expand Up @@ -258,6 +240,6 @@ $ jio_help
### 做 NLP 不易,欢迎加入自然语言处理 WeChat 交流群
### 如以下二维码失效,请先添加微信(vx):dongrixinyu89
![image](../../blob/master/image/qr_code_for_collection.png)
### 如您感觉本工具对您有帮助,可以扫码请作者喝杯咖啡 (●'◡'●)
### 如本工具对您有帮助,可以扫码请作者喝杯咖啡 (●'◡'●)
![image](../../blob/master/image/payment_code.png)

Binary file modified image/qr_code_for_collection.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion jionlp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
# description: Preprocessing tool for Chinese NLP
"""

__version__ = '1.3.36'
__version__ = '1.3.37'

import os

Expand Down
32 changes: 27 additions & 5 deletions jionlp/gadget/time_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ def _preprocess_regular_expression(self):
# --------- TIME POINT & TIME SPAN ---------
# `标准数字 年、月、日`:`2016-05-22`、`1987.12-3`
self.standard_year_month_day_pattern = re.compile(
r'((17|18|19|20|21)\d{2})[\-./](1[012]|[0]?\d)([\-./](30|31|[012]?\d))?[ \t\u3000]?|'
r'((17|18|19|20|21)\d{2})[\-./](1[012]|[0]?\d)([\-./](30|31|[012]?\d))?[ \t\u3000\-./]?|'
r'(1[012]|[0]?\d)[·\-](30|31|[012]?\d)')

# `标准数字 年`:`2018`
Expand Down Expand Up @@ -754,7 +754,7 @@ def _compare_handler(first_handler, second_handler):

@staticmethod
def _cleansing(time_string):
return time_string.strip()# .replace(' ', '')
return time_string.strip() # .replace(' ', '')

def __call__(self, time_string, time_base=time.time(), time_type=None,
ret_type='str', strict=False, virtual_time=False, ret_future=False):
Expand Down Expand Up @@ -916,7 +916,7 @@ def _adjust_underlying_future_time(self, time_string):
time_string = '下' + time_string
else:
pass
print(time_string)

return time_string

def parse_time_span_point(self, time_string):
Expand Down Expand Up @@ -1089,8 +1089,13 @@ def _seg_or_not_first(self, time_string):
# 强制不 seg pattern
no_seg_patterns = [self.time_span_no_seg_standard_year_month_day]
for pattern in no_seg_patterns:
matched_string = TimeParser.parse_pattern(time_string, pattern)
if matched_string is not None and matched_string != '':
# matched_string = TimeParser.parse_pattern(time_string, pattern)
searched_res = pattern.search(time_string)
if searched_res:
start_idx = searched_res.span()[0]
end_idx = searched_res.span()[1]
time_string = time_string[start_idx: end_idx].replace('-', '䶵')

# 匹配到后,须进行替换
time_string = time_string.replace('-', '䶵')
break
Expand Down Expand Up @@ -1654,6 +1659,23 @@ def normalize_special_time_span(self, time_string):

def normalize_standard_year_month_day(self, time_string):
""" 解析 标准数字 年月日(标准) 时间 """
# 清洗 time_string 的边缘杂字符串,如`2018-02-09-`,其原字符串可能为
# `2018-02-09-11:20`
def pattern_strip(ymd_segs, time_string):
    """Trim separator characters from both ends of ``time_string``.

    A character counts as a separator when ``ymd_segs.search`` matches it
    on its own. This removes residue left at the edges of a matched date,
    e.g. ``2018-02-09-`` cut out of ``2018-02-09-11:20``.

    Args:
        ymd_segs: compiled regular expression matching one separator
            character.
        time_string: the raw date string to clean.

    Returns:
        ``time_string`` without leading or trailing separators. An empty
        string is returned when the input is empty or consists solely of
        separators, instead of raising ``IndexError``.
    """
    start, end = 0, len(time_string)

    # The bounds checks keep the index valid even when every character
    # is a separator or the string is empty.
    while start < end and ymd_segs.search(time_string[start]):
        start += 1
    while end > start and ymd_segs.search(time_string[end - 1]):
        end -= 1

    return time_string[start:end]

time_string = pattern_strip(self.ymd_segs, time_string)

colon_num = len(self.ymd_segs.findall(time_string))
if colon_num == 2:
year, month, day = self.ymd_segs.split(time_string)
Expand Down
4 changes: 2 additions & 2 deletions jionlp/rule/rule_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,8 +342,8 @@


# 将时间进行转换
DELTA_SUB = r'([之以]?[内前后上下来])'

# DELTA_SUB = r'([之以]?[内前后上下来])'
DELTA_SUB = r'([之以]?[内前后来])'

########################################################################
# 时间 NER 字符规则
Expand Down
2 changes: 2 additions & 0 deletions test/test_time_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def test_time_parser(self):
['6·30', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-06-30 00:00:00', '2021-06-30 23:59:59']}],
['2018', _ts_1, {'type': 'time_span', 'definition': 'accurate', 'time': ['2018-01-01 00:00:00', '2018-12-31 23:59:59']}],
['2021-09-0910:09', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-09-09 10:09:00', '2021-09-09 10:09:59']}],
['2021-09-12-11:23', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-09-12 11:23:00', '2021-09-12 11:23:59']}],
['09-01 20:01', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-09-01 20:01:00', '2021-09-01 20:01:59']}],
['09-01 20:01 至 12-01 18:07', _ts_1, {'type': 'time_span', 'definition': 'accurate', 'time': ['2021-09-01 20:01:00', '2021-12-01 18:07:59']}],
['09-01', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-09-01 00:00:00', '2021-09-01 23:59:59']}],
Expand Down Expand Up @@ -252,6 +253,7 @@ def test_time_parser(self):
['7月4日晚上7点09分18秒', {'year': 2021}, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-07-04 19:09:18', '2021-07-04 19:09:18']}],
['去年7月4日晚上7点09分', {'year': 2021}, {'type': 'time_point', 'definition': 'accurate', 'time': ['2020-07-04 19:09:00', '2020-07-04 19:09:59']}],
['早上7点', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-06-14 07:00:00', '2021-06-14 07:59:59']}],
['9日上午', _ts_1, {'type': 'time_point', 'definition': 'blur', 'time': ['2021-06-09 07:00:00', '2021-06-09 11:59:59']}],
['下月15号下午6点', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-07-15 18:00:00', '2021-07-15 18:59:59']}],
# 存在6点前,包不包含6点的问题,须设置参数判定
['下月15号下午6点前', _ts_1, {'type': 'time_span', 'definition': 'accurate', 'time': ['2021-06-14 01:06:40', '2021-07-15 18:59:59']}],
Expand Down

0 comments on commit ba9714a

Please sign in to comment.