diff --git a/README.md b/README.md index 7662ef6..da3c4ee 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ - +

@@ -47,7 +47,7 @@ print(res) ``` -#### Update 2021-09-20 +#### Update 2021-10-11 ## 新增 [时间语义解析](../../wiki/时间语义解析-说明文档#user-content-时间语义解析) #### jio.parse_time 给定时间字符串,解析其为时间戳、时长等。 @@ -80,26 +80,8 @@ print(res) - 目前支持的所有 [测试用例](../../blob/master/test/test_time_parser.py) #### 完整示例 -![image](../../blob/master/image/time_entity.png) -- 一般首先对文本进行时间类型的实体识别,得到如上图的时间实体。然后选定其中的 time_base(选定新闻发布时间2021-07-15 09:03:47),即可采用本工具处理,得到结果如下: -``` -7月15日 time_point ['2021-07-15 00:00:00', '2021-07-15 23:59:59'] -今年上半年 time_span ['2021-01-01 00:00:00', '2021-06-30 23:59:59'] -两年 time_delta {'year': 2.0} -一季度 time_span ['2021-01-01 00:00:00', '2021-03-31 23:59:59'] -二季度 time_span ['2021-04-01 00:00:00', '2021-06-30 23:59:59'] -上半年 time_span ['2021-01-01 00:00:00', '2021-06-30 23:59:59'] -春节 time_point ['2021-02-12 00:00:00', '2021-02-12 23:59:59'] -五一 time_point ['2021-05-01 00:00:00', '2021-05-01 23:59:59'] -端午 time_point ['2021-06-14 00:00:00', '2021-06-14 23:59:59'] -6月份 time_point ['2021-06-01 00:00:00', '2021-06-30 23:59:59'] -16个月 time_delta {'month': 16.0} -从2018年至今 time_span ['2018-01-01 00:00:00', '2021-07-15 09:03:47'] -每年 time_period {'delta': {'year': 1}, 'point': None} -1-5月份 time_span ['2021-01-01 00:00:00', '2021-05-31 23:59:59'] -2020年1-5月份 time_span ['2020-01-01 00:00:00', '2020-05-31 23:59:59'] -2021-07-15 09:03:47 time_point ['2021-07-15 09:03:47', '2021-07-15 09:03:47'] -``` +[JioNLP在线版-时间抽取与解析](http://182.92.160.94:16666/#/extract_time) + ## 安装 Installation @@ -258,6 +240,6 @@ $ jio_help ### 做 NLP不易,欢迎加入自然语言处理 Wechat 交流群 ### 如以下码失效,请先添加 vx:dongrixinyu89 ![image](../../blob/master/image/qr_code_for_collection.png) -### 如您感觉本工具对您有帮助,可以扫码请作者喝杯咖啡 (●'◡'●) +### 如本工具对您有帮助,可以扫码请作者喝杯咖啡 (●'◡'●) ![image](../../blob/master/image/payment_code.png) diff --git a/image/qr_code_for_collection.png b/image/qr_code_for_collection.png index 8bdb202..dea110b 100644 Binary files a/image/qr_code_for_collection.png and b/image/qr_code_for_collection.png differ diff --git a/jionlp/__init__.py b/jionlp/__init__.py index 254e614..7d95fc3 100644 --- a/jionlp/__init__.py +++ b/jionlp/__init__.py @@ -8,7 +8,7 @@ # description: Preprocessing tool for Chinese NLP """ -__version__ = '1.3.36' +__version__ = '1.3.37' import os diff --git a/jionlp/gadget/time_parser.py b/jionlp/gadget/time_parser.py index 8058108..7455b0f 100644 --- a/jionlp/gadget/time_parser.py +++ b/jionlp/gadget/time_parser.py @@ -238,7 +238,7 @@ def _preprocess_regular_expression(self): # --------- TIME POINT & TIME SPAN --------- # `标准数字 年、月、日`:`2016-05-22`、`1987.12-3` self.standard_year_month_day_pattern = re.compile( - r'((17|18|19|20|21)\d{2})[\-./](1[012]|[0]?\d)([\-./](30|31|[012]?\d))?[ \t\u3000]?|' + r'((17|18|19|20|21)\d{2})[\-./](1[012]|[0]?\d)([\-./](30|31|[012]?\d))?[ \t\u3000\-./]?|' r'(1[012]|[0]?\d)[·\-](30|31|[012]?\d)') # `标准数字 年`:`2018` @@ -754,7 +754,7 @@ def _compare_handler(first_handler, second_handler): @staticmethod def _cleansing(time_string): - return time_string.strip()# .replace(' ', '') + return time_string.strip() # .replace(' ', '') def __call__(self, time_string, time_base=time.time(), time_type=None, ret_type='str', strict=False, virtual_time=False, ret_future=False): @@ -916,7 +916,7 @@ def _adjust_underlying_future_time(self, time_string): time_string = '下' + time_string else: pass - print(time_string) + return time_string def parse_time_span_point(self, time_string): @@ -1089,8 +1089,13 @@ def _seg_or_not_first(self, time_string): # 强制不 seg pattern no_seg_patterns = [self.time_span_no_seg_standard_year_month_day] for pattern in no_seg_patterns: - matched_string = TimeParser.parse_pattern(time_string, pattern) - if matched_string is not None and matched_string != '': + # matched_string = TimeParser.parse_pattern(time_string, pattern) + searched_res = pattern.search(time_string) + if searched_res: + start_idx = searched_res.span()[0] + end_idx = searched_res.span()[1] + time_string = time_string[start_idx: end_idx].replace('-', '䶵') + # 匹配到后,须进行替换 time_string = time_string.replace('-', '䶵') break @@ -1654,6 +1659,23 @@ def normalize_special_time_span(self, time_string): def normalize_standard_year_month_day(self, time_string): """ 解析 标准数字 年月日(标准) 时间 """ + # 清洗 time_string 的边缘杂字符串,如`2018-02-09-`,其原字符串可能为 + # `2018-02-09-11:20` + def pattern_strip(ymd_segs, time_string): + head = ymd_segs.search(time_string[0]) + tail = ymd_segs.search(time_string[-1]) + while head or tail: + if head: + time_string = time_string[1:] + if tail: + time_string = time_string[:-1] + head = ymd_segs.search(time_string[0]) + tail = ymd_segs.search(time_string[-1]) + + return time_string + + time_string = pattern_strip(self.ymd_segs, time_string) + colon_num = len(self.ymd_segs.findall(time_string)) if colon_num == 2: year, month, day = self.ymd_segs.split(time_string) diff --git a/jionlp/rule/rule_pattern.py b/jionlp/rule/rule_pattern.py index fd78dec..793cbb8 100644 --- a/jionlp/rule/rule_pattern.py +++ b/jionlp/rule/rule_pattern.py @@ -342,8 +342,8 @@ # 将时间进行转换 -DELTA_SUB = r'([之以]?[内前后上下来])' - +# DELTA_SUB = r'([之以]?[内前后上下来])' +DELTA_SUB = r'([之以]?[内前后来])' ######################################################################## # 时间 NER 字符规则 diff --git a/test/test_time_parser.py b/test/test_time_parser.py index ef9533d..9904331 100644 --- a/test/test_time_parser.py +++ b/test/test_time_parser.py @@ -32,6 +32,7 @@ def test_time_parser(self): ['6·30', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-06-30 00:00:00', '2021-06-30 23:59:59']}], ['2018', _ts_1, {'type': 'time_span', 'definition': 'accurate', 'time': ['2018-01-01 00:00:00', '2018-12-31 23:59:59']}], ['2021-09-0910:09', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-09-09 10:09:00', '2021-09-09 10:09:59']}], + ['2021-09-12-11:23', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-09-12 11:23:00', '2021-09-12 11:23:59']}], ['09-01 20:01', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-09-01 20:01:00', '2021-09-01 20:01:59']}], ['09-01 20:01 至 12-01 18:07', _ts_1, {'type': 'time_span', 'definition': 'accurate', 'time': ['2021-09-01 20:01:00', '2021-12-01 18:07:59']}], ['09-01', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-09-01 00:00:00', '2021-09-01 23:59:59']}], @@ -252,6 +253,7 @@ def test_time_parser(self): ['7月4日晚上7点09分18秒', {'year': 2021}, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-07-04 19:09:18', '2021-07-04 19:09:18']}], ['去年7月4日晚上7点09分', {'year': 2021}, {'type': 'time_point', 'definition': 'accurate', 'time': ['2020-07-04 19:09:00', '2020-07-04 19:09:59']}], ['早上7点', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-06-14 07:00:00', '2021-06-14 07:59:59']}], + ['9日上午', _ts_1, {'type': 'time_point', 'definition': 'blur', 'time': ['2021-06-09 07:00:00', '2021-06-09 11:59:59']}], ['下月15号下午6点', _ts_1, {'type': 'time_point', 'definition': 'accurate', 'time': ['2021-07-15 18:00:00', '2021-07-15 18:59:59']}], # 存在6点前,包不包含6点的问题,须设置参数判定 ['下月15号下午6点前', _ts_1, {'type': 'time_span', 'definition': 'accurate', 'time': ['2021-06-14 01:06:40', '2021-07-15 18:59:59']}],