Skip to content

Commit

Permalink
update test and replace_aug
Browse files (browse the repository at this point in the history)
  • Loading branch information
dongrixinyu committed Mar 18, 2021
1 parent 25c7b9f commit 44f4615
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 3 deletions.
1 change: 0 additions & 1 deletion TODO.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
- 标注数据集 docker 镜像维护
- 数据增强 - 实体替换
- 数据增强 - 随机替换非关键字符
- 数据增强 - 其他
- 时间解析
Expand Down
7 changes: 5 additions & 2 deletions jionlp/textaug/replace_entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,11 @@ def _augment_one(self, text, entities):
if self.random.random() < self.replace_ratio:
# 将该实体从词典中随机选择一个做替换
orig_entity = self.random.choice(entities)
new_entity_text = self.random.choice(
list(self.entities_dict[orig_entity['type']].keys()))

candidate_list = list(self.entities_dict[orig_entity['type']].keys())
if len(candidate_list) == 0:
continue
new_entity_text = self.random.choice(candidate_list)

orig_len = len(orig_entity['text'])
new_len = len(new_entity_text)
Expand Down
19 changes: 19 additions & 0 deletions test/test_main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@

import unittest

from test_text_aug import TestTextAug


if __name__ == '__main__':
    # Imported here because it is only needed when run as a script.
    import sys

    # Build a suite from the explicitly selected test cases.
    suite = unittest.TestSuite()
    test_text_aug = [TestTextAug('test_ReplaceEntity')]
    suite.addTests(test_text_aug)

    runner = unittest.TextTestRunner(verbosity=1)
    result = runner.run(suite)

    # Report failures through the process exit status; without this the
    # script exits 0 even when a test fails, so callers cannot detect it.
    sys.exit(0 if result.wasSuccessful() else 1)



51 changes: 51 additions & 0 deletions test/test_text_aug.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@


import unittest

import jionlp as jio


class TestTextAug(unittest.TestCase):
    """Tests for the text data augmentation tools."""

    def test_ReplaceEntity(self):
        """Compare ``jio.ReplaceEntity`` output with the stored expectations."""

        # Expected augmented texts, one per augmented sample.
        expected_texts = ['腾讯致力于解决冲突,国力教育公司致力于玩。小马爱玩。',
                          '百度致力于解决冲突,阿里巴巴致力于玩。小马爱玩。',
                          '腾讯致力于解决冲突,阿里巴巴致力于玩。马成宇爱玩。']

        # Expected entity lists, aligned with ``expected_texts``.
        # Some offsets are tuples and some are lists; ``assertEqual`` treats
        # the two as different, so the literals are kept exactly as written.
        # The list-valued offsets sit on the entities whose text comes from
        # the replacement dictionary below.
        expected_entities = [
            [{'type': 'Company', 'text': '腾讯', 'offset': (0, 2)},
             {'text': '国力教育公司', 'type': 'Company', 'offset': [10, 16]},
             {'text': '小马', 'type': 'Person', 'offset': (21, 23)}],
            [{'text': '百度', 'type': 'Company', 'offset': [0, 2]},
             {'text': '阿里巴巴', 'type': 'Company', 'offset': (10, 14)},
             {'text': '小马', 'type': 'Person', 'offset': (19, 21)}],
            [{'type': 'Company', 'text': '腾讯', 'offset': (0, 2)},
             {'type': 'Company', 'text': '阿里巴巴', 'offset': (10, 14)},
             {'text': '马成宇', 'type': 'Person', 'offset': [19, 22]}]]

        # Replacement dictionary: entity type -> {entity text: number}.
        # NOTE(review): the numbers are presumably counts or weights --
        # confirm against ReplaceEntity.
        replacement_vocab = {
            "Person": {"马成宇": 1},
            "Company": {"百度": 4, "国力教育公司": 1},
            "Organization": {"延平区人民法院": 1}
        }

        # Input sequence-labelling sample: raw text plus annotated entities.
        sample_text = '腾讯致力于解决冲突,阿里巴巴致力于玩。小马爱玩。'
        sample_entities = [
            {'type': 'Company', 'text': '腾讯', 'offset': (0, 2)},
            {'type': 'Company', 'text': '阿里巴巴', 'offset': (10, 14)},
            {'type': 'Person', 'text': '小马', 'offset': (19, 21)}]

        augmenter = jio.ReplaceEntity(replacement_vocab)
        augmented_texts, augmented_entities = augmenter(
            sample_text, sample_entities)

        self.assertEqual(augmented_texts, expected_texts)
        self.assertEqual(augmented_entities, expected_entities)

# def test_





0 comments on commit 44f4615

Please sign in to comment.