Skip to content

Commit

Permalink
Merge pull request chinese-poetry#302 from chienmy/master
Browse files Browse the repository at this point in the history
提交经过初步校对的版本
  • Loading branch information
jackeyGao authored May 11, 2022
2 parents 2798d26 + 71d0646 commit 88ac4a5
Show file tree
Hide file tree
Showing 24 changed files with 3,282 additions and 3,244 deletions.
5 changes: 5 additions & 0 deletions ci/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,8 @@ sqlite> select * from ciauthor limit 1;
1|苏轼|苏轼:(1037-1101)北宋文学家、书画家...
```

## 已知问题

1. 尚存在一些繁体字,由于找不到对应的简化字,或是不能确定是否应当简化,仍保留在词中。
2. 部分异体字、通用字,不及一一对照原本考证用法,只得保留。例如:`酴醿`与`酴醾`、`溟濛`与`溟蒙`、`鸿濛`与`鸿蒙`等。

67 changes: 50 additions & 17 deletions ci/UpdateCi.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,54 @@ def only_text(text: str):
return re.sub(r"[,。、《》…()·・\s]", "", text)


def update_file_data(old_data: list, new_data: list, offset: int = None):
    """Merge corrected entries from ``new_data`` into ``old_data`` in place.

    Entry ``i`` of ``old_data`` is compared with entry ``offset + i`` of
    ``new_data``. The comparison is made on the joined paragraph text after
    it has been passed through ``only_text``.

    * Similarity in ``[0.9, 1.0)``: assumed to mean the old entry is missing
      characters, so both ``author`` and ``paragraphs`` are copied over.
    * Similarity below ``0.9``: treated as an anomaly; both texts are logged
      as warnings and the old entry is left untouched.
    * Otherwise: only ``author`` is copied over.

    Args:
        old_data: Entries loaded from one file. Each entry is indexed with
            the keys ``"author"`` and ``"paragraphs"``. Modified in place.
        new_data: Reference entries, indexed from ``offset``.
        offset: Index in ``new_data`` that corresponds to ``old_data[0]``.
            When omitted, the module-level ``start`` assigned by the script's
            main loop is used, so the two-argument call keeps working.

    Raises:
        NameError: If ``offset`` is omitted and no module-level ``start``
            has been defined.
        IndexError: If ``new_data`` has no entry at ``offset + i``.
    """
    if offset is None:
        # Legacy behaviour: the function used to read the global ``start``
        # implicitly. Prefer passing ``offset`` explicitly.
        offset = start

    for i, old_item in enumerate(old_data):
        new_item = new_data[offset + i]
        old_text = only_text("".join(old_item["paragraphs"]))
        new_text = only_text("".join(new_item["paragraphs"]))
        # Similarity of the stripped text; quick_ratio() is the cheap
        # upper-bound variant of ratio().
        ratio = SequenceMatcher(a=old_text, b=new_text).quick_ratio()

        if ratio < 0.9:
            # Anomaly: report both versions and do not update.
            logging.warning(old_text)
            logging.warning(new_text)
            continue

        old_item["author"] = new_item["author"]
        if ratio < 1.0:
            # Close but not identical: assume the old text is missing
            # characters and take the reference paragraphs.
            old_item["paragraphs"] = new_item["paragraphs"]


# Replacement table used by ``correct``: each key (a traditional or variant
# form) is replaced by its value in paragraph text. Multi-character keys are
# replaced as whole words. Insertion order is the order in which the
# replacements are applied.
char_dict = {
    "鵷": "鹓", "颭": "飐", "鷁": "鹢", "鴞": "鸮",
    "餖": "饾", "飣": "饤", "舃": "舄", "駸": "骎",
    "薄倖": "薄幸",
    "赬": "赪",
    "鷫鸘": "鹔鹴",
    "嶮": "崄", "後": "后", "纇": "颣", "颸": "飔",
    "崑崙": "昆仑",
    "曨": "昽",
}


def correct(old_data: list):
    """Convert selected traditional characters to simplified ones, in place.

    Every string in each entry's ``"paragraphs"`` list has the keys of
    ``char_dict`` replaced by their mapped values.
    """
    for entry in old_data:
        paragraphs = entry["paragraphs"]
        for pos, line in enumerate(paragraphs):
            fixed = line
            for variant, simplified in char_dict.items():
                fixed = fixed.replace(variant, simplified)
            # Write back only when something was actually replaced.
            if fixed != line:
                paragraphs[pos] = fixed


if __name__ == '__main__':
logging.basicConfig(level=logging.INFO,
format="%(asctime)s - %(levelname)-9s %(filename)-15s[:%(lineno)d]\t%(message)s")
Expand All @@ -73,31 +121,16 @@ def only_text(text: str):
# 读取临时文件
with open("all.json", "r", encoding="utf-8") as f:
all_data = json.load(f)
# 统计更正的数目
diff_num = 0
# 遍历当前目录
for file_name in os.listdir("./"):
if re.match(r"ci\.song\.\d+\.json", file_name):
# 每个文件开始的数据索引
start = int(file_name.split(".")[2])
with open(file_name, "r", encoding="utf-8") as f:
file_data = json.load(f)
for i in range(len(file_data)):
old_text = only_text("".join(file_data[i]["paragraphs"]))
new_text = only_text("".join(all_data[start + i]["paragraphs"]))
# 计算纯文字的相似度
ratio = SequenceMatcher(a=old_text, b=new_text).quick_ratio()
if 0.9 <= ratio < 1.0:
# 假定此范围内说明缺字,需要更新
diff_num += 1
file_data[i]["author"] = all_data[start + i]["author"]
file_data[i]["paragraphs"] = all_data[start + i]["paragraphs"]
elif ratio < 0.9:
# 异常情况warning输出,不更新
logging.warning(old_text)
logging.warning(new_text)
update_file_data(file_data, all_data)
correct(file_data)
# 保存数据,原文件中逗号后有空格,这里保持一致
with open(file_name, "w", encoding="utf-8") as f:
f.write(json.dumps(file_data, indent=2, ensure_ascii=False).replace(",", ", "))
logging.info("Save " + file_name)
logging.info("Change {0} items".format(diff_num))
Loading

0 comments on commit 88ac4a5

Please sign in to comment.