forked from F5Soft/RubbishMaoGai
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtiku.py
113 lines (92 loc) · 3.19 KB
/
tiku.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
题库生成
Author: F5
Date: 2020-05-12
"""
import matplotlib.pyplot as plt
import os
import sys
import html
if len(sys.argv) != 2:
print("Usage: python tiku.py [my|mg|sg]")
exit(0)
course = sys.argv[1]
data_path = "samples_" + course
tiku_path = "tiku_" + course + ".html"
prepend = '''
<head>
<title>题库</title>
<meta charset="utf-8">
</head>
<body>
<div>全文搜索时,由于标点符号全半角、空格数量不匹配等问题,不建议复制整个标题搜索,可能会找不到结果!建议仅搜索题目标题的连续文字内容。</div>
<h5>GitHub仓库地址:<a href="https://github.com/F5Soft/RubbishMaoGai">https://github.com/F5Soft/RubbishMaoGai</a>。</h5>
<h5>如果题库使用体验不错,欢迎star!</h5>
<hr>
'''
def unescape(raw: str):
"""
转换十进制Unicode表示的字符
"""
ans = raw.strip()
ans = html.unescape(ans)
ans = html.unescape(ans)
ans = ans.replace("<p>", '')
ans = ans.replace("</p>", '')
ans = ans.replace("<br>", '')
ans = ans.replace(' ', '')
ans = ans.lstrip('>')
return ans
tiku = open(tiku_path, 'wt', encoding="utf8", errors="replace")
tiku.write(prepend)
questions = set()
question_count = 0
question_count_list = []
total_count = 0
total_count_list = []
for dirname in os.listdir(data_path):
for filename in os.listdir(os.path.join(data_path, dirname)):
if "_files" in filename:
continue
filename = os.path.join(data_path, dirname, filename)
f = open(filename, 'rt', encoding='GBK')
in_context = False
in_answer_context = False
for line in f.readlines():
# 题目标题
if "_content" in line:
total_count += 1
low = line.find('value=') + 7
high = line.find('><iframe') - 1
title = unescape(line[low:high])
if title not in questions:
in_context = True
questions.add(title)
question_count += 1
tiku.write("<h3>" + str(question_count) +
". " + title + "</h3>")
question_count_list.append(question_count)
total_count_list.append(total_count)
# 题目选项
if in_context and "answer" in line and "sogoutip" not in line:
low = line.find('answer') + 8
tiku.write("<div>" + unescape(line[low:]) + "</div>")
# 题目答案区域结束
if in_answer_context and "</td>" in line:
in_context = False
in_answer_context = False
tiku.write("</div>")
# 题目答案选项内容(有多选情况)
if in_answer_context:
tiku.write("<div>" + unescape(line) + "</div>")
# 题目答案区域开始
if in_context and "[参考答案]" in line:
in_answer_context = True
tiku.write("<h5>参考答案</h5><div>")
f.close()
tiku.close()
print("去重后:", question_count)
print("总计:", total_count)
# 预测题库题目数量
plt.plot(total_count_list, question_count_list)
plt.show()