forked from Show-Me-the-Code/python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind_text.py
38 lines (32 loc) · 1.38 KB
/
find_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/usr/bin/python3
"""
Problem 0008: given an HTML file, extract its body text.
"""
import re, urllib.request, time
# Page fetched when the user just presses Enter at the prompt.
DEFAULT_URL = "https://adblockplus.org/zh_CN/acceptable-ads"

# Elements whose entire content is dropped before text is collected.
# The back-reference pairs every opening tag with its own closing tag, and
# IGNORECASE makes upper-case markup (<SCRIPT>, <HEAD>, ...) match as well.
NOISE_RE = re.compile(
    r'<(title|head|header|script|noscript|style|form|footer)(?=[\s/>])'
    r'.*?</\1\s*>',
    flags=re.DOTALL | re.IGNORECASE,
)

# A run of characters sitting between the end of one tag and the start of
# the next one.
TEXT_RE = re.compile(r'(?<=>)[^<>]+(?=<)')


def decode_html(raw, declared_charset=None):
    """Decode the bytes of a downloaded page into a str.

    Encodings are tried strictly in this order: UTF-8, *declared_charset*
    (when given), then GBK.  If none of them decodes the payload cleanly,
    the bytes are decoded as UTF-8 with undecodable bytes replaced, so this
    function never raises on bad input.

    Args:
        raw: The page content as bytes.
        declared_charset: Optional charset name announced by the server.

    Returns:
        The decoded text.
    """
    candidates = ['utf-8']
    if declared_charset:
        candidates.append(declared_charset)
    candidates.append('gbk')
    for encoding in candidates:
        try:
            return raw.decode(encoding)
        except (LookupError, UnicodeDecodeError):
            # LookupError: the declared charset name is unknown to Python.
            continue
    return raw.decode('utf-8', errors='replace')


def extract_text(html):
    """Return the text found between the tags of *html*.

    The title, head, header, script, noscript, style, form and footer
    elements are removed together with their content.  The remaining text
    chunks that lie between two tags are concatenated without a separator
    and the result is stripped of leading and trailing whitespace.

    Args:
        html: The page markup as a str.

    Returns:
        The extracted text; an empty string when nothing is found.
    """
    body = NOISE_RE.sub('', html)
    return ''.join(TEXT_RE.findall(body)).strip()


def fetch_html(url):
    """Download *url* and return its content decoded to a str."""
    with urllib.request.urlopen(url) as response:
        raw = response.read()
        declared = response.headers.get_content_charset()
    return decode_html(raw, declared)


def main():
    """Prompt for a URL, extract its text and save it to a timestamped file.

    The output file is named after the current local time
    (YYYYmmddHHMMSS.txt) and is created in the current working directory.
    """
    url = input('Enter the URL which you wish to extract > ').strip()
    if not url:
        url = DEFAULT_URL
    print('We will extract text data from ' + url + ' :')

    text = extract_text(fetch_html(url))

    file_name = time.strftime('%Y%m%d%H%M%S') + '.txt'
    # An explicit encoding keeps the write from failing when the platform's
    # default encoding cannot represent the extracted characters.
    with open(file_name, 'wt', encoding='utf-8') as textfile:
        print(text, file=textfile)
    print('Extract finished, the text file is: ' + file_name)


if __name__ == '__main__':
    main()