-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhtml_unescape.py
62 lines (55 loc) · 1.78 KB
/
html_unescape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
'''
Created on Apr 22, 2012
@author: lordzeus
'''
import sys
import re
# Major version of the running interpreter (2 or 3). ``sys.version_info`` is
# directly indexable, so no intermediate list copy is needed.
PYTHON_VERSION = sys.version_info[0]

# The entity tables live in ``htmlentitydefs`` on Python 2 and in
# ``html.entities`` on Python 3; expose them under one name either way.
if PYTHON_VERSION <= 2:
    import htmlentitydefs
else:
    import html.entities as htmlentitydefs
# Code-point-to-character builtin for the running interpreter. The conditional
# expression never evaluates ``unichr`` on Python 3, where it does not exist.
_unichr = chr if sys.version_info[0] >= 3 else unichr  # noqa: F821

# Any "&name;" or "&#digits;" shaped token; fixup() decides whether it is valid.
_ENTITY_RE = re.compile(r"&#?\w+;")

# A well-formed numeric reference: "&#x"/"&#X" followed by ASCII hex digits
# (group 1), or "&#" followed by ASCII decimal digits (group 2).
_NUMERIC_REF_RE = re.compile(r"&#(?:[xX]([0-9a-fA-F]+)|([0-9]+));$")


def unescape(text):
    """Replace HTML or XML character references and entities in a string.

    Numeric references (``&#65;``, ``&#x41;``, ``&#X41;``) and named entities
    known to ``htmlentitydefs.name2codepoint`` (``&amp;``, ``&eacute;``, ...)
    are replaced by the character they denote. Anything that cannot be
    decoded -- unknown names, malformed digits, code points outside the range
    accepted by the interpreter -- is left in the text exactly as written.

    @param text The HTML (or XML) source text.
    @return The plain text, as a Unicode string, if necessary.
    """
    def fixup(m):
        ref = m.group(0)
        if ref.startswith("&#"):
            # Numeric character reference. Validate the digits explicitly:
            # int() alone would also accept underscores and non-ASCII digits.
            numeric = _NUMERIC_REF_RE.match(ref)
            if numeric is None:
                return ref  # malformed: leave as is
            hex_digits, dec_digits = numeric.groups()
            try:
                if hex_digits is not None:
                    return _unichr(int(hex_digits, 16))
                return _unichr(int(dec_digits))
            except (ValueError, OverflowError):
                # ValueError: code point out of range for this interpreter;
                # OverflowError: number too large to convert at all.
                return ref  # leave as is
        # Named entity: look the name (without "&" and ";") up in the table.
        try:
            return _unichr(htmlentitydefs.name2codepoint[ref[1:-1]])
        except KeyError:
            return ref  # unknown entity: leave as is

    return _ENTITY_RE.sub(fixup, text)
if __name__ == '__main__':
    # Demo: print every link found in html/about.html together with its
    # visible text, tags stripped and entities decoded.
    # The context manager closes the file handle even if read() raises.
    with open("html/about.html", "rt") as links:
        pagina_link = links.read()

    # Group 1 is the href value (hrefs starting with '#' are skipped);
    # group 2 is the raw inner HTML of the anchor element.
    paginas = re.findall(r'<a.*?href=[\'"]([^#].*?)[\'"].*?>(.*?)</a>', pagina_link, flags=re.IGNORECASE)
    print(len(paginas))
    for href, inner_html in paginas:
        # Drop any nested tags before decoding the entities in the label.
        name = re.sub("<.*?>", "", inner_html, flags=re.IGNORECASE)
        name = unescape(name)
        print(href, " - ", name)