forked from w3c/csswg-test
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhtml2xhtml.py
executable file
·77 lines (62 loc) · 2.01 KB
/
html2xhtml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/python
# This file is licensed under CC Zero
import sys
import html5lib
import re
# Usage check: the script needs exactly two arguments (sys.argv[0] is the
# script name itself, hence the comparison against 3).
if len(sys.argv) != 3:
    # sys.exit() with a string argument prints it to stderr and exits with
    # status 1, so a wrong invocation is reported as a failure to the calling
    # shell or build script instead of a silent success.
    sys.exit("! html2xhtml requires two arguments: the filename to read, and the filename to write")
#######################################################################
# Parse HTML and output XHTML
# Read the input in binary mode so that html5lib receives raw bytes and can
# apply its own encoding detection rather than depending on the platform's
# default text decoding. The `with` block guarantees the file is closed even
# if parsing raises.
with open(sys.argv[1], 'rb') as f:
    p = html5lib.HTMLParser()
    t = p.parse(f)
# The tree is fully built once parse() returns, so serializing after the file
# has been closed is safe.
# NOTE(review): the `format` keyword is kept exactly as in the original call;
# confirm that the installed html5lib release still accepts it.
o = html5lib.serializer.serialize(t, format='xhtml')
#######################################################################
# Clean up the mess left by html5lib
def firstMatch(m): # Python makes s/x(y+)?/z$1/ very difficult
    """Return capture group 1 of match object `m`, or '' if it is empty.

    Intended for re.sub() replacement callbacks whose pattern has an
    optional first group: when that group did not take part in the match,
    m.group(1) is None, which cannot be concatenated into the replacement
    text, so the empty string is returned instead.
    """
    return m.group(1) or ''
# Missing XHTML artifacts
# Replace whatever doctype html5lib emitted with the XHTML 1.0 Strict one.
o = re.sub(r'<!DOCTYPE [^>]+>',
           '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">',
           o)


def _withXhtmlNamespace(m):
    """Rebuild a matched <html ...> start tag so it declares the XHTML namespace."""
    attrs = firstMatch(m)
    if re.search(r'\sxmlns=', attrs):
        # The tag already carries an xmlns attribute; appending a second one
        # would produce a duplicate attribute, which is not well-formed XML.
        return m.group(0)
    return '<html' + attrs + ' xmlns="http://www.w3.org/1999/xhtml">'


o = re.sub(r'<html( [^>]+)?>', _withXhtmlNamespace, o)

# Fix weird reordering
# (move href back behind the other attributes of each <link> element)
o = re.sub(r'<link href="(.*?)" (.*?) ?/>',
           lambda m: '<link ' + m.group(2) + ' href="' + m.group(1) + '"/>',
           o)

# Indentation
# The serialized markup has no line breaks around the top-level structural
# tags; insert one after each of them so the result is readable.
o = re.sub(r'<!DOCTYPE ([^>]+)><html',
           lambda m: '<!DOCTYPE ' + firstMatch(m) + '>\n<html',
           o)
o = re.sub(r'<html( [^>]+)?><',
           lambda m: '<html' + firstMatch(m) + '>\n<',
           o)
o = re.sub(r'<head( [^>]+)?><',
           lambda m: '<head' + firstMatch(m) + '>\n<',
           o)
o = re.sub(r'</head><',
           '</head>\n<',
           o)
o = re.sub(r'<body( [^>]+)?><',
           lambda m: '<body' + firstMatch(m) + '>\n<',
           o)
o = re.sub(r'</body><',
           '</body>\n<',
           o)
# Make sure the file ends with a newline.
o = re.sub(r'</html>$',
           '</html>\n',
           o)

# make nbsp visible to people viewing source
# A raw U+00A0 is indistinguishable from an ordinary space in most editors, so
# emit it as a numeric character reference, which stays well-formed XML
# whether or not the DTD is loaded. The u'' literals keep the operation in
# unicode on Python 2 as well as Python 3.
o = o.replace(u'\xa0', u'&#xA0;')
#######################################################################
# Write to file
# The payload is explicitly encoded to UTF-8 bytes, so the file must be opened
# in binary mode: on Python 3 a text-mode file rejects bytes with a TypeError.
# The `with` block closes the file even if the write fails.
with open(sys.argv[2], 'wb') as f:
    f.write(o.encode('utf-8'))