forked from justvanrossum/fontgoggles
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerateOTTagsModule.py
112 lines (100 loc) · 3.48 KB
/
generateOTTagsModule.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import re
def parse(data):
    """Yield one list of extracted values per table row of an HTML page.

    Only the text between the first ``<tbody>`` and the ``</tbody>`` that
    follows it is scanned. When a marker is missing, the scan extends to the
    start or the end of *data* respectively.

    Each ``<td>...</td>`` cell contributes one or two values, in order:

    * the value of its first ``href="..."`` attribute, if it has one;
    * the text between its first pair of single quotes if the cell contains
      such a pair (a quoted tag such as ``'aalt'``), otherwise the raw cell
      content.

    Rows in which no ``<td>`` cell is found are skipped. Only the literal
    ``<tr>`` and ``<td>`` opening tags (no attributes) are matched.
    """
    marker = "<tbody>"
    start = data.find(marker)
    # str.find() returns -1 on a miss; doing arithmetic on that value would
    # slice the document at an arbitrary offset, so fall back explicitly.
    bodyStart = start + len(marker) if start != -1 else 0
    end = data.find("</tbody>", bodyStart)
    bodyEnd = end if end != -1 else len(data)
    data = data[bodyStart:bodyEnd]
    for chunk in re.findall(r"<tr>.+?</tr>", data, re.DOTALL):
        fields = re.findall(r"<td>(.+?)</td>", chunk, re.DOTALL)
        parsedFields = []
        for field in fields:
            m = re.search(r'href="(.+?)"', field)
            if m is not None:
                parsedFields.append(m.group(1))
            tagParts = field.split("'")
            # A quoted tag needs an opening *and* a closing quote, i.e. at
            # least three parts. A lone apostrophe (as in "N'Ko") is part of
            # the text and must not truncate the field.
            if len(tagParts) >= 3:
                parsedFields.append(tagParts[1])
            else:
                parsedFields.append(field)
        if parsedFields:
            yield parsedFields
def formatFeatures(data, baseURL):
    """Print a ``features`` dict literal built from parsed feature rows.

    *data* is an iterable of ``(link, tag, friendlyName)`` triples and
    *baseURL* is prepended to every link. A row whose tag is ``'cv01'`` is
    expanded into one entry for each of ``'cv01'`` through ``'cv99'``, all
    sharing that row's friendly name and URL.
    """
    print("features = {")
    print(" # tag, friendly name, documentation URL")
    for link, firstTag, friendlyName in data:
        if firstTag != 'cv01':
            expanded = (firstTag,)
        else:
            # The row for 'cv01' stands in for the whole cv01..cv99 range.
            expanded = tuple(f"cv{n:02d}" for n in range(1, 100))
        url = baseURL + link
        for featureTag in expanded:
            print(f" {featureTag!r}: ({friendlyName!r}, {url!r}),")
    print("}")
def formatScripts(data):
    """Print a ``scripts`` dict literal built from parsed script rows.

    *data* is an iterable of ``(friendlyName, tag)`` pairs. When a tag occurs
    more than once, the friendly names are joined with ``", "`` in order of
    appearance and the entry is printed once, at the position of the tag's
    first occurrence.

    *data* is only read, never modified, so any iterable is accepted.
    """
    print("scripts = {")
    print(" # tag, friendly name")
    # Dicts preserve insertion order, so first occurrences keep their
    # position while later duplicates only extend the stored name.
    namesByTag = {}
    for friendlyName, tag in data:
        if tag in namesByTag:
            namesByTag[tag] = namesByTag[tag] + ", " + friendlyName
        else:
            namesByTag[tag] = friendlyName
    for tag, friendlyName in namesByTag.items():
        print(f" {tag!r}: {friendlyName!r},")
    print("}")
def formatLanguages(data):
    """Print a ``languages`` dict literal built from parsed language rows.

    Each row of *data* is ``(friendlyName, tag)`` or
    ``(friendlyName, tag, isoCodes)``, where *isoCodes* is a comma-separated
    string. Tags shorter than four characters are padded with trailing
    spaces. Each printed value is a tuple holding the friendly name followed
    by the individual, whitespace-stripped ISO codes.
    """
    print("languages = {")
    print(" # tag, friendly name, ISO 639 IDs (if applicable)")
    for row in data:
        friendlyName, *rest = row
        # Pad short tags with trailing spaces up to four characters.
        tag = rest[0].ljust(4)
        assert len(tag) == 4, tag
        isoCodes = ()
        if len(rest) > 1:
            assert len(rest) == 2
            isoCodes = tuple(code.strip() for code in rest[1].split(","))
        entry = (friendlyName, *isoCodes)
        print(f" {tag!r}: {entry},")
    print("}")
# https://docs.microsoft.com/en-us/typography/opentype/spec/featurelist
# https://docs.microsoft.com/en-us/typography/opentype/spec/scripttags
# https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags
if __name__ == "__main__":
    # Without arguments: download the three OpenType spec pages and print the
    # source of a module defining `features`, `scripts` and `languages` to
    # stdout. With one argument: read a single saved HTML page from that path
    # and print only the table it contains.
    import sys

    baseURL = "https://docs.microsoft.com/en-us/typography/opentype/spec/"
    if len(sys.argv) > 1:
        # Decode saved pages exactly like downloaded ones (see below) rather
        # than relying on the platform's default encoding.
        with open(sys.argv[1], encoding="utf-8", errors="replace") as f:
            html = f.read()
        pages = [html]
    else:
        import urllib.request

        pages = []
        # NOTE(review): the module header (provenance comments and __all__)
        # is assumed to be emitted in download mode only -- confirm.
        print(f"# Generated by {os.path.basename(__file__)}")
        print("# Scraped from:")
        for page in ["featurelist", "scripttags", "languagetags"]:
            url = baseURL + page
            print(f"# {url}")
            with urllib.request.urlopen(url) as fp:
                html = fp.read().decode("utf-8", errors="replace")
            pages.append(html)
        print()
        print()
        print("__all__ = ['features', 'scripts', 'languages']")
        print()

    for html in pages:
        print()
        parsed = list(parse(html))
        # The page <title> decides which formatter applies.
        if "<title>Registered features" in html:
            formatFeatures(parsed, baseURL)
        elif "<title>Script tags" in html:
            formatScripts(parsed)
        elif "<title>Language system tags" in html:
            formatLanguages(parsed)
        else:
            # Raised explicitly so that an unrecognized page is still
            # reported when Python runs with -O (which strips `assert`).
            raise AssertionError("huh.")