forked from mozilla/gecko-dev
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprepare_tlds.py
149 lines (126 loc) · 4.06 KB
/
prepare_tlds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import codecs
import encodings.idna
import imp
import os
import re
import sys
from make_dafsa import words_to_cxx, words_to_bin
"""
Processes a file containing effective TLD data. See the following URL for a
description of effective TLDs and of the file format that this script
processes (although for the latter you're better off just reading this file's
short source code).
http://wiki.mozilla.org/Gecko:Effective_TLD_Service
"""
def getEffectiveTLDs(path):
    """
    Yields an EffectiveTLDEntry for every rule found in the effective TLD
    file at path.

    The file is decoded as UTF-8.  Lines that start with "//" and lines that
    contain no "." are skipped.  For every other line, only the text before
    the first space, tab or newline is used as the rule.

    An assertion fails if two rules normalize to the same domain.
    """
    domains = set()
    # The context manager closes the file as soon as the generator finishes
    # or is closed, instead of leaking the handle.
    with codecs.open(path, "r", "UTF-8") as tld_file:
        for line in tld_file:
            # line always contains a line terminator unless the file is empty
            if not line:
                # A plain return ends a generator.  Raising StopIteration
                # inside a generator body is converted to RuntimeError by
                # PEP 479 (Python 3.7+).
                return
            line = line.rstrip()
            # comment, empty, or superfluous line for explicitness purposes
            if line.startswith("//") or "." not in line:
                continue
            # maxsplit is passed by keyword: the positional form is
            # deprecated in recent Python versions.
            line = re.split(r"[ \t\n]", line, maxsplit=1)[0]
            entry = EffectiveTLDEntry(line)
            domain = entry.domain()
            assert domain not in domains, \
                "repeating domain %s makes no sense" % domain
            domains.add(domain)
            yield entry
def _normalizeHostname(domain):
    """
    Returns domain with each dot-separated label normalized.  Labels that
    _isASCII() accepts are lowercased; every other label is converted with
    the IDNA ToASCII algorithm and decoded back to text.
    """
    normalized = []
    for label in domain.split("."):
        if not _isASCII(label):
            label = encodings.idna.ToASCII(label).decode("utf-8")
        else:
            label = label.lower()
        normalized.append(label)
    return ".".join(normalized)
def _isASCII(s):
"True if s consists entirely of ASCII characters, false otherwise."
for c in s:
if ord(c) > 127:
return False
return True
class EffectiveTLDEntry:
    """
    A single rule read from an effective-TLD name file.
    """

    # Class-level defaults; __init__ overrides them per instance only when
    # the rule carries the corresponding prefix.
    _exception = False
    _wild = False

    def __init__(self, line):
        """
        Builds the entry from one rule line, which must already have had its
        line ending removed.  A leading "!" marks an exception rule and a
        leading "*." marks a wildcard rule; the prefix is dropped and the
        remainder is normalized and stored as the domain.
        """
        prefix_len = 0
        if line.startswith("!"):
            self._exception = True
            prefix_len = 1
        elif line.startswith("*."):
            self._wild = True
            prefix_len = 2
        self._domain = _normalizeHostname(line[prefix_len:])

    def domain(self):
        "The normalized domain of this entry, without any rule prefix."
        return self._domain

    def exception(self):
        "True if this entry's domain does not denote an effective TLD."
        return self._exception

    def wild(self):
        "True if this entry stands for a whole class of effective TLDs."
        return self._wild
#################
# DO EVERYTHING #
#################
def main(output, effective_tld_filename, output_format="cxx"):
    """
    Builds a DAFSA from the effective TLD file and writes it to output.

    output is the stream the result is written to.
    effective_tld_filename is the effective TLD file to parse.
    output_format selects the representation: "bin" writes the value
    returned by words_to_bin(); any other value writes the value returned by
    words_to_cxx().
    """

    def typeEnum(etld):
        """
        Maps the flags to the DAFSA's enum types: 1 for an exception entry,
        2 for a wildcard entry, 0 otherwise.
        """
        if etld.exception():
            return 1
        if etld.wild():
            return 2
        return 0

    def dafsa_words():
        """
        make_dafsa expects lines of the form "<domain_name><enum_value>"
        """
        for etld in getEffectiveTLDs(effective_tld_filename):
            yield "%s%d" % (etld.domain(), typeEnum(etld))

    # words_to_bin() returns bytes while words_to_cxx() returns a string
    if output_format == "bin":
        if sys.version_info[0] >= 3:
            # Bytes must go to the binary layer of a text stream.  The
            # "buffer" attribute is not guaranteed to exist, so a stream
            # without it is used as is (it is assumed to accept bytes).
            output = getattr(output, "buffer", output)
        output.write(words_to_bin(dafsa_words()))
    else:
        output.write(words_to_cxx(dafsa_words()))
if __name__ == '__main__':
    # This program can output the DAFSA in two formats:
    # as C++ code that will be included and compiled at build time
    # or as a binary file that will be published in Remote Settings.
    #
    # Flags for format options:
    #   (none)  -> C++ array [default]
    #   "--bin" -> Binary file
    cli_args = sys.argv[1:]
    output_format = "bin" if "--bin" in cli_args else "cxx"
    # The flag may appear before or after the filename, so the filename is
    # the first argument that is not the flag.
    positional = [arg for arg in cli_args if arg != "--bin"]
    if not positional:
        sys.exit("usage: %s [--bin] <effective_tld_filename>" % sys.argv[0])
    main(sys.stdout, positional[0], output_format=output_format)