Source code

Revision control

Other Tools

1
# This Source Code Form is subject to the terms of the Mozilla Public
2
# License, v. 2.0. If a copy of the MPL was not distributed with this
3
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
4
5
import codecs
6
import encodings.idna
7
import imp
8
import os
9
import re
10
import sys
11
from make_dafsa import words_to_cxx, words_to_bin
12
13
"""
Processes a file containing effective TLD data. See the following URL for a
description of effective TLDs and of the file format that this script
processes (although for the latter you're better off just reading this file's
short source code).

http://wiki.mozilla.org/Gecko:Effective_TLD_Service
"""
21
22
def getEffectiveTLDs(path):
    """
    Generates an EffectiveTLDEntry for each rule found in the effective TLD
    file at the given path, which is read as UTF-8.

    Lines starting with "//" and lines containing no "." are skipped. For
    every other line, only the text before the first space, tab, or newline
    is used as the rule.

    Asserts that no normalized domain is produced by more than one rule.
    """
    domains = set()
    # The with block closes the file once the generator is exhausted, fails,
    # or is closed before being fully consumed.
    with codecs.open(path, "r", "UTF-8") as tld_file:
        for line in tld_file:
            # line always contains a line terminator unless the file is empty
            if len(line) == 0:
                # A plain return is how a generator finishes; raising
                # StopIteration here would be turned into a RuntimeError
                # (PEP 479).
                return
            line = line.rstrip()
            # comment, empty, or superfluous line for explicitness purposes
            if line.startswith("//") or "." not in line:
                continue
            # Keep only the rule itself, dropping anything after whitespace.
            line = re.split(r"[ \t\n]", line, maxsplit=1)[0]
            entry = EffectiveTLDEntry(line)
            domain = entry.domain()
            assert domain not in domains, \
                "repeating domain %s makes no sense" % domain
            domains.add(domain)
            yield entry
41
42
def _normalizeHostname(domain):
    """
    Returns the given domain with each of its dot-separated labels normalized:
    a label made up solely of ASCII characters is lowercased, and any other
    label is run through the IDNA ToASCII algorithm.
    """
    normalized = []
    for label in domain.split("."):
        if not _isASCII(label):
            # ToASCII yields an encoded label, which is decoded back to text.
            label = encodings.idna.ToASCII(label).decode("utf-8")
        else:
            label = label.lower()
        normalized.append(label)
    return ".".join(normalized)
53
54
def _isASCII(s):
55
"True if s consists entirely of ASCII characters, false otherwise."
56
for c in s:
57
if ord(c) > 127:
58
return False
59
return True
60
61
class EffectiveTLDEntry:
    """
    A single rule read from an effective-TLD name file.
    """

    # Class-level defaults; __init__ overrides one of them on the instance
    # only when the rule carries the matching prefix.
    _exception = False
    _wild = False

    def __init__(self, line):
        """
        Builds the entry from one rule line, which must already have had its
        line ending removed. A leading "!" marks an exception rule and a
        leading "*." marks a wildcard rule; the prefix is removed before the
        remaining domain is normalized.
        """
        domain = line
        if line.startswith("*."):
            self._wild = True
            domain = line[2:]
        elif line.startswith("!"):
            self._exception = True
            domain = line[1:]
        self._domain = _normalizeHostname(domain)

    def domain(self):
        "The normalized domain of this entry, without any rule prefix."
        return self._domain

    def exception(self):
        "True if this entry marks its domain as not being an effective TLD."
        return self._exception

    def wild(self):
        "True if this entry stands for a whole class of effective TLDs."
        return self._wild
95
96
97
#################
98
# DO EVERYTHING #
99
#################
100
101
def main(output, effective_tld_filename, output_format="cxx"):
    """
    Parses the effective TLD file and writes a DAFSA built from it to output.

    output is the stream to write to.
    effective_tld_filename is the effective TLD file to parse.
    output_format selects what is written: "bin" writes the result of
    words_to_bin(); any other value (default "cxx") writes the result of
    words_to_cxx().
    """

    def typeEnum(etld):
        """
        Maps the flags to the DAFSA's enum types.
        """
        if etld.exception():
            return 1
        if etld.wild():
            return 2
        return 0

    def dafsa_words():
        """
        make_dafsa expects lines of the form "<domain_name><enum_value>"
        """
        for etld in getEffectiveTLDs(effective_tld_filename):
            yield "%s%d" % (etld.domain(), typeEnum(etld))

    # words_to_bin() returns a bytes while words_to_cxx() returns string
    if output_format == "bin":
        if sys.version_info[0] >= 3:
            # Bytes must go to the binary layer of a text stream such as
            # sys.stdout. A stream with no .buffer attribute (for example a
            # file already opened in binary mode) is written to directly
            # instead of failing with AttributeError.
            output = getattr(output, "buffer", output)
        output.write(words_to_bin(dafsa_words()))
    else:
        output.write(words_to_cxx(dafsa_words()))
134
135
136
137
if __name__ == '__main__':
    # This program can output the DAFSA in two formats:
    # as C++ code that will be included and compiled at build time
    # or as a binary file that will be published in Remote Settings.
    #
    # Flags for format options:
    # "cxx" -> C++ array [default]
    # "bin" -> Binary file

    args = sys.argv[1:]
    output_format = "bin" if "--bin" in args else "cxx"

    # The effective TLD file is the first argument that is not the format
    # flag, so "--bin" may be given before or after the filename.
    filenames = [arg for arg in args if arg != "--bin"]
    if not filenames:
        sys.exit("usage: %s <effective_tld_file> [--bin]" % sys.argv[0])

    main(sys.stdout, filenames[0], output_format=output_format)