Source code

Revision control

Copy as Markdown

Other Tools

#!/usr/bin/env python3
""" Generate Unicode data table for parser
"""
import argparse
import io
import re
import sys
from contextlib import closing
from itertools import tee, zip_longest
from urllib.request import urlopen
from zipfile import ZipFile
# These are also part of IdentifierPart ยง11.6 Names and Keywords
compatibility_identifier_part = [
ord(u'\N{ZERO WIDTH NON-JOINER}'),
ord(u'\N{ZERO WIDTH JOINER}'),
]
FLAG_ID_START = 1 << 0
FLAG_ID_CONTINUE = 1 << 1
def download_derived_core_properties(version):
"""Downloads UCD.zip for given version, and return the content of
DerivedCoreProperties.txt. """
if version == 'UNIDATA':
url = '%s/%s' % (baseurl, version)
else:
url = '%s/%s/ucd' % (baseurl, version)
request_url = '{}/UCD.zip'.format(url)
with closing(urlopen(request_url)) as downloaded_file:
downloaded_data = io.BytesIO(downloaded_file.read())
with ZipFile(downloaded_data) as zip_file:
return zip_file.read('DerivedCoreProperties.txt').decode()
def read_derived_core_properties(derived_core_properties):
"""Read DerivedCoreProperties.txt content and yield each item. """
for line in derived_core_properties.split('\n'):
if line == '' or line.startswith('#'):
continue
row = line.split('#')[0].split(';')
char_range = row[0].strip()
char_property = row[1].strip()
if '..' not in char_range:
yield (int(char_range, 16), char_property)
else:
[start, end] = char_range.split('..')
for char in range(int(start, 16), int(end, 16) + 1):
yield (char, char_property)
def process_derived_core_properties(derived_core_properties):
"""Parse DerivedCoreProperties.txt and returns its version,
and set of characters with ID_Start and ID_Continue. """
id_start = set()
id_continue = set()
m = re.match('# DerivedCoreProperties-([0-9\.]+).txt', derived_core_properties)
assert m
version = m.group(1)
for (char, prop) in read_derived_core_properties(derived_core_properties):
if prop == 'ID_Start':
id_start.add(char)
if prop == 'ID_Continue':
id_continue.add(char)
return (version, id_start, id_continue)
def int_ranges(ints):
""" Yields consecutive ranges (inclusive) from integer values. """
(a, b) = tee(sorted(ints))
start = next(b)
for (curr, succ) in zip_longest(a, b):
if curr + 1 != succ:
yield (start, curr)
start = succ
def process_unicode_data(derived_core_properties):
MAX_BMP = 0xffff
dummy = 0
table = [dummy]
cache = {dummy: 0}
index = [0] * (MAX_BMP + 1)
non_bmp_id_start_set = {}
non_bmp_id_continue_set = {}
(version, id_start, id_continue) = process_derived_core_properties(derived_core_properties)
codes = id_start.union(id_continue)
for code in codes:
if code > MAX_BMP:
if code in id_start:
non_bmp_id_start_set[code] = 1
if code in id_continue:
non_bmp_id_continue_set[code] = 1
continue
flags = 0
if code in id_start:
flags |= FLAG_ID_START
if code in id_continue or code in compatibility_identifier_part:
flags |= FLAG_ID_CONTINUE
i = cache.get(flags)
if i is None:
assert flags not in table
cache[flags] = i = len(table)
table.append(flags)
index[code] = i
return (
version,
table,
index,
id_start,
id_continue,
non_bmp_id_start_set,
non_bmp_id_continue_set,
)
def getsize(data):
""" return smallest possible integer size for the given array """
maxdata = max(data)
assert maxdata < 2**32
if maxdata < 256:
return 1
elif maxdata < 65536:
return 2
else:
return 4
def splitbins(t):
"""t -> (t1, t2, shift). Split a table to save space.
t is a sequence of ints. This function can be useful to save space if
many of the ints are the same. t1 and t2 are lists of ints, and shift
is an int, chosen to minimize the combined size of t1 and t2 (in C
code), and where for each i in range(len(t)),
t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
where mask is a bitmask isolating the last "shift" bits.
"""
def dump(t1, t2, shift, bytes):
print("%d+%d bins at shift %d; %d bytes" % (
len(t1), len(t2), shift, bytes), file=sys.stderr)
print("Size of original table:", len(t) * getsize(t),
"bytes", file=sys.stderr)
n = len(t)-1 # last valid index
maxshift = 0 # the most we can shift n and still have something left
if n > 0:
while n >> 1:
n >>= 1
maxshift += 1
del n
bytes = sys.maxsize # smallest total size so far
t = tuple(t) # so slices can be dict keys
for shift in range(maxshift + 1):
t1 = []
t2 = []
size = 2**shift
bincache = {}
for i in range(0, len(t), size):
bin = t[i:i + size]
index = bincache.get(bin)
if index is None:
index = len(t2)
bincache[bin] = index
t2.extend(bin)
t1.append(index >> shift)
# determine memory size
b = len(t1) * getsize(t1) + len(t2) * getsize(t2)
if b < bytes:
best = t1, t2, shift
bytes = b
t1, t2, shift = best
print("Best:", end=' ', file=sys.stderr)
dump(t1, t2, shift, bytes)
# exhaustively verify that the decomposition is correct
mask = 2**shift - 1
for i in range(len(t)):
assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
return best
def write_table(f, name, type, table, formatter, per_line):
f.write(f"""
pub const {name}: &'static [{type}] = &[
""")
i = 0
for item in table:
if i == 0:
f.write(' ')
f.write(f'{formatter(item)},')
i += 1
if i == per_line:
i = 0
f.write("""
""")
f.write("""\
];
""")
def write_func(f, name, group_set):
f.write(f"""
pub fn {name}(c: char) -> bool {{""")
for (from_code, to_code) in int_ranges(group_set.keys()):
f.write(f"""
if c >= \'\\u{{{from_code:X}}}\' && c <= \'\\u{{{to_code:X}}}\' {{
return true;
}}""")
f.write("""
false
}
""")
def make_unicode_file(version, table, index,
id_start, id_continue,
non_bmp_id_start_set, non_bmp_id_continue_set):
index1, index2, shift = splitbins(index)
# verify correctness
for char in index:
test = table[index[char]]
idx = index1[char >> shift]
idx = index2[(idx << shift) + (char & ((1 << shift) - 1))]
assert test == table[idx]
with open('crates/parser/src/unicode_data.rs', 'w') as f:
f.write(f"""\
// Generated by update_unicode.py DO NOT MODIFY
// Unicode version: {version}
""")
f.write(f"""
const FLAG_ID_START: u8 = {FLAG_ID_START};
const FLAG_ID_CONTINUE: u8 = {FLAG_ID_CONTINUE};
""")
f.write("""
pub struct CharInfo {
flags: u8,
}
impl CharInfo {
pub fn is_id_start(&self) -> bool {
self.flags & FLAG_ID_START != 0
}
pub fn is_id_continue(&self) -> bool {
self.flags & FLAG_ID_CONTINUE != 0
}
}
""")
write_table(f, 'CHAR_INFO_TABLE', 'CharInfo', table,
lambda flag: f"CharInfo {{ flags: {flag} }}",
1)
write_table(f, 'INDEX1', 'u8', index1,
lambda i: f'{i:4d}', 8)
write_table(f, 'INDEX2', 'u8', index2,
lambda i: f'{i:4d}', 8)
f.write(f"""
const SHIFT: usize = {shift};
""")
f.write("""
pub fn char_info(c: char) -> &'static CharInfo {
let code = c as usize;
let index = INDEX1[code >> SHIFT] as usize;
let index = INDEX2[(index << SHIFT) + (code & ((1 << SHIFT) - 1))] as usize;
&CHAR_INFO_TABLE[index]
}
""")
def format_bool(b):
if b:
return 'true '
else:
return 'false'
write_table(f, 'IS_ID_START_TABLE', 'bool', range(0, 128),
lambda code: format_bool(code in id_start), 8)
write_table(f, 'IS_ID_CONTINUE_TABLE', 'bool', range(0, 128),
lambda code: format_bool(code in id_continue), 8)
write_func(f, 'is_id_start_non_bmp', non_bmp_id_start_set)
write_func(f, 'is_id_continue_non_bmp', non_bmp_id_continue_set)
parser = argparse.ArgumentParser(description='Generate Unicode data table for parser')
parser.add_argument('VERSION',
help='Unicode version number to download from\
<https://unicode.org/Public>. The number must match\
a published Unicode version, e.g. use\
"--version=8.0.0" to download Unicode 8 files. Alternatively use\
"--version=UNIDATA" to download the latest published version.')
parser.add_argument('PATH_TO_JSPARAGUS',
help='Path to jsparagus')
args = parser.parse_args()
derived_core_properties = download_derived_core_properties(args.VERSION)
(
version,
table,
index,
id_start,
id_continue,
non_bmp_id_start_set,
non_bmp_id_continue_set,
) = process_unicode_data(derived_core_properties)
make_unicode_file(
version,
table,
index,
id_start,
id_continue,
non_bmp_id_start_set,
non_bmp_id_continue_set,
)