"""extract_es_grammar.py - Extract the grammar from the ECMAScript spec
To run this script, you first need to get the source of the version of
the ECMAScript spec you're interested in.
cd ../..
mkdir tc39
cd tc39
git clone git@github.com:tc39/ecma262.git
Then:
make js_parser/es.esgrammar
You can also use this script on a random HTTPS URL, like:
python extract_esgrammar.py $URL
"""
import argparse
import re
import urllib.request
from textwrap import dedent

import html5lib  # type: ignore

# The XHTML namespace prefix that html5lib puts on every tag name.
HTML = "{http://www.w3.org/1999/xhtml}"

INS_TAG = HTML + "ins"
DEL_TAG = HTML + "del"

INS = '+'
DEL = '-'
KEEP = ' '
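
# Illustration (hypothetical content): INS, DEL, and KEEP match the line
# prefixes of a unified diff, so a stream of tagged lines prints directly
# as a patch hunk:
#
#      LexicalDeclaration : LetOrConst BindingList `;`
#     +    NewProduction : SomethingInserted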


def pre_with_code_filter_factory(e):
    """Checks if the <pre> is used in the following pattern:

    ```
    <pre><code>
    </code></pre>
    ```

    If so, return a filter that formats the content, removing the extra
    spaces, line wraps, and backquotes added by <code>.
    """
    if e.text and e.text.strip() != '':
        return False
    if len(e) != 1:
        return False
    if e[0].tag != '{http://www.w3.org/1999/xhtml}code':
        return False
    if e[0].tail and e[0].tail.strip() != '':
        return False

    def children_filter(texts):
        # Drop leading blank chunks and the opening backquote added by
        # the <code> rule.
        while len(texts) > 0 and texts[0].strip() == '':
            texts.pop(0)
        if len(texts) > 0 and texts[0].strip() == '`':
            texts.pop(0)
        while len(texts) > 0 and texts[0].strip() == '':
            texts.pop(0)

        # Likewise for trailing blank chunks and the closing backquote.
        while len(texts) > 0 and texts[-1].strip() == '':
            texts.pop()
        if len(texts) > 0 and texts[-1].strip() == '`':
            texts.pop()
        while len(texts) > 0 and texts[-1].strip() == '':
            texts.pop()

        is_first = True
        for text in texts:
            for line in text.split('\n'):
                line = line.strip()
                if line == '':
                    continue
                if not is_first:
                    yield '\n'
                is_first = False
                yield line

    return children_filter
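
# Illustration (hypothetical markup): for a fragment such as
#
#     <pre><code>
#       let x;
#       let y;
#     </code></pre>
#
# the 'code' rule contributes chunks ['`', '\n  let x;\n  let y;\n', '`'];
# the filter above drops the backquotes and blank chunks, then yields the
# stripped lines 'let x;' and 'let y;' separated by single '\n' chunks.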


# Rules for extracting text, used when extracting Early Errors.
EXTRA_RULES_FOR_EE = {
    'b': {},
    'br': {},
    'code': {
        'prefix': '`',
        'postfix': '`',
    },
    'emu-alg': {},
    'emu-grammar': {},
    'emu-note': {
        'prefix': ['NOTE', '\n'],
        'strip': True,
    },
    'emu-xref': {
        'prefix': lambda e: e.attrib.get('href'),
    },
    'ins': {
        'ignore_highlighted': True,
    },
    'p': {
        'strip': True,
    },
    'pre': {
        'prefix': ['\n', '\n', '```', '\n'],
        'postfix': ['\n', '```', '\n', '\n'],
        'strip': True,
        'children_filter_factory': pre_with_code_filter_factory,
    },
    'sub': {
        'prefix': '_',
    },
    'sup': {
        'prefix': '^',
    },
}
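
# Reading the table (no extra behavior implied): the 'code' rule renders
# <code>yield</code> as `yield`, 'sub' renders <sub>opt</sub> as _opt,
# 'emu-note' prefixes notes with a NOTE line, and 'pre' wraps preformatted
# blocks in ``` fences, delegating the inner cleanup to
# pre_with_code_filter_factory.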


def apply_prefix_postfix_rule(e, rule, name):
    """If `rule` is provided, apply its prefix/postfix rule to the element
    `e`.
    """
    if not rule:
        return
    fix = rule.get(name)
    if callable(fix):
        yield fix(e)
    elif isinstance(fix, list):
        for item in fix:
            yield item
    elif fix:
        yield fix
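
# The three accepted shapes, per the dispatch above (values as in
# EXTRA_RULES_FOR_EE): a plain string such as '`'; a list such as
# ['\n', '```', '\n'], whose items are yielded one by one; or a callable
# such as lambda e: e.attrib.get('href'), invoked with the element itself.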


def apply_strip_rule(text, rule):
    """If `rule` is provided, apply its strip rule to the text.
    """
    if not text:
        return
    if not rule:
        yield text
        return
    strip = rule.get('strip')
    if strip:
        yield text.strip()
    else:
        yield text


def fragment_child_chunks(e, extra_rules={}):
    """Partly interpret the content of `e`, yielding `text` chunks,
    applying `extra_rules`.

    Concatenating the yielded `text` values gives the full text of `e`.
    """
    rule = extra_rules[e.tag.replace(HTML, '')]
    children_filter = None
    factory = rule.get('children_filter_factory')
    if factory:
        children_filter = factory(e)

    yield from apply_prefix_postfix_rule(e, rule, 'prefix')
    yield from apply_strip_rule(e.text, rule)
    for child in e:
        if child.tag.replace(HTML, '') not in extra_rules:
            raise ValueError("unrecognized element: " + child.tag)

        texts = []
        for text in fragment_child_chunks(child, extra_rules):
            if children_filter:
                texts.append(text)
            else:
                yield text
        if children_filter:
            for text in children_filter(texts):
                yield text
    yield from apply_strip_rule(e.tail, rule)
    yield from apply_prefix_postfix_rule(e, rule, 'postfix')


def is_highlighted_ins(e):
    """Returns True if `e` matches the following pattern:

        <ins>highlighted</ins> text:

    See the `fragment_chunks` docstring for details.
    """
    if len(e) != 0:
        return False
    if not e.text:
        return False
    if e.text != 'highlighted':
        return False
    if not e.tail:
        return False
    if not e.tail.startswith(' text:'):
        return False
    return True


def is_negligible_ins(e, extra_rules):
    """Returns True if the 'ignore_highlighted' rule is defined for <ins>
    and `e` matches the negligible pattern.

    See the `fragment_chunks` docstring for details.
    """
    rule = extra_rules.get(e.tag.replace(HTML, ''))
    if not rule:
        return False
    if rule.get('ignore_highlighted'):
        if is_highlighted_ins(e):
            return True
    return False
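
# Context: Annex B prose sometimes reads "<ins>highlighted</ins> text: ...",
# where the <ins> is purely presentational. With 'ignore_highlighted' set
# (as in EXTRA_RULES_FOR_EE), such an <ins> is emitted as KEEP text rather
# than as an insertion.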


def fragment_chunks(e, extra_rules={}):
    """Partly interpret the content of `e`, yielding pairs (ty, text).

    If `extra_rules` isn't provided, the content of `e` must be text with 0
    or more <ins>/<del> elements.

    The goal is to turn the tree `e` into a simple series of tagged strings.

    Yields pairs (ty, text) where ty is in (INS, DEL, KEEP). Concatenating
    the yielded `text` values gives the full text of `e`.

    `extra_rules` is a dictionary that defines extra elements that are
    allowed as the content of `e`. Each item defines a rule for a tag,
    with the following keys:

    * prefix
      Put a prefix before the text.
      Possible values:
        * string
        * list of strings
        * function
          receives the `Element` and returns a prefix string
    * postfix
      Put a postfix after the text.
      The value uses the same format as prefix.
    * strip
      True to strip whitespace before/after the element's text.
    * children_filter_factory
      A function that receives the `Element` and returns a filter function
      or None.
      The filter function receives a list of texts for child nodes, and
      returns a list of filtered texts.
    * ignore_highlighted
      Effective only with <ins>.
      Do not treat <ins> as an insertion if it matches the following
      pattern:

          <ins>highlighted</ins> text:

      This pattern is used in the Annex B description.
    """
    rule = extra_rules.get(e.tag.replace(HTML, ''))

    for text in apply_prefix_postfix_rule(e, rule, 'prefix'):
        yield KEEP, text
    for text in apply_strip_rule(e.text, rule):
        yield KEEP, text
    for child in e:
        if child.tag == INS_TAG and not is_negligible_ins(child, extra_rules):
            ty = INS
        elif child.tag == DEL_TAG:
            ty = DEL
        else:
            if child.tag.replace(HTML, '') not in extra_rules:
                raise ValueError("unrecognized element: " + child.tag)
            for text in fragment_child_chunks(child, extra_rules):
                yield KEEP, text
            continue

        if child.text:
            yield ty, child.text
        if len(child) != 0:
            for grandchild in child:
                if grandchild.tag.replace(HTML, '') not in extra_rules:
                    raise ValueError("unsupported nested element {} in {}"
                                     .format(grandchild.tag, child.tag))
                for text in fragment_child_chunks(grandchild, extra_rules):
                    yield ty, text
        if child.tail:
            yield KEEP, child.tail
    for text in apply_strip_rule(e.tail, rule):
        yield KEEP, text
    for text in apply_prefix_postfix_rule(e, rule, 'postfix'):
        yield KEEP, text
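
# Walk-through (hypothetical markup): for
#
#     <p>var <ins>and let</ins> declarations</p>
#
# fragment_chunks yields (KEEP, 'var '), (INS, 'and let'), and
# (KEEP, ' declarations'); concatenating the text parts reproduces the
# element's full text, as promised above.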


def fragment_parts(e, **kwargs):
    """Like fragment_chunks, but with two fixups:

    1. Break up pairs that include both a newline and any other text.

    2. Move newlines inside of a preceding INS or DEL element that spans
       its whole line.
    """
    line_has_ins = False
    line_has_del = False
    for chunk_ty, text in fragment_chunks(e, **kwargs):
        for piece in re.split(r'(\n)', text):
            ty = chunk_ty
            if piece != '':
                if piece == '\n':
                    # Possibly move the newline inside a preceding INS or DEL.
                    if line_has_ins and not line_has_del:
                        ty = INS
                    elif line_has_del and not line_has_ins:
                        ty = DEL
                    else:
                        ty = KEEP
                    line_has_del = False
                    line_has_ins = False
                elif piece.strip() != '':
                    if ty in (INS, KEEP):
                        line_has_ins = True
                    if ty in (DEL, KEEP):
                        line_has_del = True
                yield ty, piece
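
# Example of fixup 2 (hypothetical input): when an entire line is inserted
# text, fragment_chunks yields something like
#
#     (INS, 'LetDeclaration'), (KEEP, '\n')
#
# and fragment_parts retags the newline, yielding
#
#     (INS, 'LetDeclaration'), (INS, '\n')
#
# so that generate_fragment_patch can emit the line as a single '+' line.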


def generate_fragment_patch(e, **kwargs):
    """Assemble the pair stream for `e` into diff-style lines.

    Yields pairs (ty, line) where ty is '-', '+', or ' '.
    """
    line_before = ''
    line_after = ''

    def end_line(ty):
        nonlocal line_before, line_after
        if line_before.rstrip() == line_after.rstrip():
            yield " ", line_after
        else:
            if line_before.strip() != '' or ty != INS:
                yield "-", line_before
            if line_after.strip() != '' or ty != DEL:
                yield "+", line_after
        line_before = ''
        line_after = ''

    for ty, text in fragment_parts(e, **kwargs):
        if text == '\n':
            yield from end_line(ty)
        else:
            if ty in (KEEP, DEL):
                line_before += text
            if ty in (KEEP, INS):
                line_after += text

    if line_before or line_after:
        yield from end_line(KEEP)
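
# Shape of the output (content hypothetical): an unchanged line comes out
# as (' ', 'Declaration : HoistableDeclaration'); a line modified in place
# comes out as ('-', 'the old text') followed by ('+', 'the new text').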


def dedent_pairs(pairs):
    """Dedent the `text` part of each pair in `pairs`.
    """
    pairs = list(pairs)
    # Using textwrap.dedent on this requires a few lines of hackery:
    # join the lines into a single string, dedent it, then zip the types
    # back onto the dedented lines.
    types = [ty for ty, _line in pairs]
    dedented_lines = dedent(''.join(line + '\n' for ty, line in pairs)).splitlines()
    assert len(dedented_lines) == len(pairs)
    return zip(types, dedented_lines)
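
# For example (hypothetical values), dedent_pairs([(' ', '    Statement'),
# ('+', '      LetDeclaration')]) strips the common four-space margin,
# producing pairs equal to (' ', 'Statement'), ('+', '  LetDeclaration').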


def print_pairs(pairs):
    """Print the (ty, line) pairs, collapsing runs of empty KEEP lines.
    """
    last_line_was_empty = False
    for ty, line in pairs:
        if ty == KEEP and line == '':
            if last_line_was_empty:
                continue
            last_line_was_empty = True
        else:
            last_line_was_empty = False
        print(ty + line)


def print_fragment_patch(e):
    print_pairs(dedent_pairs(generate_fragment_patch(e)))


def is_annex_early_errors(e):
    """Returns True if the <emu-annex> element contains Early Errors.
    """
    h1 = e.find('{http://www.w3.org/1999/xhtml}h1')
    if 'Early Errors' in h1.text:
        return True
    p = e.find('{http://www.w3.org/1999/xhtml}p')
    # An Element's truth value reflects its number of children, so test
    # against None explicitly here.
    if p is not None:
        if 'Early Error' in html5lib.serializer.serialize(p):
            return True
    return False


def get_parent_map(document):
    """Returns a map from each element to its parent element.

    This is necessary because `xml.etree.ElementTree.Element` doesn't keep
    a reference to its parent element.
    """
    parent_map = dict()
    for parent in document.iter():
        for child in parent:
            parent_map[child] = parent
    return parent_map
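
# Usage sketch: build the map once per document with
# parent_map = get_parent_map(document); parent_map[e] then walks one level
# up the tree (see get_titles below).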


def get_titles(parent_map, e):
    """Returns a list of section titles for a section.
    """
    titles = []
    while e.tag != '{http://www.w3.org/1999/xhtml}body':
        h1 = e.find('{http://www.w3.org/1999/xhtml}h1')
        titles.insert(0, h1.text)
        e = parent_map[e]
    return titles


def generate_ul_fragment_patch(e, depth):
    """Similar to generate_fragment_patch, but for <ul>.
    """
    first_line_prefix = '{}* '.format(' ' * depth)
    other_line_prefix = '{}  '.format(' ' * depth)
    for item in e:
        if item.tag != '{http://www.w3.org/1999/xhtml}li':
            raise ValueError("unrecognized element: " + item.tag)

        pairs = generate_fragment_patch(item,
                                        extra_rules=EXTRA_RULES_FOR_EE)
        is_first_line = True
        for ty, line in dedent_pairs(pairs):
            if is_first_line and line.strip() == '':
                continue
            if is_first_line:
                is_first_line = False
                yield ty, '{}{}'.format(first_line_prefix, line.strip())
            else:
                yield ty, '{}{}'.format(other_line_prefix, line.strip())
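
# Shape of the output at depth 0 (content hypothetical): the first line of
# each <li> becomes '* It is a Syntax Error if ...', and continuation lines
# are indented two spaces so they line up under the bullet text.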


def generate_early_errors_fragment_patch(parent_map, e):
    for t in get_titles(parent_map, e):
        yield KEEP, '# {}'.format(t)
    yield KEEP, '# #{}'.format(e.attrib.get('id'))
    yield KEEP, ''
    for child in e:
        if child.tag == '{http://www.w3.org/1999/xhtml}h1':
            continue

        if child.tag == '{http://www.w3.org/1999/xhtml}emu-grammar':
            pairs = generate_fragment_patch(child)
            yield from dedent_pairs(pairs)
            yield KEEP, ''
        elif child.tag == '{http://www.w3.org/1999/xhtml}ul':
            yield from generate_ul_fragment_patch(child, 0)
        elif child.tag == '{http://www.w3.org/1999/xhtml}emu-note':
            pairs = generate_fragment_patch(child,
                                            extra_rules=EXTRA_RULES_FOR_EE)
            yield from dedent_pairs(pairs)
            yield KEEP, ''
        elif child.tag == '{http://www.w3.org/1999/xhtml}p':
            pairs = generate_fragment_patch(child,
                                            extra_rules=EXTRA_RULES_FOR_EE)
            yield from dedent_pairs(pairs)
            yield KEEP, ''
        elif (child.tag == '{http://www.w3.org/1999/xhtml}emu-alg'
              and e.attrib.get('id') == 'sec-__proto__-property-names-in-object-initializers'):
            # The "__proto__ Property Names in Object Initializers" section
            # contains changes both for early errors and for an algorithm.
            # Ignore the algorithm part.
            pass
        else:
            raise ValueError('unsupported element in early errors section: {}'
                             .format(child.tag))


def print_early_errors(parent_map, e):
    pairs = generate_early_errors_fragment_patch(parent_map, e)
    print_pairs(dedent_pairs(pairs))


def extract(filename, unfiltered, target):
    if filename.startswith("https:"):
        file_obj = urllib.request.urlopen(filename)
    else:
        file_obj = open(filename, "rb")
    with file_obj:
        document = html5lib.parse(file_obj)

    if target == 'grammar':
        for e in document.iter("{http://www.w3.org/1999/xhtml}emu-grammar"):
            if unfiltered or e.attrib.get("type") == "definition":
                print_fragment_patch(e)
    elif target == 'ee':
        parent_map = get_parent_map(document)
        for e in document.iter("{http://www.w3.org/1999/xhtml}emu-clause"):
            # Guard with a default so clauses without an id don't crash.
            if e.attrib.get("id", "").endswith("-early-errors"):
                print_early_errors(parent_map, e)
    elif target == 'ee-annex':
        parent_map = get_parent_map(document)
        for e in document.iter("{http://www.w3.org/1999/xhtml}emu-annex"):
            if is_annex_early_errors(e):
                print_early_errors(parent_map, e)
    else:
        raise ValueError('Unknown target: {}'.format(target))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Extract esgrammar from ECMAScript specifications.")
    parser.add_argument(
        'url',
        nargs=1,
        help="the https: url or local filename of an HTML file containing "
             "<emu-grammar> tags")
    parser.add_argument(
        '--unfiltered',
        action='store_true',
        help="Include even <emu-grammar> elements that don't have "
             "`type=definition`")
    parser.add_argument(
        '--target',
        default='grammar',
        choices=['grammar', 'ee', 'ee-annex'],
        help="What to extract ("
             "grammar = esgrammar, "
             "ee = early errors, "
             "ee-annex = early errors in Annex)")
    args = parser.parse_args()
    extract(args.url[0], args.unfiltered, args.target)