"""Parse a grammar written in ECMArkup."""
from __future__ import annotations
# mypy: no-implicit-optional
import os
import collections
from typing import Dict, Iterable, Optional, Tuple
from jsparagus import parse_pgen, gen, grammar, extension, types
from jsparagus.lexer import LexicalGrammar
from jsparagus.ordered import OrderedSet, OrderedFrozenSet
ESGrammarLexer = LexicalGrammar(
    # the operators and keywords:
    "[ ] { } , ~ + ? <! = == != => ( ) @ < > ' ; "
    "but empty here lookahead no not of one or returns through Some None impl for let",

    NL="\n",

    # any number of colons together
    EQ=r':+',

    # terminals of the ES grammar, quoted with backticks
    T=r'`[^` \n]+`|```',

    # also terminals, denoting control characters
    CHR=r'<[A-Z ]+>|U\+[0-9A-F]{4}',

    # nonterminals/types that will be followed by parameters
    NTCALL=r'[A-Za-z]\w*(?=[\[<])',

    # nonterminals (also, boolean parameters and type names)
    NT=r'[A-Za-z]\w*',

    # nonterminals wrapped in vertical bars for no apparent reason
    NTALT=r'\|[A-Z]\w+\|',

    # the spec also gives names to a few productions
    PRODID=r'#[A-Za-z]\w*',

    # prose not wrapped in square brackets
    # To avoid conflict with the `>` token, this is recognized only after a space.
    PROSE=r'(?<= )>[^\n]*',

    # prose wrapped in square brackets
    WPROSE=r'\[>[^]]*\]',

    # expression denoting a matched terminal or nonterminal
    MATCH_REF=r'\$(?:0|[1-9][0-9]*)',

    # Rust line comments, which appear in grammar extension code
    RUSTCOMMENT=r'//.*\n',
)
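

# Illustrative sketch only, not used by jsparagus itself: the point of the
# separate NTCALL pattern above is that a nonterminal immediately followed by
# parameters (or type arguments) lexes as NTCALL, while a bare name lexes as
# NT. The sample strings below are made up for this example.
def _example_ntcall_pattern() -> None:
    import re
    ntcall = re.compile(r'[A-Za-z]\w*(?=[\[<])')
    assert ntcall.match('IdentifierReference[Yield]')  # parameterized: NTCALL
    assert ntcall.match('Expression') is None          # bare name: NT instead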


ESGrammarParser = gen.compile(
    parse_pgen.load_grammar(
        os.path.join(os.path.dirname(__file__), "esgrammar.pgen")))

SIGIL_FALSE = '~'
SIGIL_TRUE = '+'

# Abbreviations for single-character terminals, used in the lexical grammar.
ECMASCRIPT_CODE_POINTS = {
    '<ZWNJ>': grammar.Literal('\u200c'),
    '<ZWJ>': grammar.Literal('\u200d'),
    '<ZWNBSP>': grammar.Literal('\ufeff'),

    '<TAB>': grammar.Literal('\t'),
    '<VT>': grammar.Literal('\u000b'),
    '<FF>': grammar.Literal('\u000c'),
    '<SP>': grammar.Literal(' '),
    '<NBSP>': grammar.Literal('\u00a0'),
    # <ZWNBSP> is already defined above.
    '<USP>': grammar.UnicodeCategory('Zs'),

    '<LF>': grammar.Literal('\u000a'),
    '<CR>': grammar.Literal('\u000d'),
    '<LS>': grammar.Literal('\u2028'),
    '<PS>': grammar.Literal('\u2029'),
}


class ESGrammarBuilder:
    def __init__(self, terminal_names):
        # Names of terminals that are written as nonterminals in the grammar.
        # For example, "BooleanLiteral" is a terminal name when parsing the
        # syntactic grammar.
        if terminal_names is None:
            terminal_names = frozenset()
        self.terminal_names = frozenset(terminal_names)
        self.reset()

    def reset(self):
        self.lexer = None
        # This is how full parsing and lazy parsing are implemented, using
        # different traits.
        #
        # This field holds the name of the Rust trait used for calling the
        # method. When a CallMethod is generated, it is assumed to be a
        # function of this trait. The trait is used by the Rust backend to
        # generate multiple backends which implement different sets of
        # traits. Having the trait on the function call is useful as a way
        # to filter function calls at code-generation time.
        #
        # This field is updated by `rust_param_impl`, which is used in
        # grammar extensions and is visited before any CallMethod is
        # produced.
        self.method_trait = "AstBuilder"
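
    # Illustrative note (the trait name here is a made-up example): when a
    # grammar extension declares something like `impl SomeTrait for X`,
    # rust_param_impl() below switches method_trait to "SomeTrait", so the
    # CallMethods generated while parsing that extension are typed as methods
    # of SomeTrait rather than of AstBuilder.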

    def rust_edsl(self, impl, grammar):
        return extension.GrammarExtension(impl, grammar, self.lexer.filename)

    def rust_param_impl(self, trait, for_type, param):
        self.method_trait = trait
        return extension.ImplFor(param, trait, for_type)

    def rust_impl(self, trait, impl_type):
        return self.rust_param_impl(trait, impl_type, [])

    def rust_nt_def(self, lhs, rhs_line):
        # Right now, this handles only the syntactic grammar, and assumes
        # that every rule patches an existing grammar production by adding
        # code.
        return extension.ExtPatch(self.nt_def(None, lhs, ':', [rhs_line]))

    def rust_rhs_line(self, symbols):
        return self.rhs_line(None, symbols, None, None)

    def rust_expr(self, expr):
        assert isinstance(expr, grammar.CallMethod)
        return expr

    def empty(self):
        return []

    def single(self, x):
        return [x]

    def append(self, x, y):
        return x + [y]

    def concat(self, x, y):
        return x + y

    def blank_line(self):
        return []

    def nt_def_to_list(self, nt_def):
        return [nt_def]

    def to_production(self, lhs, i, rhs, is_sole_production):
        """Wrap a list of grammar symbols `rhs` in a Production object."""
        body, reducer, condition = rhs
        if reducer is None:
            reducer = self.default_reducer(lhs, i, body, is_sole_production)
        return grammar.Production(body, reducer, condition=condition)

    def default_reducer(self, lhs, i, body, is_sole_production):
        assert isinstance(lhs, grammar.Nt)
        nt_name = lhs.name

        nargs = sum(1 for e in body if grammar.is_concrete_element(e))
        if is_sole_production:
            method_name = nt_name
        else:
            method_name = '{} {}'.format(nt_name, i)
        return self.expr_call(method_name, tuple(range(nargs)), None)
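
    # Illustrative note (the nonterminal name and counts are made-up
    # examples): for a nonterminal `Statement` with several productions, the
    # production at index 2 containing two concrete symbols gets an
    # auto-generated reducer roughly equivalent to
    #     self.expr_call('Statement 2', (0, 1), None)
    # i.e. a CallMethod on the current method_trait taking both matched
    # elements as arguments. A sole production just uses the bare name.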

    def needs_asi(self, lhs, p):
        """True if p is a production in which ASI can happen."""
        # The purpose of the fake ForLexicalDeclaration production is to have
        # a copy of LexicalDeclaration that does not trigger ASI.
        #
        # Two productions have body == [";"] -- one for EmptyStatement and
        # one for ClassMember. Neither should trigger ASI.
        #
        # The only other semicolons that should not trigger ASI are the ones
        # in `for` statement productions, which happen to be exactly those
        # semicolons that are not at the end of a production.
        return (not (isinstance(lhs, grammar.Nt)
                     and lhs.name == 'ForLexicalDeclaration')
                and len(p.body) > 1
                and p.body[-1] == ';')

    def apply_asi(self, p, reducer_was_autogenerated):
        """Return two rules based on p, so that ASI can be applied."""
        assert isinstance(p.reducer, grammar.CallMethod)

        if reducer_was_autogenerated:
            # Don't pass the semicolon to the method.
            reducer = self.expr_call(p.reducer.method,
                                     p.reducer.args[:-1],
                                     None)
        else:
            reducer = p.reducer

        # Except for do-while loops, check at runtime that ASI occurs only at
        # the end of a line.
        if (len(p.body) == 7
                and p.body[0] == 'do'
                and p.body[2] == 'while'
                and p.body[3] == '('
                and p.body[5] == ')'
                and p.body[6] == ';'):
            code = "do_while_asi"
        else:
            code = "asi"

        return [
            # The preferred production, with the semicolon in.
            p.copy_with(body=p.body[:],
                        reducer=reducer),
            # The fallback production, performing ASI.
            p.copy_with(body=p.body[:-1] + [grammar.ErrorSymbol(code)],
                        reducer=reducer),
        ]
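
    # Illustrative note (the production shown is a simplified example, not
    # copied from the spec): given a production roughly like
    #     ExpressionStatement : Expression `;`
    # apply_asi() above returns two productions -- the preferred one keeps
    # the `;` terminal, and the fallback replaces it with
    # grammar.ErrorSymbol("asi") so that the parser can insert a virtual
    # semicolon when an error occurs at the end of a line.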

    def expand_lexical_rhs(self, rhs):
        body, reducer, condition = rhs
        out = []
        for e in body:
            if isinstance(e, str):
                # The terminal symbols of the lexical grammar are characters,
                # so add each character of this string as a separate element.
                out += [grammar.Literal(ch) for ch in e]
            else:
                out.append(e)
        return [out, reducer, condition]
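
    # Illustrative note: a multicharacter terminal such as the string '!=='
    # in a lexical rule is expanded by expand_lexical_rhs() above into
    # [grammar.Literal('!'), grammar.Literal('='), grammar.Literal('=')],
    # since the lexical grammar works one code point at a time.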

    def nt_def(self, nt_type, lhs, eq, rhs_list):
        has_sole_production = (len(rhs_list) == 1)
        production_list = []
        for i, rhs in enumerate(rhs_list):
            if eq == ':':
                # Syntactic grammar. A hack is needed for ASI.
                reducer_was_autogenerated = rhs[1] is None
                p = self.to_production(lhs, i, rhs, has_sole_production)
                if self.needs_asi(lhs, p):
                    production_list += self.apply_asi(p, reducer_was_autogenerated)
                else:
                    production_list.append(p)
            elif eq == '::':
                # Lexical grammar. A hack is needed to expand multicharacter
                # terminals like `!==` into sequences of single-character
                # terminals.
                rhs = self.expand_lexical_rhs(rhs)
                p = self.to_production(lhs, i, rhs, has_sole_production)
                production_list.append(p)
        return (lhs.name, eq, grammar.NtDef(lhs.args, production_list, nt_type))

    def nt_def_one_of(self, nt_type, nt_lhs, eq, terminals):
        return self.nt_def(nt_type, nt_lhs, eq,
                           [([t], None, None) for t in terminals])

    def nt_lhs_no_params(self, name):
        return grammar.Nt(name, ())

    def nt_lhs_with_params(self, name, params):
        return grammar.Nt(name, tuple(params))

    def simple_type(self, name):
        return types.Type(name)

    def lifetime_type(self, name):
        return types.Lifetime(name)

    def parameterized_type(self, name, args):
        return types.Type(name, tuple(args))

    def t_list_line(self, terminals):
        return terminals

    def terminal(self, t):
        assert t[0] == "`"
        assert t[-1] == "`"
        return t[1:-1]

    def terminal_chr(self, chr):
        raise ValueError("FAILED: %r" % chr)

    def rhs_line(self, ifdef, rhs, reducer, _prodid):
        return (rhs, reducer, ifdef)

    def rhs_line_prose(self, prose):
        return ([prose], None, None)

    def empty_rhs(self):
        return []

    def expr_match_ref(self, token):
        assert token.startswith('$')
        return int(token[1:])

    def expr_call(self, method, args, fallible):
        # NOTE: Currently "AstBuilder" functions are made fallible using the
        # fallible_methods data, which is extracted from the Rust sources and
        # stored in a JSON file.
        if self.method_trait == "AstBuilder":
            fallible = None
        return grammar.CallMethod(method, args or (),
                                  types.Type(self.method_trait),
                                  fallible is not None)

    def expr_some(self, expr):
        return grammar.Some(expr)

    def expr_none(self):
        return None

    def ifdef(self, value, nt):
        return nt, value

    def optional(self, nt):
        return grammar.Optional(nt)

    def but_not(self, nt, exclusion):
        _, exclusion = exclusion
        return grammar.Exclude(nt, [exclusion])
        # return ('-', nt, exclusion)

    def but_not_one_of(self, nt, exclusion_list):
        exclusion_list = [exclusion for _, exclusion in exclusion_list]
        return grammar.Exclude(nt, exclusion_list)
        # return ('-', nt, exclusion_list)

    def no_line_terminator_here(self, lt):
        if lt not in ('LineTerminator', '|LineTerminator|'):
            raise ValueError("unrecognized directive " + repr("[no " + lt + " here]"))
        return grammar.NoLineTerminatorHere

    def nonterminal(self, name):
        if name in self.terminal_names:
            return name
        return grammar.Nt(name, ())

    def nonterminal_apply(self, name, args):
        if name in self.terminal_names:
            raise ValueError("parameters applied to terminal {!r}".format(name))
        if len(set(k for k, expr in args)) != len(args):
            raise ValueError("parameter passed multiple times")
        return grammar.Nt(name, tuple(args))

    def arg_expr(self, sigil, argname):
        if sigil == '?':
            return (argname, grammar.Var(argname))
        else:
            return (argname, sigil)

    def sigil_false(self):
        return False

    def sigil_true(self):
        return True

    def exclusion_terminal(self, t):
        return ("t", t)

    def exclusion_nonterminal(self, nt):
        return ("nt", nt)

    def exclusion_chr_range(self, c1, c2):
        return ("range", c1, c2)

    def la_eq(self, t):
        return grammar.LookaheadRule(OrderedFrozenSet([t]), True)

    def la_ne(self, t):
        return grammar.LookaheadRule(OrderedFrozenSet([t]), False)

    def la_not_in_nonterminal(self, nt):
        return grammar.LookaheadRule(OrderedFrozenSet([nt]), False)

    def la_not_in_set(self, lookahead_exclusions):
        if all(len(excl) == 1 for excl in lookahead_exclusions):
            return grammar.LookaheadRule(
                OrderedFrozenSet(excl[0] for excl in lookahead_exclusions),
                False)
        raise ValueError("unsupported: lookahead > 1 token, {!r}"
                         .format(lookahead_exclusions))

    def chr(self, t):
        assert t[0] == "<" or t[0] == 'U'
        if t[0] == "<":
            assert t[-1] == ">"
            if t not in ECMASCRIPT_CODE_POINTS:
                raise ValueError("unrecognized character abbreviation {!r}".format(t))
            return ECMASCRIPT_CODE_POINTS[t]
        else:
            assert t[1] == "+"
            return grammar.Literal(chr(int(t[2:], base=16)))
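

# Illustrative sketch only, not part of jsparagus itself: a tiny check of how
# the builder resolves the two CHR spellings accepted by the lexer above. The
# sample inputs are made up for this example.
def _example_builder_chr() -> None:
    builder = ESGrammarBuilder(frozenset())
    assert isinstance(builder.chr('<TAB>'), grammar.Literal)   # abbreviation
    assert isinstance(builder.chr('U+0041'), grammar.Literal)  # hex code point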


def finish_grammar(nt_defs, goals, variable_terminals, synthetic_terminals,
                   single_grammar=True, extensions=[]):
    nt_grammars = {}
    for nt_name, eq, _ in nt_defs:
        if nt_name in nt_grammars:
            raise ValueError(
                "duplicate definitions for nonterminal {!r}"
                .format(nt_name))
        nt_grammars[nt_name] = eq

    # Figure out which grammar we were trying to get (":" for syntactic,
    # "::" for lexical) based on the goal symbols.
    goals = list(goals)
    if len(goals) == 0:
        raise ValueError("no goal nonterminals specified")
    if single_grammar:
        selected_grammars = set(nt_grammars[goal] for goal in goals)
        assert len(selected_grammars) != 0
        if len(selected_grammars) > 1:
            raise ValueError(
                "all goal nonterminals must be part of the same grammar; "
                "got {!r} (matching these grammars: {!r})"
                .format(set(goals), set(selected_grammars)))
        [selected_grammar] = selected_grammars

    terminal_set = set()

    def hack_production(p):
        for i, e in enumerate(p.body):
            if isinstance(e, str) and e[:1] == "`":
                if len(e) < 3 or e[-1:] != "`":
                    raise ValueError(
                        "Unrecognized grammar symbol: {!r} (in {!r})"
                        .format(e, p))
                p.body[i] = token = e[1:-1]
                terminal_set.add(token)

    nonterminals = {}
    for nt_name, eq, rhs_list_or_lambda in nt_defs:
        if single_grammar and eq != selected_grammar:
            continue

        if isinstance(rhs_list_or_lambda, grammar.NtDef):
            nonterminals[nt_name] = rhs_list_or_lambda
        else:
            rhs_list = rhs_list_or_lambda
            for p in rhs_list:
                if not isinstance(p, grammar.Production):
                    raise ValueError(
                        "invalid grammar: ifdef in non-function-call context")
                hack_production(p)
            if nt_name in nonterminals:
                raise ValueError(
                    "unsupported: multiple definitions for nt " + nt_name)
            nonterminals[nt_name] = rhs_list

    for t in terminal_set:
        if t in nonterminals:
            raise ValueError(
                "grammar contains both a terminal `{}` and nonterminal {}"
                .format(t, t))

    # Add execution modes to generate the various functions needed to handle
    # the syntax-parsing and full-parsing execution modes.
    exec_modes = collections.defaultdict(OrderedSet)
    noop_parser = types.Type("ParserTrait", (types.Lifetime("alloc"), types.UnitType))
    token_parser = types.Type("ParserTrait", (
        types.Lifetime("alloc"), types.Type("StackValue", (types.Lifetime("alloc"),))))
    ast_builder = types.Type("AstBuilderDelegate", (types.Lifetime("alloc"),))

    # Full parsing takes tokens as input and builds an AST.
    exec_modes["full_actions"].extend([token_parser, ast_builder])

    # Syntax parsing takes tokens as input but skips building the AST.
    # TODO: The syntax parser is commented out for now, as we need something
    # to be produced when we cannot call the AstBuilder to produce the
    # values.

    # No-op parsing is used for the simulator, which is so far used to query
    # whether the incremental input can end at this point, and to look up
    # whether a state can accept certain kinds of tokens.
    exec_modes["noop_actions"].add(noop_parser)

    # Extensions use an equivalent of Rust types to specify the kinds of
    # parsers they apply to; this map converts those type names to the
    # various execution modes.
    full_parser = types.Type("FullParser")
    syntax_parser = types.Type("SyntaxParser")
    noop_parser = types.Type("NoopParser")
    type_to_modes = {
        noop_parser: ["noop_actions", "full_actions"],
        syntax_parser: ["full_actions"],
        full_parser: ["full_actions"],
    }

    result = grammar.Grammar(
        nonterminals,
        goal_nts=goals,
        variable_terminals=variable_terminals,
        synthetic_terminals=synthetic_terminals,
        exec_modes=exec_modes,
        type_to_modes=type_to_modes)
    result.patch(extensions)
    return result


def parse_esgrammar(
        text: str,
        *,
        filename: Optional[str] = None,
        extensions: Iterable[Tuple[os.PathLike, int, str]] = (),
        goals: Optional[Iterable[str]] = None,
        terminal_names: Iterable[str] = (),
        synthetic_terminals: Optional[Dict[str, OrderedSet[str]]] = None,
        single_grammar: bool = True
) -> grammar.Grammar:
    if not text.endswith("\n\n"):
        # Horrible hack: add a blank line at the end of the document so that
        # the esgrammar grammar can use newlines as delimiters. :-P
        text += "\n"

    terminal_names = frozenset(terminal_names)
    if synthetic_terminals is None:
        synthetic_terminals = {}

    builder = ESGrammarBuilder(terminal_names)
    parser = ESGrammarParser(builder=builder, goal="grammar")
    lexer = ESGrammarLexer(parser, filename=filename)
    lexer.write(text)
    nt_defs = lexer.close()

    grammar_extensions = []
    for ext_filename, start_lineno, content in extensions:
        builder.reset()
        parser = ESGrammarParser(builder=builder, goal="rust_edsl")
        lexer = ESGrammarLexer(parser, filename=ext_filename)
        builder.lexer = lexer
        lexer.start_lineno = start_lineno
        lexer.write(content)
        result = lexer.close()
        grammar_extensions.append(result)

    if goals is None:
        # Default to the first nonterminal in the input.
        goals = [nt_defs[0][0]]

    return finish_grammar(
        nt_defs,
        goals=goals,
        variable_terminals=terminal_names - frozenset(synthetic_terminals),
        synthetic_terminals=synthetic_terminals,
        single_grammar=single_grammar,
        extensions=grammar_extensions)
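

# Illustrative usage sketch only -- the source variable, filename, goal names,
# and terminal list below are invented for this comment; real callers feed
# the ECMAScript spec grammar and its goal symbols:
#
#     g = parse_esgrammar(
#         source_text,                      # ECMArkup grammar source
#         filename="es-syntactic.esgrammar",
#         goals=["Script", "Module"],
#         terminal_names=["IdentifierName", "NumericLiteral"],
#     )
#
# The result is a jsparagus grammar.Grammar whose goal nonterminals are the
# requested `goals`, patched with any parsed grammar extensions.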