# (Removed page chrome accidentally captured with this file: "Source code",
# "Revision control", "Copy as Markdown", "Other Tools" — viewer UI text,
# not part of the module.)
""" Data structures for representing grammars. """
from __future__ import annotations
# mypy: disallow-untyped-defs, disallow-incomplete-defs, disallow-untyped-calls
import copy
import typing
import dataclasses
from dataclasses import dataclass
from .ordered import OrderedSet, OrderedFrozenSet
from . import types
# *** What is a grammar? ******************************************************
#
# A grammar is a dictionary mapping nonterminal names to lists of right-hand
# sides. Each right-hand side (also called a "production") is a list whose
# elements can include terminals, nonterminals, Optional elements, LookaheadRules,
# and NoLineTerminatorHere.
#
# The most common elements are terminals and nonterminals, so a grammar usually
# looks something like this:
def example_grammar() -> Grammar:
    """Build and return a tiny arithmetic-expression grammar.

    Demonstrates the "list-of-lists-of-strings" input form that
    Grammar.__init__ accepts and normalizes.
    """
    productions: typing.Dict[typing.Union[str, InitNt, Nt], LenientNtDef] = {
        'expr': [
            ['term'],
            ['expr', '+', 'term'],
            ['expr', '-', 'term'],
        ],
        'term': [
            ['unary'],
            ['term', '*', 'unary'],
            ['term', '/', 'unary'],
        ],
        'unary': [
            ['prim'],
            ['-', 'unary'],
        ],
        'prim': [
            ['NUM'],
            ['VAR'],
            ['(', 'expr', ')'],
        ],
    }
    # `goal_nts` names the nonterminals we actually want to parse; everything
    # else exists only as a building block for them. `variable_terminals` are
    # token classes that carry data (any number, any identifier) rather than
    # a single fixed spelling.
    return Grammar(productions, goal_nts=['expr'], variable_terminals=['NUM', 'VAR'])
# A (parameter_name, value) pair guarding a Production: the production is
# live only in grammar variants where the named parameter has that value.
Condition = typing.Tuple[str, bool]
# A production consists of a left side, an optional condition, a right side,
# and a reducer. A `Production` object includes everything except the left
# side. Incorporating reducers lets us transform a grammar while preserving
# behavior.
#
# The production `expr ::= term` is represented by
# `Production(["term"], 0)`.
#
# The production `expr ::= expr + term => add($0, $2)` is represented by
# `Production(["expr", "+", "term"], CallMethod("add", (0, 2)))`.
#
@dataclass
class Production:
    """One right-hand side of a grammar rule, together with its reducer.

    The left side (the nonterminal being defined) is *not* stored here; a
    Production lives in the rhs_list of the NtDef for its left side.
    """

    __slots__ = ['body', 'reducer', 'condition']

    # The sequence of Elements (terminals, nonterminals, lookahead rules,
    # etc.) this production matches.
    body: typing.List[Element]
    # Reduce expression evaluated when the production matches, or the special
    # string 'accept' (init nonterminals only).
    reducer: ReduceExprOrAccept
    # Optional (parameter, value) guard; None means unconditional.
    condition: typing.Optional[Condition]

    # Hand-written __init__ so that `condition` is keyword-only; @dataclass
    # keeps a user-defined __init__ instead of generating one.
    def __init__(self,
                 body: typing.List[Element],
                 reducer: ReduceExprOrAccept,
                 *,
                 condition: typing.Optional[Condition] = None):
        self.body = body
        self.reducer = reducer
        self.condition = condition

    def __repr__(self) -> str:
        # Omit `condition` from the repr when it is None, for brevity.
        if self.condition is None:
            return "Production({!r}, reducer={!r})".format(self.body, self.reducer)
        else:
            return ("Production({!r}, reducer={!r}, condition={!r})"
                    .format(self.body, self.reducer, self.condition))

    def copy_with(self, **kwargs: typing.Any) -> Production:
        """Return a copy of self with the given fields replaced.

        dataclasses.replace() re-invokes __init__ with the updated field
        values, so the keyword-only `condition` is handled correctly.
        """
        return dataclasses.replace(self, **kwargs)
# *** Reduce expressions ******************************************************
#
# Reduce expressions ("reducers" for short) are a little language used to
# specify what happens when a production is matched. A reduce expression is
# one of these:
#
# * An integer evaluates to one of the syntactic components of the
# production. For example, if the production we're reducing is
# `sum ::= sum + term`, the integer `0` evaluates the `sum` to the left of
# the plus sign, and `2` means the `term` on the right. (The integer
# `1` would refer to the `+` token itself, but that's not super useful.)
#
# These integers are not quite indexes into `production.body`, because
# LookaheadRule, ErrorSymbol, and NoLineTerminatorHere elements don't
# count: in the production `stmt ::= [lookahead != "let"] expr ";"`, `0` is
# the expr, and `1` is the semicolon token. See `is_concrete_element(e)`.
#
# * CallMethod objects pass values to a builder method and return the result.
# The `args` are nested reduce expressions.
#
# * None is an expression used as a placeholder when an optional symbol is
# omitted.
#
# * Some(expr) is used when an optional symbol is found and parsed.
# In Python, this just expands to the same thing as `expr`, but in Rust
# this expands to a use of `Option::Some()`.
#
# In addition, the special reducer 'accept' means stop parsing. This is
# used only in productions for init nonterminals, created automatically by
# Grammar.__init__(). It's not a reduce expression, so it can't be nested.
@dataclass(frozen=True)
class CallMethod:
    """Express a method call, and give it a given set of arguments. A trait is
    added as the parser should implement this trait to call this method."""

    # Name of the builder method to call.
    method: str
    # Nested reduce expressions supplying the arguments.
    args: typing.Tuple[ReduceExpr, ...]
    # Trait the parser must implement to provide `method`.
    trait: types.Type = types.Type("AstBuilder")
    # True if the method can fail (rendered with a trailing '?' in Rust).
    fallible: bool = False
@dataclass(frozen=True)
class Some:
    """Reduce expression wrapping the value of an optional symbol that was
    actually present (Rust `Option::Some`; a no-op in Python)."""

    # The wrapped reduce expression.
    inner: ReduceExpr
def expr_to_str(expr: ReduceExprOrAccept) -> str:
    """Format a reduce expression (or 'accept') as a human-readable string."""
    if isinstance(expr, int):
        return "${}".format(expr)
    if isinstance(expr, CallMethod):
        rendered_args = ', '.join(expr_to_str(arg) for arg in expr.args)
        suffix = '?' if expr.fallible else ''
        return "{}::{}({}){}".format(expr.trait, expr.method, rendered_args, suffix)
    if expr is None:
        return "None"
    if isinstance(expr, Some):
        return "Some({})".format(expr_to_str(expr.inner))
    if expr == "accept":
        return "<accept>"
    raise ValueError("unrecognized expression: {!r}".format(expr))
# Type parameter for Grammar.intern().
Internable = typing.TypeVar("Internable")

# Maps a synthetic terminal's name to the set of real terminals it stands for.
SyntheticTerminalsDict = typing.Dict[str, OrderedFrozenSet[str]]
class Grammar:
    """A collection of productions.

    * self.variable_terminals - OrderedFrozenSet[str] - Terminals that carry
      data, like (in JS) numeric literals and RegExps.

    * self.terminals - OrderedFrozenSet[str] - All terminals used in the
      language, including those in self.variable_terminals and
      self.synthetic_terminals.

    * self.synthetic_terminals - {str: OrderedFrozenSet[str]} - Maps terminals
      to sets of terminals. An entry `name: set` in this dictionary means
      that `name` is shorthand for "one of the terminals in `set`".

    * self.nonterminals - {LenientNt: NtDef} - Keys are either (str|InitNt), early
      in the pipeline, or Nt objects later on. Values are the NtDef objects
      that contain the actual Productions.

    * self.methods - {str: MethodType} - Type information for methods.

    * self.init_nts - [InitNt or Nt] - The list of all elements of
      self.nonterminals.keys() that are init nonterminals.

    * self.exec_modes - DefaultDict{str, OrderedSet[Type]} or None - Per-mode
      trait sets for generated backends (only stored and consumed by patch();
      semantics otherwise not visible here).

    * self.type_to_modes - {Type: [str]} or None - Reverse mapping from a
      trait's implementing type to the execution modes using it (used by
      patch(); semantics otherwise not visible here).

    * self._cache - {Any: Any} - Cache of immutable objects used by
      Grammar.intern().
    """

    variable_terminals: OrderedFrozenSet[str]
    terminals: OrderedFrozenSet[typing.Union[str, End]]
    synthetic_terminals: SyntheticTerminalsDict
    nonterminals: typing.Dict[LenientNt, NtDef]
    methods: typing.Dict[str, types.MethodType]
    init_nts: typing.List[typing.Union[Nt, InitNt]]
    exec_modes: typing.Optional[typing.DefaultDict[str, OrderedSet[types.Type]]]
    type_to_modes: typing.Optional[typing.Mapping[types.Type, typing.List[str]]]
    _cache: typing.Dict[typing.Any, typing.Any]
def __init__(
self,
nonterminals: typing.Mapping[LenientNt, LenientNtDef],
*,
goal_nts: typing.Optional[typing.Iterable[LenientNt]] = None,
variable_terminals: typing.Iterable[str] = (),
synthetic_terminals: SyntheticTerminalsDict = None,
method_types: typing.Optional[typing.Dict[str, types.MethodType]] = None,
exec_modes: typing.Optional[typing.DefaultDict[str, OrderedSet[types.Type]]] = None,
type_to_modes: typing.Optional[typing.Mapping[types.Type, typing.List[str]]] = None):
# This constructor supports passing in a sort of jumbled blob of
# strings, lists, and actual objects, and normalizes it all to a more
# typeful structure. Being able to interpret simple
# "list-of-lists-of-strings" input is super useful for tests.
#
# We don't check here that the grammar is LR, that it's cycle-free, or
# any other nice properties.
# Copy/infer the arguments.
my_nonterminals: typing.Dict[LenientNt, LenientNtDef] = dict(nonterminals.items())
if goal_nts is None:
# Default to the first nonterminal in the dictionary.
my_goal_nts = []
for name in my_nonterminals:
my_goal_nts.append(name)
break
else:
my_goal_nts = list(goal_nts)
self.variable_terminals = OrderedFrozenSet(variable_terminals)
if synthetic_terminals is None:
synthetic_terminals = {}
else:
synthetic_terminals = {
name: OrderedFrozenSet(set)
for name, set in synthetic_terminals.items()
}
for synthetic_key, values in synthetic_terminals.items():
if synthetic_key in my_nonterminals:
raise ValueError(
"invalid grammar: {!r} is both a terminal and a nonterminal"
.format(synthetic_key))
for t in values:
if t in my_nonterminals:
raise ValueError(
"invalid grammar: {!r} is both ".format(t)
+ "a representation of a synthetic terminal and a nonterminal")
if t in synthetic_terminals:
# Nested synthetic terminals. Throw, for now. (This
# should pretty much just work, except expand_terminal
# doesn't support it; and we don't check for cycles
# where a synthetic terminal includes itself directly
# or indirectly.)
raise ValueError(
"unsupported: synthetic terminals can't include other "
"synthetic terminals; {!r} includes {!r}"
.format(synthetic_key, t))
# self.synthetic_terminals = synthetic_terminals
self.synthetic_terminals = {}
keys_are_nt = isinstance(next(iter(my_nonterminals)), Nt)
key_type: typing.Union[typing.Type, typing.Tuple[typing.Type, ...]]
key_type = Nt if keys_are_nt else (str, InitNt)
self._cache = {}
# Gather some information just by looking at keys (without examining
# every production).
#
# str_to_nt maps the name of each non-parameterized
# nonterminal to `Nt(name)`, a cache.
str_to_nt: typing.Dict[typing.Union[str, InitNt], Nt] = {}
# nt_params lists the names of each nonterminal's parameters (empty
# tuple for non-parameterized nts).
nt_params: typing.Dict[typing.Union[str, InitNt], typing.Tuple[str, ...]] = {}
for key in my_nonterminals:
if not isinstance(key, key_type):
raise ValueError(
"invalid grammar: conflicting key types in nonterminals dict - "
"expected either all str or all Nt, got {!r}"
.format(key.__class__.__name__))
nt_name: typing.Union[str, InitNt]
param_names: typing.Tuple[str, ...]
if keys_are_nt:
assert isinstance(key, Nt)
nt_name = key.name
param_names = tuple(name for name, value in key.args)
else:
assert isinstance(key, (str, InitNt))
nt_name = key
param_names = ()
my_nt = my_nonterminals[key]
if isinstance(my_nt, NtDef):
param_names = tuple(my_nt.params)
if nt_name not in nt_params:
nt_params[nt_name] = param_names
else:
if nt_params[nt_name] != param_names:
raise ValueError(
"conflicting parameter name lists for nt {!r}: "
"both {!r} and {!r}"
.format(nt_name, nt_params[nt_name], param_names))
if param_names == () and nt_name not in str_to_nt:
str_to_nt[nt_name] = self.intern(Nt(nt_name))
# Validate, desugar, and copy the grammar. As a side effect, calling
# validate_element on every element of the grammar populates
# all_terminals.
all_terminals: OrderedSet[typing.Union[str, End]] = OrderedSet(self.variable_terminals)
all_terminals.add(End())
def note_terminal(t: str) -> None:
"""Add t (and all representations of it, if synthetic) to all_terminals."""
if t not in all_terminals:
all_terminals.add(t)
if t in self.synthetic_terminals:
for k in self.synthetic_terminals[t]:
note_terminal(k)
# Note: i and j are normally list indexes, but they are sometimes the
# special string '?'. It's OK because they are used only in error
# messages.
def validate_element(
nt: LenientNt,
i: typing.Union[int, str],
j: typing.Union[int, str],
e: Element,
context_params: typing.Tuple[str, ...]
) -> Element:
if isinstance(e, str):
if e in nt_params:
if nt_params[e] != ():
raise ValueError(
"invalid grammar: missing parameters for {!r} "
"in production `grammar[{!r}][{}][{}].inner`: {!r}"
.format(e, nt, i, j, nt_params[e]))
return str_to_nt[e]
else:
note_terminal(e)
return e
elif isinstance(e, Optional):
if not isinstance(e.inner, (str, Nt)):
raise TypeError(
"invalid grammar: unrecognized element "
"in production `grammar[{!r}][{}][{}].inner`: {!r}"
.format(nt, i, j, e.inner))
inner = validate_element(nt, i, j, e.inner, context_params)
return self.intern(Optional(inner))
elif isinstance(e, Literal):
if not isinstance(e.text, str):
raise TypeError(
"invalid grammar: unrecognized element "
"in production `grammar[{!r}][{}][{}].text`: {!r}"
.format(nt, i, j, e.text))
return self.intern(e)
elif isinstance(e, UnicodeCategory):
if not isinstance(e.cat_prefix, str):
raise TypeError(
"invalid grammar: unrecognized element "
"in production `grammar[{!r}][{}][{}].cat_prefix`: {!r}"
.format(nt, i, j, e.cat_prefix))
return self.intern(e)
elif isinstance(e, Exclude):
if not isinstance(e.inner, (str, Nt)):
raise TypeError(
"invalid grammar: unrecognized element "
"in production `grammar[{!r}][{}][{}].inner`: {!r}"
.format(nt, i, j, e.inner))
exclusion_list = []
for value in e.exclusion_list:
if not isinstance(value, (str, Nt)):
raise TypeError(
"invalid grammar: unrecognized element "
"in production `grammar[{!r}][{}][{}].exclusion_list`: {!r}"
.format(nt, i, j, value))
value = validate_element(nt, i, j, value, context_params)
exclusion_list.append(value)
inner = validate_element(nt, i, j, e.inner, context_params)
return self.intern(Exclude(inner, tuple(exclusion_list)))
elif isinstance(e, Nt):
# Either the application or the original parameterized
# production must be present in the dictionary.
if e not in my_nonterminals and e.name not in my_nonterminals:
raise ValueError(
"invalid grammar: unrecognized nonterminal "
"in production `grammar[{!r}][{}][{}]`: {!r}"
.format(nt, i, j, e.name))
args = tuple(pair[0] for pair in e.args)
if e.name in nt_params and args != nt_params[e.name]:
raise ValueError(
"invalid grammar: wrong arguments passed to {!r} "
"in production `grammar[{!r}][{}][{}]`: "
"passed {!r}, expected {!r}"
.format(e.name, nt, i, j,
args, nt_params[e.name]))
for param_name, arg_expr in e.args:
if isinstance(arg_expr, Var):
if arg_expr.name not in context_params:
raise ValueError(
"invalid grammar: undefined variable {!r} "
"in production `grammar[{!r}][{}][{}]`"
.format(arg_expr.name, nt, i, j))
return self.intern(e)
elif isinstance(e, (LookaheadRule, End, ErrorSymbol)):
return self.intern(e)
elif e is NoLineTerminatorHere:
return e
elif isinstance(e, CallMethod):
return self.intern(e)
else:
raise TypeError(
"invalid grammar: unrecognized element in production "
"`grammar[{!r}][{}][{}]`: {!r}"
.format(nt, i, j, e))
assert False, "unreachable"
def check_reduce_expr(
nt: LenientNt,
i: int,
rhs: Production,
expr: ReduceExprOrAccept) -> None:
if isinstance(expr, int):
concrete_len = sum(1 for e in rhs.body
if is_concrete_element(e))
if not (0 <= expr < concrete_len):
raise ValueError(
"invalid grammar: element number {} out of range for "
"production {!r} in grammar[{!r}][{}].reducer ({!r})"
.format(expr, nt, rhs.body, i, rhs.reducer))
elif isinstance(expr, CallMethod):
if not isinstance(expr.method, str):
raise TypeError(
"invalid grammar: method names must be strings, "
"not {!r}, in grammar[{!r}[{}].reducer"
.format(expr.method, nt, i))
if not expr.method.isidentifier():
name, space, pn = expr.method.partition(' ')
if space == ' ' and name.isidentifier() and pn.isdigit():
pass
else:
raise ValueError(
"invalid grammar: invalid method name {!r} "
"(not an identifier), in grammar[{!r}[{}].reducer"
.format(expr.method, nt, i))
for arg_expr in expr.args:
check_reduce_expr(nt, i, rhs, arg_expr)
elif expr is None:
pass
elif isinstance(expr, Some):
check_reduce_expr(nt, i, rhs, expr.inner)
else:
raise TypeError(
"invalid grammar: unrecognized reduce expression {!r} "
"in grammar[{!r}][{}].reducer"
.format(expr, nt, i))
def copy_rhs(
nt: LenientNt,
i: int,
sole_production: bool,
rhs: LenientProduction,
context_params: typing.Tuple[str, ...]) -> Production:
if isinstance(rhs, list):
# Bare list, no reducer. Desugar to a Production, inferring a
# reasonable default reducer.
nargs = sum(1 for e in rhs if is_concrete_element(e))
reducer: ReduceExpr
if len(rhs) == 1 and nargs == 1:
reducer = 0 # don't call a method, just propagate the value
else:
# Call a method named after the production. If the
# nonterminal has exactly one production, there's no need
# to include the production index `i` to the method name.
if sole_production:
method = str(nt)
else:
method = '{}_{}'.format(nt, i)
reducer = CallMethod(method, tuple(range(nargs)))
rhs = Production(rhs, reducer)
if not isinstance(rhs, Production):
raise TypeError(
"invalid grammar: grammar[{!r}][{}] should be "
"a Production or list of grammar symbols, not {!r}"
.format(nt, i, rhs))
if rhs.condition is not None:
param, value = rhs.condition
if param not in context_params:
raise TypeError(
"invalid grammar: undefined parameter {!r} "
"in conditional for grammar[{!r}][{}]"
.format(param, nt, i))
if rhs.reducer != 'accept':
check_reduce_expr(nt, i, rhs, rhs.reducer)
assert isinstance(rhs.body, list)
return rhs.copy_with(body=[
validate_element(nt, i, j, e, context_params)
for j, e in enumerate(rhs.body)
])
def copy_nt_def(
nt: LenientNt,
nt_def: typing.Union[NtDef, typing.List[LenientProduction]],
) -> NtDef:
rhs_list: typing.Sequence[LenientProduction]
if isinstance(nt_def, NtDef):
for i, param in enumerate(nt_def.params):
if not isinstance(param, str):
raise TypeError(
"invalid grammar: parameter {} of {} should be "
"a string, not {!r}"
.format(i + 1, nt, param))
params = nt_def.params
rhs_list = nt_def.rhs_list
ty = nt_def.type
else:
params = ()
rhs_list = nt_def
ty = None
if not isinstance(rhs_list, list):
raise TypeError(
"invalid grammar: grammar[{!r}] should be either a "
"list of right-hand sides or NtDef, not {!r}"
.format(nt, type(rhs_list).__name__))
sole_production = len(rhs_list) == 1
productions = [copy_rhs(nt, i, sole_production, rhs, params)
for i, rhs in enumerate(rhs_list)]
return NtDef(params, productions, ty)
def check_nt_key(nt: LenientNt) -> None:
if isinstance(nt, str):
if not nt.isidentifier():
raise ValueError(
"invalid grammar: nonterminal names must be identifiers, not {!r}"
.format(nt))
if nt in self.variable_terminals or nt in self.synthetic_terminals:
raise TypeError(
"invalid grammar: {!r} is both a nonterminal and a variable terminal"
.format(nt))
elif isinstance(nt, Nt):
assert keys_are_nt # checked earlier
if not (isinstance(nt.name, (str, InitNt))
and isinstance(nt.args, tuple)):
raise TypeError(
"invalid grammar: expected str or Nt(name=str, "
"args=tuple) keys in nonterminals dict, got {!r}"
.format(nt))
check_nt_key(nt.name)
for pair in nt.args:
if (not isinstance(pair, tuple)
or len(pair) != 2
or not isinstance(pair[0], str)
or not isinstance(pair[1], bool)):
raise TypeError(
"invalid grammar: expected tuple((str, bool)) args, got {!r}"
.format(nt))
elif isinstance(nt, InitNt):
# Users don't include init nonterminals when initially creating
# a Grammar. They are automatically added below. But if this
# Grammar is being created by hacking on a previous Grammar, it
# will already have them.
if not isinstance(nt.goal, Nt):
raise TypeError(
"invalid grammar: InitNt.goal should be a nonterminal, "
"got {!r}"
.format(nt))
# nt.goal is a "use", not a "def". Check it like a use.
# Bogus question marks appear in error messages :-|
validate_element(nt, '?', '?', nt.goal, ())
if nt.goal not in my_goal_nts:
raise TypeError(
"invalid grammar: nonterminal referenced by InitNt "
"is not in the list of goals: {!r}"
.format(nt))
else:
raise TypeError(
"invalid grammar: expected string keys in nonterminals dict, got {!r}"
.format(nt))
def validate_nt(
nt: LenientNt,
nt_def: LenientNtDef
) -> typing.Tuple[LenientNt, NtDef]:
check_nt_key(nt)
if isinstance(nt, InitNt):
# Check the form of init productions. Initially these look like
# [[goal]], but after the pipeline goes to work, they can be
# [[Optional(goal)]] or [[], [goal]].
if not isinstance(nt_def, NtDef):
raise TypeError(
"invalid grammar: key {!r} must map to "
"value of type NtDef, not {!r}"
.format(nt, nt_def))
rhs_list = nt_def.rhs_list
g = nt.goal
if (rhs_list != [Production([g], 0),
Production([Nt(nt, ()), End()], 'accept')]
and rhs_list != [Production([Optional(g)], 0),
Production([Nt(nt, ()), End()], 'accept')]
and rhs_list != [Production([End()], 'accept'),
Production([g, End()], 'accept'),
Production([Nt(nt, ()), End()], 'accept')]):
raise ValueError(
"invalid grammar: grammar[{!r}] is not one of "
"the expected forms: got {!r}"
.format(nt, rhs_list))
return nt, copy_nt_def(nt, nt_def)
self.nonterminals = {}
for nt1, nt_def1 in my_nonterminals.items():
nt, nt_def = validate_nt(nt1, nt_def1)
self.nonterminals[nt] = nt_def
for syn_term_name, t_set in synthetic_terminals.items():
nt_def = NtDef((syn_term_name,), [Production([e], 0) for e in t_set], None)
nt, nt_def = validate_nt(syn_term_name, nt_def)
self.nonterminals[nt] = nt_def
# Remove synthetic terminals from the list of terminals.
all_terminals.remove(syn_term_name)
self.terminals = OrderedFrozenSet(all_terminals)
# Check types of reduce expressions and infer method types. But if the
# caller passed in precalculated type info, skip it -- otherwise we
# would redo type checking many times as we make minor changes to the
# Grammar along the pipeline.
if method_types is None:
types.infer_types(self)
else:
for nt, nt_def in self.nonterminals.items():
assert isinstance(nt_def, NtDef)
assert isinstance(nt_def.type, types.Type)
self.methods = method_types
# Synthesize "init" nonterminals.
self.init_nts = []
for goal in my_goal_nts:
# Convert str goals to Nt objects and validate.
if isinstance(goal, str):
ok = goal in str_to_nt
if ok:
goal = str_to_nt[goal]
elif isinstance(goal, Nt):
if keys_are_nt:
ok = goal in my_nonterminals
else:
ok = goal.name in my_nonterminals
if not ok:
raise ValueError(
"goal nonterminal {!r} is undefined".format(goal))
assert isinstance(goal, Nt)
# Weird, but the key of an init nonterminal really is
# `Nt(InitNt(Nt(goal_name, goal_args)), ())`. It takes no arguments,
# but it refers to a goal that might take arguments.
init_nt = InitNt(goal)
init_key: LenientNt = init_nt
goal_nt = Nt(init_nt, ())
if keys_are_nt:
init_key = goal_nt
if init_key not in self.nonterminals:
self.nonterminals[init_key] = NtDef(
(),
[Production([goal], 0),
Production([goal_nt, End()], 'accept')],
types.NoReturnType)
self.init_nts.append(goal_nt)
# Add the various execution backends which would rely on the same parse table.
self.exec_modes = exec_modes
self.type_to_modes = type_to_modes
    # The argument is a list of extension.GrammarExtensions. The annotation is
    # vague because this module does not import the extension module. It would
    # be a cyclic dependency.
    def patch(self, extensions: typing.List) -> None:
        """Apply grammar extensions in place.

        For each extension: register its target trait under every execution
        mode that uses the trait's implementing type, then let the extension
        rewrite a working copy of the nonterminals dict. The copy is swapped
        in only after all extensions have run. No-op for an empty list.
        """
        # Both maps must have been supplied to __init__ for patching to work.
        assert self.type_to_modes is not None
        assert self.exec_modes is not None
        if extensions == []:
            return
        # Copy of nonterminals which would be mutated by the patches.
        nonterminals = copy.copy(self.nonterminals)
        for ext in extensions:
            # Add the given trait to the execution mode, depending on which
            # type it got implemented for.
            for mode in self.type_to_modes[ext.target.for_type]:
                self.exec_modes[mode].add(ext.target.trait)
            # Apply grammar transformations.
            ext.apply_patch(self, nonterminals)
        # Replace with the modified version of nonterminals
        self.nonterminals = nonterminals
def intern(self, obj: Internable) -> Internable:
"""Return a shared copy of the immutable object `obj`.
This saves memory and consistent use allows code to use `is` for
equality testing.
"""
try:
return self._cache[obj]
except KeyError:
self._cache[obj] = obj
return obj
    # Terminals are tokens that must appear verbatim in the input wherever they
    # appear in the grammar, like the operators '+' '-' '*' '/' and brackets
    # '(' ')' in the example grammar.
    def is_terminal(self, element: object) -> bool:
        """True if `element` is a terminal symbol.

        Uses `type(...) is str` rather than isinstance — presumably so str
        subclasses used elsewhere don't count as terminals; confirm before
        relaxing.
        """
        return type(element) is str
def expand_set_of_terminals(
self,
terminals: typing.Iterable[typing.Union[str, None, ErrorSymbol]]
) -> OrderedSet[typing.Union[str, None, ErrorSymbol]]:
"""Copy a set of terminals, replacing any synthetic terminals with their representations.
Returns a new OrderedSet.
terminals - an iterable of terminals and/or other unique elements like
None or ErrorSymbol.
"""
result: OrderedSet[typing.Union[str, None, ErrorSymbol]] = OrderedSet()
for t in terminals:
if isinstance(t, str) and t in self.synthetic_terminals:
result |= self.expand_set_of_terminals(self.synthetic_terminals[t])
else:
result.add(t)
return result
    def goals(self) -> typing.List[Nt]:
        """Return a list of this grammar's goal nonterminals.

        Each element of self.init_nts is Nt(InitNt(goal), ()); this unwraps
        the goal. (mypy can't see that .name is an InitNt here.)
        """
        return [init_nt.name.goal for init_nt in self.init_nts]  # type: ignore
    def with_nonterminals(
            self,
            nonterminals: typing.Mapping[LenientNt, LenientNtDef]
    ) -> Grammar:
        """Return a copy of self with the same attributes except different nonterminals.

        Passing self.methods as method_types skips re-running type inference
        in the new Grammar's constructor, so the new defs must already carry
        types — hence the assertions below.
        """
        if self.methods is not None:
            for nt_def in nonterminals.values():
                assert isinstance(nt_def, NtDef)
                assert nt_def.type is not None
        return Grammar(
            nonterminals,
            goal_nts=self.goals(),
            variable_terminals=self.variable_terminals,
            synthetic_terminals=self.synthetic_terminals,
            method_types=self.methods,
            exec_modes=self.exec_modes,
            type_to_modes=self.type_to_modes)
    # === A few methods for dumping pieces of grammar.

    def element_to_str(self, e: Element) -> str:
        """Render a single grammar element as it appears in dumps.

        The isinstance chain is order-sensitive: Nt must be tested before the
        terminal check, and specific element classes before the generic
        fallback.
        """
        if isinstance(e, Nt):
            return e.pretty()
        elif self.is_terminal(e):
            assert isinstance(e, str)
            if e in self.variable_terminals or e in self.synthetic_terminals:
                # Token classes (NUM, VAR, ...) are shown bare, not quoted.
                return e
            # Literal terminals are shown quoted; repr()[1:-1] yields the
            # escaped body without Python's surrounding quotes.
            return '"' + repr(e)[1:-1] + '"'
        elif isinstance(e, Optional):
            return self.element_to_str(e.inner) + "?"
        elif isinstance(e, LookaheadRule):
            if len(e.set) == 1:
                op = "==" if e.positive else "!="
                s = repr(list(e.set)[0])
            else:
                op = "in" if e.positive else "not in"
                s = '{' + repr(list(e.set))[1:-1] + '}'
            return "[lookahead {} {}]".format(op, s)
        elif isinstance(e, End):
            return "<END>"
        elif e is NoLineTerminatorHere:
            return "[no LineTerminator here]"
        elif isinstance(e, CallMethod):
            return "{{ {} }}".format(expr_to_str(e))
        else:
            return str(e)
def symbols_to_str(self, rhs: typing.Iterable[Element]) -> str:
return " ".join(self.element_to_str(e) for e in rhs)
    def rhs_to_str(self, rhs: LenientProduction) -> str:
        """Render a right-hand side, including a `#[if ...]` condition prefix
        when the Production is conditional. Accepts either a Production or a
        bare list of symbols."""
        if isinstance(rhs, Production):
            if rhs.condition is None:
                prefix = ''
            else:
                param, value = rhs.condition
                if value is True:
                    condition = "+" + param
                elif value is False:
                    condition = "~" + param
                else:
                    # Conditions are nominally booleans (see Condition), but
                    # render any other value generically just in case.
                    condition = "{} == {!r}".format(param, value)
                prefix = "#[if {}] ".format(condition)
            # Recurse on the bare body list (hits the branches below).
            return prefix + self.rhs_to_str(rhs.body)
        elif len(rhs) == 0:
            return "[empty]"
        else:
            return self.symbols_to_str(rhs)
    def nt_to_str(self, nt: LenientNt) -> str:
        """Render a nonterminal key; Nt objects get their pretty form,
        str/InitNt keys fall back to str()."""
        if isinstance(nt, Nt):
            return self.element_to_str(nt)
        else:
            return str(nt)
    def production_to_str(
            self,
            nt: LenientNt,
            rhs: LenientProduction,
            *reducer: ReduceExpr
    ) -> str:
        """Render a whole production as "nt ::= rhs [=> reducer]...".

        The reducer may be embedded in `rhs` (a Production) or passed
        separately in `*reducer`.
        """
        # As we have two ways of representing productions at the moment, just
        # take multiple arguments :(
        return "{} ::= {}{}".format(
            self.nt_to_str(nt),
            self.rhs_to_str(rhs),
            "".join(" => " + expr_to_str(expr) for expr in reducer))
    # The type of `item` is `lr0.LRItem`. No annotation because this module
    # does not import `lr0`. It would be a cyclic dependency.
    def lr_item_to_str(self, prods: typing.List, item: typing.Any) -> str:
        """Render one LR item: the production with a middle dot at the current
        offset, optional lookahead after ">>", and the follow set in braces."""
        prod = prods[item.prod_index]
        if item.lookahead is None:
            la = []
        else:
            la = [self.element_to_str(item.lookahead)]
        return "{} ::= {} >> {{{}}}".format(
            self.element_to_str(prod.nt),
            # Symbols before the dot, the dot itself, any lookahead, then the
            # symbols still to be matched.
            " ".join([self.element_to_str(e) for e in prod.rhs[:item.offset]]
                     + ["\N{MIDDLE DOT}"]
                     + la
                     + [self.element_to_str(e) for e in prod.rhs[item.offset:]]),
            # "$" marks end-of-input in the follow set.
            ", ".join(
                "$" if t is None else self.element_to_str(t)
                for t in item.followed_by)
        )
def item_set_to_str(
self,
prods: typing.List,
item_set: OrderedFrozenSet
) -> str:
return "{{{}}}".format(
", ".join(self.lr_item_to_str(prods, item) for item in item_set)
)
    def expand_terminal(self, t: str) -> OrderedFrozenSet[str]:
        """Return the set of concrete terminals `t` can stand for: its
        representation set if `t` is synthetic, else the singleton {t}.

        NOTE(review): an *empty* synthetic representation set is falsy and
        would fall through to {t} — confirm empty sets cannot occur here.
        """
        return self.synthetic_terminals.get(t) or OrderedFrozenSet([t])
    def compatible_elements(self, e1: Element, e2: Element) -> bool:
        """True if `e1` and `e2` could match the same token: either equal, or
        both terminals whose expansions (synthetic representation sets)
        overlap."""
        # "type: ignore" because mypy doesn't know that `self.is_terminal(e1)`
        # means `e1` is a terminal, and thus `self.expand_terminal(e1)` is OK.
        return (e1 == e2
                or (self.is_terminal(e1)
                    and self.is_terminal(e2)
                    and len(self.expand_terminal(e1)      # type: ignore
                            & self.expand_terminal(e2)) > 0))  # type: ignore
def compatible_sequences(
self,
seq1: typing.Sequence[Element],
seq2: typing.Sequence[Element]) -> bool:
"""True if the two sequences could be the same terminals."""
return (len(seq1) == len(seq1)
and all(self.compatible_elements(e1, e2) for e1, e2 in zip(seq1, seq2)))
    def dump(self) -> None:
        """Print the whole grammar to stdout in readable BNF-ish form."""
        for nt, nt_def in self.nonterminals.items():
            left_side = self.nt_to_str(nt)
            if nt_def.params:
                # Parameterized nonterminal: show its parameter list.
                left_side += "[" + ", ".join(nt_def.params) + "]"
            print(left_side + " ::=")
            for rhs in nt_def.rhs_list:
                print(" ", self.rhs_to_str(rhs))
            print()
    def dump_type_info(self) -> None:
        """Print inferred types: each nonterminal's type, then each builder
        method's signature."""
        for nt, nt_def in self.nonterminals.items():
            print(nt, nt_def.type)
        for name, mty in self.methods.items():
            print("fn {}({}) -> {}"
                  .format(name,
                          ", ".join(str(ty) for ty in mty.argument_types),
                          str(mty.return_type)))
def is_shifted_element(self, e: Element) -> bool:
if isinstance(e, Nt):
return True
elif self.is_terminal(e):
return True
elif isinstance(e, Optional):
return True
elif isinstance(e, LookaheadRule):
return False
elif isinstance(e, End):
return True
elif e is NoLineTerminatorHere:
return True
return False
@dataclass(frozen=True)
class InitNt:
    """InitNt(goal) is the name of the init nonterminal for the given goal.

    One init nonterminal is created internally for each goal symbol in the grammar.

    The idea is to have a nonterminal that the user has no control over, that is
    never used in any production, but *only* as an entry point for the grammar,
    that always has a single production "init_nt ::= goal_nt". This predictable
    structure makes it easier to get into and out of parsing at run time.

    When an init nonterminal is matched, we take the "accept" action rather than
    a "reduce" action.
    """

    # The goal nonterminal this entry point leads to.
    goal: Nt
# *** Elements ****************************************************************
#
# Elements are the things that can appear in the .body list of a Production:
#
# * Strings represent terminals (see `Grammar.is_terminal`)
#
# * `Nt` objects refer to nonterminals.
#
# * `Optional` objects represent optional elements.
#
# * `LookaheadRule` objects are like lookahead assertions in regular
# expressions.
#
# * The `NoLineTerminatorHere` singleton object can appear between two other
# symbols to rule out line breaks between them.
#
# * `ErrorSymbol` objects never match anything produced by the lexer. Instead
# they match an ErrorToken that's artificially injected into the token
# stream at runtime, by the parser itself, just before a token that does
# not match anything else.
def is_concrete_element(e: Element) -> bool:
    """True if parsing the element `e` produces a value.

    A production's concrete elements can be used in reduce expressions.
    LookaheadRule, ErrorSymbol, and NoLineTerminatorHere match no input and
    so are excluded (they don't count toward reducer element indexes).
    """
    return not isinstance(e, (LookaheadRule, ErrorSymbol, NoLineTerminatorHereClass))
# Nonterminals in the ECMAScript grammar can be parameterized; NtParameter is
# the type of the parameters.
#
# For example, `BindingIdentifier[?Yield, ?Await]` is represented as
# `Nt('BindingIdentifier', (('Yield', Var('Yield')), ('Await', Var('Await'))))`.
#
# A nonterminal-parameter-expression is represented by either a Var object or
# the actual value, a boolean. (In theory, parameters don't *have* to be
# boolean; all the code would probably work for anything hashable. In practice,
# all parameters in the ECMAScript grammar are boolean.)
# Value bound to a nonterminal parameter: a bool or a Var in practice, but
# anything hashable is accepted (see the comment above).
NtParameter = typing.Hashable
class Nt:
    """Nt(name, ((param0, arg0), ...)) - An invocation of a nonterminal.

    Nonterminals are like lambdas: each one in a grammar is defined by an
    NtDef with 0 or more parameters. Parameter names `param0...` are strings;
    the actual arguments `arg0...` are NtParameters (see above).

    Instances hash and compare by (name, args), so they can be used as
    dictionary keys.
    """

    __slots__ = ['name', 'args']

    name: typing.Union[str, InitNt]
    args: typing.Tuple[typing.Tuple[str, NtParameter], ...]

    def __init__(self,
                 name: typing.Union[str, InitNt],
                 args: typing.Tuple[typing.Tuple[str, NtParameter], ...] = ()):
        self.name = name
        self.args = args

    def __hash__(self) -> int:
        # 'nt' tag keeps Nt hashes distinct from plain (name, args) tuples.
        return hash(('nt', self.name, self.args))

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Nt):
            return False
        return self.name == other.name and self.args == other.args

    def __repr__(self) -> str:
        if not self.args:
            return 'Nt({!r})'.format(self.name)
        return 'Nt({!r}, {!r})'.format(self.name, self.args)

    def pretty(self) -> str:
        """Unique version of this Nt to use in the Python runtime.

        Also used in debug/verbose output.
        """
        def show_arg(param: str, val: NtParameter) -> str:
            # +P / ~P for literal booleans, ?P for pass-through variables,
            # P=value otherwise.
            if val is True:
                return '+' + param
            if val is False:
                return '~' + param
            if isinstance(val, Var):
                if val.name == param:
                    return '?' + val.name
                return param + "=" + val.name
            return param + "=" + repr(val)

        if isinstance(self.name, InitNt):
            return "Start_" + self.name.goal.pretty()
        if not self.args:
            return self.name
        return "{}[{}]".format(
            self.name,
            ", ".join(show_arg(param, val) for param, val in self.args))
@dataclass(frozen=True)
class Optional:
    """Optional(nt) matches either nothing or the given nt.

    Optional elements are expanded out before states are calculated, so the
    core of the algorithm never sees them.
    """

    # The element that may or may not appear.
    inner: Element
@dataclass(frozen=True)
class Literal:
    """Literal(str) matches a sequence of characters.

    Literal elements are sequences of characters which are expected to appear
    verbatim in the input.
    """

    # The exact character sequence to match.
    text: str
@dataclass(frozen=True)
class UnicodeCategory:
    """UnicodeCategory(str) matches any character with a category matching
    the cat_prefix.

    UnicodeCategory elements are a set of literal elements which correspond to
    a given unicode cat_prefix.
    """

    # Prefix of the Unicode general category to match against.
    cat_prefix: str
@dataclass(frozen=True)
class LookaheadRule:
    """LookaheadRule(set, pos) imposes a lookahead restriction on whatever follows.

    It never consumes any tokens itself. Instead, the right-hand side

        [LookaheadRule(frozenset(['a', 'b']), False), 'Thing']

    matches a Thing that does not start with the token `a` or `b`.
    """

    # The terminals this restriction refers to.
    set: typing.FrozenSet[str]
    # True: only terminals in `set` may follow.
    # False: any terminal *except* those in `set` may follow.
    positive: bool
# A lookahead restriction really just specifies a set of allowed terminals.
#
# - No lookahead restriction at all is equivalent to a rule specifying all terminals.
#
# - A positive lookahead restriction explicitly lists all allowed tokens.
#
# - A negative lookahead restriction instead specifies the set of all tokens
# except a few.
#
def lookahead_contains(rule: typing.Optional[LookaheadRule], t: str) -> bool:
    """True if the given lookahead restriction `rule` allows the terminal `t`."""
    if rule is None:
        # No restriction at all: every terminal is allowed.
        return True
    if rule.positive:
        return t in rule.set
    return t not in rule.set
def lookahead_intersect(
        a: typing.Optional[LookaheadRule],
        b: typing.Optional[LookaheadRule]
) -> typing.Optional[LookaheadRule]:
    """Returns a single rule enforcing both `a` and `b`, allowing only terminals that pass both."""
    # None means "no restriction", so the other rule wins unchanged.
    if a is None:
        return b
    if b is None:
        return a
    # Both restrictions present: combine the allowed sets.
    if a.positive and b.positive:
        return LookaheadRule(a.set & b.set, True)
    if a.positive:
        # b excludes its set, so drop those from a's allowed set.
        return LookaheadRule(a.set - b.set, True)
    if b.positive:
        # a excludes its set, so drop those from b's allowed set.
        return LookaheadRule(b.set - a.set, True)
    # Both negative: exclude the union.
    return LookaheadRule(a.set | b.set, False)
class NoLineTerminatorHereClass:
    """Class of the NoLineTerminatorHere singleton defined just below."""

    def __str__(self) -> str:
        return 'NoLineTerminatorHere'


# Singleton element; placed between two symbols it rules out line breaks
# between them (see the module comment above).
NoLineTerminatorHere = NoLineTerminatorHereClass()
# NOTE(review): this comment appears to belong with `Optional`, defined
# above — Optional elements are expanded out before states are calculated,
# so the core of the algorithm never sees them.
@dataclass(frozen=True)
class Exclude:
    """Exclude(nt1, nt2) matches if nt1 matches and nt2 does not."""

    # The element that must match.
    inner: Element
    # Elements that must all fail to match for this to succeed.
    exclusion_list: typing.Tuple[Element, ...]
# End. This is used to represent the terminal which is infinitely produced by
# the lexer when input end is reached.
@dataclass(frozen=True)
class End:
    """End() represents the end of the input content."""
# Special grammar symbol that can be consumed to handle a syntax error.
#
# The error code is passed to an error-handling routine at run time which
# decides if the error is recoverable or not.
@dataclass(frozen=True)
class ErrorSymbol:
    """Special grammar symbol that can be consumed to handle a syntax error."""

    # Identifies this recovery point; handed to the error routine at run time.
    error_code: int
# Element: the type of a single right-hand-side symbol in a production.
# Plain strings name terminals or (in the lenient input format accepted by
# Grammar) nonterminals; see the module comment at the top of the file.
Element = typing.Union[
    str,
    Optional,
    Literal,
    UnicodeCategory,
    Exclude,
    Nt,
    LookaheadRule,
    End,
    ErrorSymbol,
    NoLineTerminatorHereClass,
    CallMethod]
@dataclass
class NtDef:
    """Definition of a nonterminal.

    Instances have three attributes:

    .params - Tuple of strings, the names of the parameters.

    .rhs_list - List of Production objects. Arguments to Nt elements in the
        productions can be Var(s) where `s in params`, indicating that
        parameter should be passed through unchanged.

    .type - The type of runtime value produced by parsing an instance of this
        nonterminal, or None.

    An NtDef is a sort of lambda.

    Some languages have constructs that are allowed or disallowed in
    particular situations. For example, in many languages `return` statements
    are allowed only inside functions or methods. The ECMAScript standard
    (5.1.5 "Grammar Notation") offers this example of the notation it uses to
    specify this sort of thing:

        StatementList [Return] :
            [+Return] ReturnStatement
            ExpressionStatement

    This is an abbreviation for:

        StatementList :
            ExpressionStatement

        StatementList_Return :
            ReturnStatement
            ExpressionStatement

    We offer NtDef.params as a way of representing this in our system.

        "StatementList": NtDef(("Return",), [
            Production(["ReturnStatement"], condition=("Return", True)),
            ["ExpressionStatement"],
        ], None),

    This is an abbreviation for:

        "StatementList_0": [
            ["ExpressionStatement"],
        ],
        "StatementList_1": [
            ["ReturnStatement"],
            ["ExpressionStatement"],
        ],
    """

    __slots__ = ['params', 'rhs_list', 'type']

    # Names of this nonterminal's parameters (empty tuple if unparameterized).
    params: typing.Tuple[str, ...]
    # The productions (right-hand sides) of this nonterminal.
    rhs_list: typing.List[Production]
    # Type of runtime value produced by parsing this nonterminal, or None.
    type: typing.Optional[types.Type]

    def with_rhs_list(self, new_rhs_list: typing.List[Production]) -> NtDef:
        # Return a copy of this NtDef, identical except for rhs_list.
        return dataclasses.replace(self, rhs_list=new_rhs_list)
@dataclass(frozen=True)
class Var:
    """Var(name) represents the run-time value of the parameter with the given name."""

    # Name of the nonterminal parameter being referenced.
    name: str
# A reduce expression computes a production's value. An int refers to one of
# the production's concrete elements by index (the `$0`, `$2` in the module
# comment above); CallMethod and Some wrap nested expressions; None is the
# literal value None.
ReduceExpr = typing.Union[int, CallMethod, None, Some]

# NOTE(review): the extra `str` alternative is presumably the "accept" action
# suggested by the alias name — confirm at use sites.
ReduceExprOrAccept = typing.Union[ReduceExpr, str]

# The Grammar constructor is very lax about the types you pass to it. It can
# accept a `Dict[str, List[List[str]]]`, for example; it copies the data into
# NtDef and Production objects.
#
# The `Lenient` types below are the relaxed types that Grammar() actually
# accepts.
LenientNt = typing.Union[Nt, str, InitNt]
LenientProduction = typing.Union[Production, typing.List[Element]]
LenientNtDef = typing.Union[NtDef, typing.List[LenientProduction]]