lr0.py - mozsearch

"""Generate a simple LR0 state graph from a CanonicalGrammar.

The resulting graph may contain inconsistent states, which must be resolved by the

ParseTable before a parser can be generated.

"""

from __future__ import annotations

# mypy: disallow-untyped-defs, disallow-incomplete-defs, disallow-untyped-calls

import collections

from dataclasses import dataclass

import typing

from .actions import (Accept, Action, CheckNotOnNewLine, FunCall, Lookahead,

                      OutputExpr, Unwind, Reduce, Seq)

from .ordered import OrderedFrozenSet

from .grammar import (CallMethod, Element, End, ErrorSymbol, Grammar,

                      LookaheadRule, NoLineTerminatorHere, Nt, ReduceExpr,

                      ReduceExprOrAccept, Some)

from .rewrites import CanonicalGrammar, Prod

from . import types

# ## LR parsers: Why?

# Consider a single production `expr ::= expr "+" term` being parsed in a

# recursive descent parser. As we read the source left to right, our parser's

# internal state looks like this (marking our place with a dot):

#     expr ::= · expr "+" term

#     expr ::= expr · "+" term

#     expr ::= expr "+" · term

#     expr ::= expr "+" term ·

# As we go, we build an AST. First we parse an *expr* and temporarily set it

# aside. Then we expect to see a `+` operator. Then we parse a *term*. Then,

# having got to the end, we create an AST node for the whole addition

# expression.

# Since the grammar is nested, at run time we really have a stack of these

# intermediate states.

# But how do we decide which production we should be matching? Often the first

# token just tells us: the `while` keyword means there's a `while` statement

# coming up. Grammars in which this is always the case are called LL(1). But

# while it's possible to wrangle *most* of the ES grammar into an LL(1) form,

# not everything works out. For example, here's the ES assignment syntax (much

# simplified):

#     assignment ::= sum

#     assignment ::= primitive "=" assignment

#     sum ::= primitive

#     sum ::= sum "+" primitive

#     primitive ::= VAR

# Note that the bogus assignment `a + b = c` doesn't parse because `a + b`

# isn't a primitive.

# Suppose we want to parse an expression, and the first token is `a`. We don't

# know yet which *assignment* production to use. So this grammar is not in

# LL(1).

# ## LR parsers: How

# An LR parser generator allows for a *superposition* of states. While parsing,

# we can sometimes have multiple productions at once that might match. It's

# like how in quantum theory, Schrödinger’s cat can tentatively be both alive

# and dead until decisive information is observed.

# As we read `a = b + c`, our parser's internal state is like this

# (eliding a few steps, like how we recognize that `a` is a primitive):

#     current point in input  superposed parser state

#     ----------------------  -----------------------

#     · a = b + c             assignment ::= · sum

#                             assignment ::= · primitive "=" assignment

#       (Then, after recognizing that `a` is a *primitive*...)

#     a · = b + c             sum ::= primitive ·

#                             assignment ::= primitive · "=" assignment

#       (The next token, `=`, rules out the first alternative,

#       collapsing the waveform...)

#     a = · b + c             assignment ::= primitive "=" · assignment

#       (After recognizing that `b` is a primitive, we again have options:)

#     a = b · + c             sum ::= primitive ·

#                             assignment ::= primitive · "=" assignment

# And so on. We call each dotted production an "LR item", and the superposition

# of several LR items is called a "state".  (It is not meant to be clear yet

# just *how* the parser knows which rules might match.)

# Since the grammar is nested, at run time we'll have a stack of these parser

# state superpositions.

# The uncertainty in LR parsing means that code for an LR parser written by

# hand, in the style of recursive descent, would read like gibberish. What we

# can do instead is generate a parser table.

@dataclass(frozen=True, order=True)

class LRItem:

    """A snapshot of progress through a single specific production.

    *   `prod_index` identifies the production. (Every production in the grammar

        gets a unique index; see the loop that computes

        prods_with_indexes_by_nt.)

    *   `offset` is the position of the cursor within the production.

    `lookahead` and `followed_by` are two totally different kinds of lookahead.

    *   `lookahead` is the LookaheadRule, if any, that applies to the immediately

        upcoming input. It is present only if this LRItem is subject to a

        `[lookahead]` restriction; otherwise it's None. These restrictions can't

        extend beyond the end of a production, or else the grammar is invalid.

        This implements the lookahead restrictions in the ECMAScript grammar.

        It is not part of any account of LR I've seen.

    *   `followed_by` is a completely different kind of lookahead restriction.

        This is the kind of lookahead that is a central part of canonical LR

        table generation.  It applies to the token *after* the whole current

        production, so `followed_by` always applies to completely different and

        later tokens than `lookahead`.  `followed_by` is a set of terminals; if

        `None` is in this set, it means `END`, not that the LRItem is

        unrestricted.

"""

    prod_index: int

    offset: int

    lookahead: typing.Optional[LookaheadRule]

    followed_by: OrderedFrozenSet[typing.Optional[str]]

# A Term is the label on an edge from one state to another. It's normally a

# terminal, nonterminal, or epsilon action. A state can also have a special

# catchall edge, labeled with an ErrorSymbol.

ShiftedTerm = typing.Union[str, Nt, ErrorSymbol]

Term = typing.Union[ShiftedTerm, Action]

def on_stack(grammar: Grammar, term: Element) -> bool:

    """Returns whether an element of a production is consuming stack space or

    not."""

    if isinstance(term, Nt):

        return True

    elif grammar.is_terminal(term):

        return True

    elif isinstance(term, LookaheadRule):

        return False

    elif isinstance(term, ErrorSymbol):

        return True

    elif isinstance(term, End):

        return True

    elif term is NoLineTerminatorHere:

        # No line terminator is a property of the next token being shifted. It

        # is implemented as an action which once shifted past the next term,

        # will check whether the previous term shifted is on a new line.

        return False

    elif isinstance(term, CallMethod):

        return False

    raise ValueError(term)

def callmethods_to_funcalls(

        expr: ReduceExprOrAccept,

        pop: int,

        ret: str,

        depth: int,

        funcalls: typing.List[Action]

) -> OutputExpr:

    """Lower a reduce-expression to the OutputExpr language.

    CallMethod expressions are replaced with FunCalls; all new FunCalls created

    in this way are appended to `funcalls`.

"""

    if isinstance(expr, int):

        stack_index = pop - expr

        if depth == 0:

            call = FunCall("id", (stack_index,), fallible=False,

                           trait=types.Type("AstBuilder"), set_to=ret)

            funcalls.append(call)

            return ret

        else:

            return stack_index

    elif isinstance(expr, Some):

        res = callmethods_to_funcalls(expr.inner, pop, ret, depth, funcalls)

        # "type: ignore" because Some is not generic, unfortunately.

        return Some(res)  # type: ignore

    elif expr is None:

        return None

    elif isinstance(expr, CallMethod):

        def convert_args(args: typing.Iterable[ReduceExpr]) -> typing.Iterator[OutputExpr]:

            for i, arg in enumerate(args):

                yield callmethods_to_funcalls(arg, pop, ret + "_{}".format(i), depth + 1, funcalls)

        args = tuple(convert_args(expr.args))

        call = FunCall(expr.method, args,

                       trait=expr.trait,

                       fallible=expr.fallible,

                       set_to=ret)

        funcalls.append(call)

        return ret

    elif expr == "accept":

        funcalls.append(Accept())

        return ret

    else:

        raise ValueError(expr)

class LR0Generator:

    """Provide a way to iterate over the grammar, given a set of LR items."""

    __slots__ = [

        "grammar",

        "lr_items",

        "key",

        "_hash",

    grammar: CanonicalGrammar

    lr_items: OrderedFrozenSet[LRItem]

    key: str

    _hash: int

    def __init__(

            self,

            grammar: CanonicalGrammar,

            lr_items: typing.Iterable[LRItem] = ()

    ) -> None:

        self.grammar = grammar

        self.lr_items = OrderedFrozenSet(lr_items)

        # This is used to reuse states which have already been encoded.

        self.key = "".join(repr((item.prod_index, item.offset)) + "\n"

                           for item in sorted(self.lr_items))

        self._hash = hash(self.key)

    def __eq__(self, other: object) -> bool:

        return isinstance(other, LR0Generator) and self.key == other.key

    def __hash__(self) -> int:

        return self._hash

    def __str__(self) -> str:

        s = ""

        for lr_item in self.lr_items:

            s += self.grammar.grammar.lr_item_to_str(self.grammar.prods, lr_item)

            s += "\n"

        return s

    def stable_locations(self) -> OrderedFrozenSet[str]:

        locations = []

        for lr_item in self.lr_items:

            locations.append(self.grammar.grammar.lr_item_to_str(self.grammar.prods, lr_item))

        return OrderedFrozenSet(sorted(locations))

    @staticmethod

    def start(grammar: CanonicalGrammar, nt: Nt) -> LR0Generator:

        lr_items: typing.List[LRItem] = []

        # Visit the initial non-terminal, as well as all the non-terminals

        # which are at the left of each productions.

        todo: typing.Deque[Nt] = collections.deque()

        visited_nts = []

        todo.append(nt)

        while todo:

            nt = todo.popleft()

            if nt in visited_nts:

                continue

            visited_nts.append(nt)

            for prod_index, _ in grammar.prods_with_indexes_by_nt[nt]:

                assert isinstance(prod_index, int)

                lr_items.append(LRItem(

                    prod_index=prod_index,

                    offset=0,

                    lookahead=None,

                    followed_by=OrderedFrozenSet(),

))

                prod = grammar.prods[prod_index]

                assert isinstance(prod, Prod)

                try:

                    term = prod.rhs[0]

                    if isinstance(term, Nt):

                        todo.append(term)

                except IndexError:

                    pass

        return LR0Generator(grammar, lr_items)

    def transitions(self) -> typing.Dict[Term, LR0Generator]:

        """Returns the dictionary which maps the state transitions with the next

        LR0Generators. This can be used to generate the states and the

        transitions between the states of an LR0 parse table."""

        followed_by: typing.DefaultDict[Term, typing.List[LRItem]]

        followed_by = collections.defaultdict(list)

        for lr_item in self.lr_items:

            self.item_transitions(lr_item, followed_by)

        return {k: LR0Generator(self.grammar, lr_items)

                for k, lr_items in followed_by.items()}

    def item_transitions(

            self,

            lr_item: LRItem,

            followed_by: typing.DefaultDict[Term, typing.List[LRItem]]

    ) -> None:

        """Given one LRItem, register all the transitions and LR Items reachable

        through these transitions."""

        prod = self.grammar.prods[lr_item.prod_index]

        assert isinstance(prod, Prod)

        # Read the term located at the offset in the production.

        if lr_item.offset < len(prod.rhs):

            term = prod.rhs[lr_item.offset]

            if isinstance(term, Nt):

                pass

            elif self.grammar.grammar.is_terminal(term):

                pass

            elif isinstance(term, LookaheadRule):

                term = Lookahead(term.set, term.positive)

            elif isinstance(term, ErrorSymbol):

                # ErrorSymbol as considered as terminals. These terminals would

                # be produced by the error handling code which produces these

                # error symbols on-demand.

                pass

            elif isinstance(term, End):

                # End is considered as a terminal which is produduced once by

                # the lexer upon reaching the end. However, the parser might

                # finish without consuming the End terminal, if there is no

                # ambiguity on whether the End is expected.

                pass

            elif term is NoLineTerminatorHere:

                # Check whether the following terminal is on a new line. If

                # not, this would produce a syntax error. The argument is the

                # terminal offset.

                term = CheckNotOnNewLine()

            elif isinstance(term, CallMethod):

                funcalls: typing.List[Action] = []

                pop = sum(1 for e in prod.rhs[:lr_item.offset] if on_stack(self.grammar.grammar, e))

                callmethods_to_funcalls(term, pop, "expr", 0, funcalls)

                term = Seq(funcalls)

        elif lr_item.offset == len(prod.rhs):

            # Add the reduce operation as a state transition in the generated

            # parse table. (TODO: this supposed that the canonical form did not

            # move the reduce action to be part of the production)

            pop = sum(1 for e in prod.rhs if on_stack(self.grammar.grammar, e))

            term = Reduce(Unwind(prod.nt, pop))

            expr = prod.reducer

            if expr is not None:

                funcalls = []

                callmethods_to_funcalls(expr, pop, "value", 0, funcalls)

                term = Seq(funcalls + [term])

        else:

            # No edges after the reduce operation.

            return

        # Add terminals, non-terminals and lookahead actions, as transitions to

        # the next LR Item.

        new_transition = term not in followed_by

        followed_by[term].append(LRItem(

            prod_index=lr_item.prod_index,

            offset=lr_item.offset + 1,

            lookahead=None,

            followed_by=OrderedFrozenSet(),

))

        # If the term is a non-terminal, then recursively add transitions from

        # the beginning of all the productions which are matching this

        # non-terminal.

        # Only do it once per non-terminal to avoid infinite recursion on

        # left-recursive grammars.

        if isinstance(term, Nt) and new_transition:

            for prod_index, _ in self.grammar.prods_with_indexes_by_nt[term]:

                assert isinstance(prod_index, int)

                self.item_transitions(LRItem(

                    prod_index=prod_index,

                    offset=0,

                    lookahead=None,

                    followed_by=OrderedFrozenSet(),

                ), followed_by)

Source code

Revision control

Copy as Markdown

Other Tools