Source code

Revision control

Copy as Markdown

Other Tools

"""Generate a simple LR0 state graph from a CanonicalGrammar.
The resulting graph may contain inconsistent states, which must be resolved by the
ParseTable before a parser can be generated.
"""
from __future__ import annotations
# mypy: disallow-untyped-defs, disallow-incomplete-defs, disallow-untyped-calls
import collections
from dataclasses import dataclass
import typing
from .actions import (Accept, Action, CheckNotOnNewLine, FunCall, Lookahead,
OutputExpr, Unwind, Reduce, Seq)
from .ordered import OrderedFrozenSet
from .grammar import (CallMethod, Element, End, ErrorSymbol, Grammar,
LookaheadRule, NoLineTerminatorHere, Nt, ReduceExpr,
ReduceExprOrAccept, Some)
from .rewrites import CanonicalGrammar, Prod
from . import types
# ## LR parsers: Why?
#
# Consider a single production `expr ::= expr "+" term` being parsed in a
# recursive descent parser. As we read the source left to right, our parser's
# internal state looks like this (marking our place with a dot):
#
# expr ::= · expr "+" term
# expr ::= expr · "+" term
# expr ::= expr "+" · term
# expr ::= expr "+" term ·
#
# As we go, we build an AST. First we parse an *expr* and temporarily set it
# aside. Then we expect to see a `+` operator. Then we parse a *term*. Then,
# having got to the end, we create an AST node for the whole addition
# expression.
#
# Since the grammar is nested, at run time we really have a stack of these
# intermediate states.
#
# But how do we decide which production we should be matching? Often the first
# token just tells us: the `while` keyword means there's a `while` statement
# coming up. Grammars in which this is always the case are called LL(1). But
# while it's possible to wrangle *most* of the ES grammar into an LL(1) form,
# not everything works out. For example, here's the ES assignment syntax (much
# simplified):
#
# assignment ::= sum
# assignment ::= primitive "=" assignment
# sum ::= primitive
# sum ::= sum "+" primitive
# primitive ::= VAR
#
# Note that the bogus assignment `a + b = c` doesn't parse because `a + b`
# isn't a primitive.
#
# Suppose we want to parse an expression, and the first token is `a`. We don't
# know yet which *assignment* production to use. So this grammar is not in
# LL(1).
#
#
# ## LR parsers: How
#
# An LR parser generator allows for a *superposition* of states. While parsing,
# we can sometimes have multiple productions at once that might match. It's
# like how in quantum theory, Schrödinger’s cat can tentatively be both alive
# and dead until decisive information is observed.
#
# As we read `a = b + c`, our parser's internal state is like this
# (eliding a few steps, like how we recognize that `a` is a primitive):
#
# current point in input superposed parser state
# ---------------------- -----------------------
# · a = b + c assignment ::= · sum
# assignment ::= · primitive "=" assignment
#
# (Then, after recognizing that `a` is a *primitive*...)
#
# a · = b + c sum ::= primitive ·
# assignment ::= primitive · "=" assignment
#
# (The next token, `=`, rules out the first alternative,
# collapsing the waveform...)
#
# a = · b + c assignment ::= primitive "=" · assignment
#
# (After recognizing that `b` is a primitive, we again have options:)
#
# a = b · + c sum ::= primitive ·
# assignment ::= primitive · "=" assignment
#
# And so on. We call each dotted production an "LR item", and the superposition
# of several LR items is called a "state". (It is not meant to be clear yet
# just *how* the parser knows which rules might match.)
#
# Since the grammar is nested, at run time we'll have a stack of these parser
# state superpositions.
#
# The uncertainty in LR parsing means that code for an LR parser written by
# hand, in the style of recursive descent, would read like gibberish. What we
# can do instead is generate a parser table.
@dataclass(frozen=True, order=True)
class LRItem:
"""A snapshot of progress through a single specific production.
* `prod_index` identifies the production. (Every production in the grammar
gets a unique index; see the loop that computes
prods_with_indexes_by_nt.)
* `offset` is the position of the cursor within the production.
`lookahead` and `followed_by` are two totally different kinds of lookahead.
* `lookahead` is the LookaheadRule, if any, that applies to the immediately
upcoming input. It is present only if this LRItem is subject to a
`[lookahead]` restriction; otherwise it's None. These restrictions can't
extend beyond the end of a production, or else the grammar is invalid.
This implements the lookahead restrictions in the ECMAScript grammar.
It is not part of any account of LR I've seen.
* `followed_by` is a completely different kind of lookahead restriction.
This is the kind of lookahead that is a central part of canonical LR
table generation. It applies to the token *after* the whole current
production, so `followed_by` always applies to completely different and
later tokens than `lookahead`. `followed_by` is a set of terminals; if
`None` is in this set, it means `END`, not that the LRItem is
unrestricted.
"""
prod_index: int
offset: int
lookahead: typing.Optional[LookaheadRule]
followed_by: OrderedFrozenSet[typing.Optional[str]]
# A Term is the label on an edge from one state to another. It's normally a
# terminal, nonterminal, or epsilon action. A state can also have a special
# catchall edge, labeled with an ErrorSymbol.
ShiftedTerm = typing.Union[str, Nt, ErrorSymbol]
Term = typing.Union[ShiftedTerm, Action]
def on_stack(grammar: Grammar, term: Element) -> bool:
"""Returns whether an element of a production is consuming stack space or
not."""
if isinstance(term, Nt):
return True
elif grammar.is_terminal(term):
return True
elif isinstance(term, LookaheadRule):
return False
elif isinstance(term, ErrorSymbol):
return True
elif isinstance(term, End):
return True
elif term is NoLineTerminatorHere:
# No line terminator is a property of the next token being shifted. It
# is implemented as an action which once shifted past the next term,
# will check whether the previous term shifted is on a new line.
return False
elif isinstance(term, CallMethod):
return False
raise ValueError(term)
def callmethods_to_funcalls(
expr: ReduceExprOrAccept,
pop: int,
ret: str,
depth: int,
funcalls: typing.List[Action]
) -> OutputExpr:
"""Lower a reduce-expression to the OutputExpr language.
CallMethod expressions are replaced with FunCalls; all new FunCalls created
in this way are appended to `funcalls`.
"""
if isinstance(expr, int):
stack_index = pop - expr
if depth == 0:
call = FunCall("id", (stack_index,), fallible=False,
trait=types.Type("AstBuilder"), set_to=ret)
funcalls.append(call)
return ret
else:
return stack_index
elif isinstance(expr, Some):
res = callmethods_to_funcalls(expr.inner, pop, ret, depth, funcalls)
# "type: ignore" because Some is not generic, unfortunately.
return Some(res) # type: ignore
elif expr is None:
return None
elif isinstance(expr, CallMethod):
def convert_args(args: typing.Iterable[ReduceExpr]) -> typing.Iterator[OutputExpr]:
for i, arg in enumerate(args):
yield callmethods_to_funcalls(arg, pop, ret + "_{}".format(i), depth + 1, funcalls)
args = tuple(convert_args(expr.args))
call = FunCall(expr.method, args,
trait=expr.trait,
fallible=expr.fallible,
set_to=ret)
funcalls.append(call)
return ret
elif expr == "accept":
funcalls.append(Accept())
return ret
else:
raise ValueError(expr)
class LR0Generator:
"""Provide a way to iterate over the grammar, given a set of LR items."""
__slots__ = [
"grammar",
"lr_items",
"key",
"_hash",
]
grammar: CanonicalGrammar
lr_items: OrderedFrozenSet[LRItem]
key: str
_hash: int
def __init__(
self,
grammar: CanonicalGrammar,
lr_items: typing.Iterable[LRItem] = ()
) -> None:
self.grammar = grammar
self.lr_items = OrderedFrozenSet(lr_items)
# This is used to reuse states which have already been encoded.
self.key = "".join(repr((item.prod_index, item.offset)) + "\n"
for item in sorted(self.lr_items))
self._hash = hash(self.key)
def __eq__(self, other: object) -> bool:
return isinstance(other, LR0Generator) and self.key == other.key
def __hash__(self) -> int:
return self._hash
def __str__(self) -> str:
s = ""
for lr_item in self.lr_items:
s += self.grammar.grammar.lr_item_to_str(self.grammar.prods, lr_item)
s += "\n"
return s
def stable_locations(self) -> OrderedFrozenSet[str]:
locations = []
for lr_item in self.lr_items:
locations.append(self.grammar.grammar.lr_item_to_str(self.grammar.prods, lr_item))
return OrderedFrozenSet(sorted(locations))
@staticmethod
def start(grammar: CanonicalGrammar, nt: Nt) -> LR0Generator:
lr_items: typing.List[LRItem] = []
# Visit the initial non-terminal, as well as all the non-terminals
# which are at the left of each productions.
todo: typing.Deque[Nt] = collections.deque()
visited_nts = []
todo.append(nt)
while todo:
nt = todo.popleft()
if nt in visited_nts:
continue
visited_nts.append(nt)
for prod_index, _ in grammar.prods_with_indexes_by_nt[nt]:
assert isinstance(prod_index, int)
lr_items.append(LRItem(
prod_index=prod_index,
offset=0,
lookahead=None,
followed_by=OrderedFrozenSet(),
))
prod = grammar.prods[prod_index]
assert isinstance(prod, Prod)
try:
term = prod.rhs[0]
if isinstance(term, Nt):
todo.append(term)
except IndexError:
pass
return LR0Generator(grammar, lr_items)
def transitions(self) -> typing.Dict[Term, LR0Generator]:
"""Returns the dictionary which maps the state transitions with the next
LR0Generators. This can be used to generate the states and the
transitions between the states of an LR0 parse table."""
followed_by: typing.DefaultDict[Term, typing.List[LRItem]]
followed_by = collections.defaultdict(list)
for lr_item in self.lr_items:
self.item_transitions(lr_item, followed_by)
return {k: LR0Generator(self.grammar, lr_items)
for k, lr_items in followed_by.items()}
def item_transitions(
self,
lr_item: LRItem,
followed_by: typing.DefaultDict[Term, typing.List[LRItem]]
) -> None:
"""Given one LRItem, register all the transitions and LR Items reachable
through these transitions."""
prod = self.grammar.prods[lr_item.prod_index]
assert isinstance(prod, Prod)
# Read the term located at the offset in the production.
if lr_item.offset < len(prod.rhs):
term = prod.rhs[lr_item.offset]
if isinstance(term, Nt):
pass
elif self.grammar.grammar.is_terminal(term):
pass
elif isinstance(term, LookaheadRule):
term = Lookahead(term.set, term.positive)
elif isinstance(term, ErrorSymbol):
# ErrorSymbol as considered as terminals. These terminals would
# be produced by the error handling code which produces these
# error symbols on-demand.
pass
elif isinstance(term, End):
# End is considered as a terminal which is produduced once by
# the lexer upon reaching the end. However, the parser might
# finish without consuming the End terminal, if there is no
# ambiguity on whether the End is expected.
pass
elif term is NoLineTerminatorHere:
# Check whether the following terminal is on a new line. If
# not, this would produce a syntax error. The argument is the
# terminal offset.
term = CheckNotOnNewLine()
elif isinstance(term, CallMethod):
funcalls: typing.List[Action] = []
pop = sum(1 for e in prod.rhs[:lr_item.offset] if on_stack(self.grammar.grammar, e))
callmethods_to_funcalls(term, pop, "expr", 0, funcalls)
term = Seq(funcalls)
elif lr_item.offset == len(prod.rhs):
# Add the reduce operation as a state transition in the generated
# parse table. (TODO: this supposed that the canonical form did not
# move the reduce action to be part of the production)
pop = sum(1 for e in prod.rhs if on_stack(self.grammar.grammar, e))
term = Reduce(Unwind(prod.nt, pop))
expr = prod.reducer
if expr is not None:
funcalls = []
callmethods_to_funcalls(expr, pop, "value", 0, funcalls)
term = Seq(funcalls + [term])
else:
# No edges after the reduce operation.
return
# Add terminals, non-terminals and lookahead actions, as transitions to
# the next LR Item.
new_transition = term not in followed_by
followed_by[term].append(LRItem(
prod_index=lr_item.prod_index,
offset=lr_item.offset + 1,
lookahead=None,
followed_by=OrderedFrozenSet(),
))
# If the term is a non-terminal, then recursively add transitions from
# the beginning of all the productions which are matching this
# non-terminal.
#
# Only do it once per non-terminal to avoid infinite recursion on
# left-recursive grammars.
if isinstance(term, Nt) and new_transition:
for prod_index, _ in self.grammar.prods_with_indexes_by_nt[term]:
assert isinstance(prod_index, int)
self.item_transitions(LRItem(
prod_index=prod_index,
offset=0,
lookahead=None,
followed_by=OrderedFrozenSet(),
), followed_by)