Source code

Revision control

Copy as Markdown

Other Tools

"""Early-pipeline operations that error-check and lower grammars."""
from __future__ import annotations
# mypy: disallow-untyped-defs, disallow-incomplete-defs, disallow-untyped-calls
import collections
import dataclasses
from dataclasses import dataclass
import typing
from .grammar import (CallMethod, Element, End, ErrorSymbol, Exclude, Grammar,
LenientNt, Literal, LookaheadRule, Optional,
NoLineTerminatorHere, Nt, NtDef, NtParameter, Production,
ReduceExpr, ReduceExprOrAccept, Some, UnicodeCategory,
Var, is_concrete_element)
from .ordered import OrderedFrozenSet, OrderedSet
from .runtime import ErrorToken, ErrorTokenClass
# *** Checking for errors *****************************************************
T = typing.TypeVar("T")
def fix(f: typing.Callable[[T], T], start: T) -> T:
"""Compute a fixed point of `f`, the hard way, starting from `start`."""
prev, current = start, f(start)
while current != prev:
prev, current = current, f(current)
return current
def empty_nt_set(grammar: Grammar) -> typing.Dict[LenientNt, ReduceExprOrAccept]:
"""Determine which nonterminals in `grammar` can produce the empty string.
Return a dict {nt: expr} that maps each such nonterminal to the expr
that should be evaluated when reducing the empty string to nt.
So, for example, if we have a production
a ::= b? c? => CallMethod("a", [0, 1])
then the resulting dictionary will contain the entry
`("a", CallMethod("a", [None, None]))`.
"""
empties: typing.Dict[LenientNt, ReduceExprOrAccept] = {}
def production_is_empty(p: Production) -> bool:
return all(isinstance(e, LookaheadRule)
or isinstance(e, Optional)
or (isinstance(e, Nt) and e in empties)
or e is NoLineTerminatorHere
for e in p.body)
def evaluate_reducer_with_empty_matches(p: Production) -> ReduceExprOrAccept:
# partial evaluation of p.reducer
stack = [e for e in p.body if is_concrete_element(e)]
Expr = typing.TypeVar("Expr", ReduceExpr, ReduceExprOrAccept)
def eval(expr: Expr) -> Expr:
if expr is None:
return None
elif isinstance(expr, Some):
return Some(eval(expr.inner))
elif isinstance(expr, CallMethod):
return dataclasses.replace(
expr,
args=tuple(eval(arg_expr) for arg_expr in expr.args)
)
elif isinstance(expr, int):
e = stack[expr]
if isinstance(e, Optional):
return None
else:
assert isinstance(e, Nt)
result = empties[e]
assert not isinstance(result, str)
return result
elif expr == 'accept':
# Hmm, this is not ideal! Maybe 'accept' needs to take an
# argument so that the normal case is Accept(0) and this case
# is Accept(eval(expr.args[0])).
return 'accept'
else:
raise TypeError(
"internal error: unhandled reduce expression type {!r}"
.format(expr))
return eval(p.reducer)
done = False
while not done:
done = True
for nt, nt_def in grammar.nonterminals.items():
if nt not in empties:
for p in nt_def.rhs_list:
if production_is_empty(p):
if nt in empties:
raise ValueError(
"ambiguous grammar: multiple productions for "
"{!r} match the empty string"
.format(nt))
done = False
empties[nt] = evaluate_reducer_with_empty_matches(p)
return empties
def check_cycle_free(grammar: Grammar) -> None:
"""Throw an exception if any nonterminal in `grammar` produces itself
via a cycle of 1 or more productions.
"""
empties = empty_nt_set(grammar)
# OK, first find out which nonterminals directly produce which other
# nonterminals (after possibly erasing some optional/empty nts).
direct_produces: typing.Dict[LenientNt, typing.Set[Nt]] = {}
for orig in grammar.nonterminals:
direct_produces[orig] = set()
for source_production in grammar.nonterminals[orig].rhs_list:
for rhs, _r in expand_optional_symbols_in_rhs(source_production.body, grammar, empties):
result: typing.List[Nt] = []
all_possibly_empty_so_far = True
# If we break out of the following loop, that means it turns
# out that this production does not produce *any* strings that
# are just a single nonterminal.
for e in rhs:
if grammar.is_terminal(e):
break # no good, this production contains a terminal
elif isinstance(e, Nt):
if e in empties:
if all_possibly_empty_so_far:
result.append(e)
else:
if not all_possibly_empty_so_far:
# Give up - we have 2+ nonterminals that can't
# be empty.
break
all_possibly_empty_so_far = False
result = [e]
elif isinstance(e, Exclude):
if isinstance(e.inner, Nt):
result.append(e.inner)
elif isinstance(e, LookaheadRule):
# Ignore the restriction. We lose a little precision
# here, and could report a cycle where there isn't one,
# but it's unlikely in real-world grammars.
pass
elif e is NoLineTerminatorHere:
# This doesn't affect the property we're checking.
pass
elif isinstance(e, Literal):
if e.text != "":
# This production contains a non-empty character,
# therefore it cannot correspond to an empty cycle.
break
elif isinstance(e, UnicodeCategory):
# This production consume a class of character,
# therefore it cannot correspond to an empty cycle.
break
elif isinstance(e, End):
# This production consume the End meta-character,
# therefore it cannot correspond to an empty cycle,
# even if this character is expect to be produced
# infinitely once the end is reached.
break
elif isinstance(e, CallMethod):
# This production execute code, but does not consume
# any input.
pass
else:
# Optional is not possible because we called
# expand_optional_symbols_in_rhs. ErrorSymbol
# effectively matches the empty string (though only if
# nothing else matches).
assert isinstance(e, ErrorSymbol)
else:
# If we get here, we didn't break, so our results are good!
# nt can definitely produce all the nonterminals in result.
direct_produces[orig] |= set(result)
def step(
produces: typing.Dict[LenientNt, typing.Set[Nt]]
) -> typing.Dict[LenientNt, typing.Set[Nt]]:
return {
orig: dest | set(b for a in dest for b in produces[a])
for orig, dest in produces.items()
}
produces = fix(step, direct_produces)
for nt in grammar.nonterminals:
if nt in produces[nt]:
raise ValueError(
"invalid grammar: nonterminal {} can produce itself"
.format(nt))
def check_lookahead_rules(grammar: Grammar) -> None:
"""Check that no LookaheadRule appears at the end of a production (or before
elements that can produce the empty string).
If there are any offending lookahead rules, throw a ValueError.
"""
empties = empty_nt_set(grammar)
check_cycle_free(grammar)
for nt in grammar.nonterminals:
for source_production in grammar.nonterminals[nt].rhs_list:
body = source_production.body
for rhs, _r in expand_optional_symbols_in_rhs(body, grammar, empties):
# XXX BUG: The next if-condition is insufficient, since it
# fails to detect a lookahead restriction followed by a
# nonterminal that can match the empty string.
if rhs and isinstance(rhs[-1], LookaheadRule):
raise ValueError(
"invalid grammar: lookahead restriction "
"at end of production: {}"
.format(grammar.production_to_str(nt, body)))
def check_no_line_terminator_here(grammar: Grammar) -> None:
empties = empty_nt_set(grammar)
def check(e: Element, nt: LenientNt, body: typing.List[Element]) -> None:
if grammar.is_terminal(e):
pass
elif isinstance(e, Nt):
if e in empties:
raise ValueError(
"invalid grammar: [no LineTerminator here] cannot appear next to "
"a nonterminal that matches the empty string\n"
"in production: {}".format(grammar.production_to_str(nt, body)))
else:
raise ValueError(
"invalid grammar: [no LineTerminator here] must appear only "
"between terminals and/or nonterminals\n"
"in production: {}".format(grammar.production_to_str(nt, body)))
for nt in grammar.nonterminals:
for production in grammar.nonterminals[nt].rhs_list:
body = production.body
for i, e in enumerate(body):
if e is NoLineTerminatorHere:
if i == 0 or i == len(body) - 1:
raise ValueError(
"invalid grammar: [no LineTerminator here] must be between two symbols\n"
"in production: {}".format(grammar.production_to_str(nt, body)))
check(body[i - 1], nt, body)
check(body[i + 1], nt, body)
def expand_parameterized_nonterminals(grammar: Grammar) -> Grammar:
"""Replace parameterized nonterminals with specialized copies.
For example, a single pair `nt_name: NtDef(params=('A', 'B'), ...)` in
`grammar.nonterminals` will be replaced with (assuming A and B are boolean
parameters) up to four pairs, each having an Nt object as the key and an
NtDef with no parameters as the value.
`grammar.nonterminals` must have string keys.
Returns a new copy of `grammar` with Nt keys, whose NtDefs all have
`nt_def.params == []`.
"""
todo = collections.deque(grammar.goals())
new_nonterminals = {}
def expand(nt: Nt) -> NtDef:
"""Expand grammar.nonterminals[nt](**args).
Returns the expanded NtDef, which contains no conditional
productions or Nt objects.
"""
if nt.args is None:
args_dict = None
else:
args_dict = dict(nt.args)
def evaluate_arg(arg: NtParameter) -> NtParameter:
if isinstance(arg, Var):
return args_dict[arg.name]
else:
return arg
def expand_element(e: Element) -> Element:
if isinstance(e, Optional):
return Optional(expand_element(e.inner))
elif isinstance(e, Exclude):
return Exclude(expand_element(e.inner), tuple(map(expand_element, e.exclusion_list)))
elif isinstance(e, Nt):
args = tuple((name, evaluate_arg(arg))
for name, arg in e.args)
e = Nt(e.name, args)
if e not in new_nonterminals:
todo.append(e)
return e
else:
return e
def expand_production(p: Production) -> Production:
return p.copy_with(
body=[expand_element(e) for e in p.body],
condition=None)
def expand_productions(nt_def: NtDef) -> NtDef:
result = []
for p in nt_def.rhs_list:
if p.condition is None:
included = True
else:
param, value = p.condition
included = (args_dict[param] == value)
if included:
result.append(expand_production(p))
return NtDef((), result, nt_def.type)
nt_def = grammar.nonterminals[nt.name]
assert tuple(name for name, value in nt.args) == nt_def.params
return expand_productions(nt_def)
while todo:
nt = todo.popleft()
if nt not in new_nonterminals: # not already expanded
new_nonterminals[nt] = expand(nt)
# "type: ignore" because this runs afoul of Python's covariance rules for
# Mapping; but it conforms to the intended type.
return grammar.with_nonterminals(new_nonterminals) # type: ignore
# *** Start sets and follow sets **********************************************
EMPTY = "(empty)"
END = None
TerminalOrEmpty = str
TerminalOrEmptyOrErrorToken = typing.Union[str, ErrorTokenClass]
StartSets = typing.Dict[Nt, OrderedFrozenSet[TerminalOrEmptyOrErrorToken]]
def start_sets(grammar: Grammar) -> StartSets:
"""Compute the start sets for nonterminals in a grammar.
A nonterminal's start set is the set of tokens that a match for that
nonterminal may start with, plus EMPTY if it can match the empty string
and ErrorToken if it can start with an error.
"""
# How this works: Note that we can replace the words "match" and "start
# with" in the definition above with more queries about start sets.
#
# 1. A nonterminal's start set contains a terminal `t` if any of its
# productions contains either `t` or a nonterminal with `t` in *its*
# start set, preceded only by zero or more nonterminals that have EMPTY
# in *their* start sets. Plus:
#
# 2. A nonterminal's start set contains EMPTY if any of its productions
# consists entirely of nonterminals that have EMPTY in *their* start
# sets.
#
# This definition is rather circular. We want the smallest collection of
# start sets satisfying these rules, and we get that by iterating to a
# fixed point.
assert all(isinstance(nt, Nt) for nt in grammar.nonterminals)
start: StartSets
start = {typing.cast(Nt, nt): OrderedFrozenSet() for nt in grammar.nonterminals}
done = False
while not done:
done = True
for nt, nt_def in grammar.nonterminals.items():
assert isinstance(nt, Nt)
# Compute start set for each `prod` based on `start` so far.
# Could be incomplete, but we'll ratchet up as we iterate.
nt_start = OrderedFrozenSet(
t for p in nt_def.rhs_list for t in seq_start(grammar, start, p.body))
if nt_start != start[nt]:
start[nt] = nt_start
done = False
return start
def seq_start(
grammar: Grammar,
start: StartSets,
seq: typing.List[Element]
) -> OrderedFrozenSet[TerminalOrEmptyOrErrorToken]:
"""Compute the start set for a sequence of elements."""
s: OrderedSet[TerminalOrEmptyOrErrorToken] = OrderedSet([EMPTY])
for i, e in enumerate(seq):
if EMPTY not in s: # preceding elements never match the empty string
break
s.remove(EMPTY)
if grammar.is_terminal(e):
assert isinstance(e, str)
s.add(e)
elif isinstance(e, ErrorSymbol):
s.add(ErrorToken)
elif isinstance(e, Nt):
s |= start[e]
elif e is NoLineTerminatorHere:
s.add(EMPTY)
else:
assert isinstance(e, LookaheadRule)
future = seq_start(grammar, start, seq[i + 1:])
if e.positive:
future &= e.set
else:
future -= e.set
return OrderedFrozenSet(future)
return OrderedFrozenSet(s)
StartSetCache = typing.List[typing.List[OrderedFrozenSet[TerminalOrEmptyOrErrorToken]]]
def make_start_set_cache(
grammar: Grammar,
prods: typing.List[Prod],
start: StartSets
) -> StartSetCache:
"""Compute start sets for all suffixes of productions in the grammar.
Returns a list of lists `cache` such that
`cache[n][i] == seq_start(grammar, start, prods[n][i:])`.
(The cache is for speed, since seq_start was being called millions of
times.)
"""
def suffix_start_list(
rhs: typing.List[Element]
) -> typing.List[OrderedFrozenSet[TerminalOrEmptyOrErrorToken]]:
sets: typing.List[OrderedFrozenSet[TerminalOrEmptyOrErrorToken]]
sets = [OrderedFrozenSet([EMPTY])]
for e in reversed(rhs):
s: OrderedFrozenSet[TerminalOrEmptyOrErrorToken]
if grammar.is_terminal(e):
assert isinstance(e, str)
s = OrderedFrozenSet([e])
elif isinstance(e, ErrorSymbol):
s = OrderedFrozenSet([ErrorToken])
elif isinstance(e, Nt):
s = start[e]
if EMPTY in s:
s = OrderedFrozenSet((s - {EMPTY}) | sets[-1])
elif e is NoLineTerminatorHere:
s = sets[-1]
else:
assert isinstance(e, LookaheadRule)
if e.positive:
s = OrderedFrozenSet(sets[-1] & e.set)
else:
s = OrderedFrozenSet(sets[-1] - e.set)
assert isinstance(s, OrderedFrozenSet)
assert s == seq_start(grammar, start, rhs[len(rhs) - len(sets):])
sets.append(s)
sets.reverse()
assert sets == [seq_start(grammar, start, rhs[i:])
for i in range(len(rhs) + 1)]
return sets
return [suffix_start_list(prod.rhs) for prod in prods]
FollowSet = OrderedSet[typing.Union[TerminalOrEmptyOrErrorToken, None]]
FollowSets = typing.DefaultDict[Nt, FollowSet]
def follow_sets(
grammar: Grammar,
prods_with_indexes_by_nt: typing.DefaultDict[
LenientNt,
typing.List[typing.Tuple[int, typing.List[Element]]]
],
start_set_cache: StartSetCache
) -> FollowSets:
"""Compute all follow sets for nonterminals in a grammar.
The follow set for a nonterminal `A`, as defined in the book, is "the set
of terminals that can appear immediately to the right of `A` in some
sentential form"; plus, "If `A` can be the rightmost symbol in some
sentential form, then $ is in FOLLOW(A)."
Returns a default-dictionary mapping nts to follow sets.
"""
# Set of nonterminals already seen, including those we are in the middle of
# analyzing. The algorithm starts at `goal` and walks all reachable
# nonterminals, recursively.
visited = set()
# The results. By definition, nonterminals that are not reachable from the
# goal nt have empty follow sets.
follow: FollowSets = collections.defaultdict(OrderedSet)
# If `(x, y) in subsumes_relation`, then x can appear at the end of a
# production of y, and therefore follow[x] should be <= follow[y].
# (We could maintain that invariant throughout, but at present we
# brute-force iterate to a fixed point at the end.)
subsumes_relation: OrderedSet[typing.Tuple[Nt, Nt]]
subsumes_relation = OrderedSet()
# `END` is $. It is, of course, in follow[each goal nonterminal]. It gets
# into other nonterminals' follow sets through the subsumes relation.
for init_nt in grammar.init_nts:
assert isinstance(init_nt, Nt)
follow[init_nt].add(END)
def visit(nt: Nt) -> None:
if nt in visited:
return
visited.add(nt)
for prod_index, rhs in prods_with_indexes_by_nt[nt]:
for i, symbol in enumerate(rhs):
if isinstance(symbol, Nt):
visit(symbol)
after = start_set_cache[prod_index][i + 1]
if EMPTY in after:
after -= {EMPTY}
subsumes_relation.add((symbol, nt))
follow[symbol] |= after
for nt in grammar.init_nts:
assert isinstance(nt, Nt)
visit(nt)
# Now iterate to a fixed point on the subsumes relation.
done = False
while not done:
done = True # optimistically
for target, source in subsumes_relation:
if follow[source] - follow[target]:
follow[target] |= follow[source]
done = False
return follow
# *** Lowering ****************************************************************
# At this point, lowered productions start getting farther from the original
# source. We need to associate them with the original grammar in order to
# produce correct output, so we use Prod values to represent productions.
#
# - `nt` is the name of the nonterminal as it appears in the original
# grammar.
#
# - `index` is the index of the source production, within nt's productions,
# in the original grammar.
#
# - `rhs` is the fully lowered/expanded right-hand-side of the production.
#
# There may be many productions in a grammar that all have the same `nt` and
# `index` because they were all produced from the same source production.
@dataclass
class Prod:
nt: Nt
index: int
rhs: typing.List
reducer: ReduceExprOrAccept
def expand_optional_symbols_in_rhs(
rhs: typing.List[Element],
grammar: Grammar,
empties: typing.Dict[LenientNt, ReduceExprOrAccept],
start_index: int = 0
) -> typing.Iterable[typing.Tuple[typing.List[Element], typing.Dict[int, ReduceExpr]]]:
"""Expand a sequence with optional symbols into sequences that have none.
rhs is a list of symbols, possibly containing optional elements. This
yields every list that can be made by replacing each optional element
either with its .inner value, or with nothing.
Each list is accompanied by the list of the indices of optional elements in
`rhs` that were dropped.
For example, `expand_optional_symbols_in_rhs(["if", Optional("else")])`
yields the two pairs `(["if"], [1])` and `["if", "else"], []`.
"""
replacement: ReduceExpr
for i in range(start_index, len(rhs)):
e = rhs[i]
if isinstance(e, Optional):
if isinstance(e.inner, Nt) and e.inner in empties:
# If this is already possibly-empty in the input grammar, it's an
# error! The grammar is ambiguous.
raise ValueError(
"ambiguous grammar: {} is ambiguous because {} can match "
"the empty string"
.format(grammar.element_to_str(e),
grammar.element_to_str(e.inner)))
replacement = None
break
elif isinstance(e, Nt) and e in empties:
empty_expr = empties[e]
# The replacement can't be 'accept' because that only happens with
# InitNt nonterminals, which are never used in productions.
assert not isinstance(empty_expr, str)
replacement = empty_expr
break
else:
yield rhs[start_index:], {}
return
for expanded, r in expand_optional_symbols_in_rhs(rhs, grammar, empties, i + 1):
e = rhs[i]
rhs_inner = e.inner if isinstance(e, Optional) else e
# without rhs[i]
r2 = r.copy()
r2[i] = replacement
yield rhs[start_index:i] + expanded, r2
# with rhs[i]
yield rhs[start_index:i] + [rhs_inner] + expanded, r
def expand_all_optional_elements(grammar: Grammar) -> typing.Tuple[
Grammar,
typing.List[Prod],
typing.DefaultDict[LenientNt, typing.List[typing.Tuple[int, typing.List[Element]]]]
]:
"""Expand optional elements in the grammar.
We replace each production that contains an optional element with two
productions: one with and one without. Downstream of this step, we can
ignore the possibility of optional elements.
"""
expanded_grammar: typing.Dict[LenientNt, NtDef] = {}
# This was capturing the set of empty production to simplify the work of
# the previous algorithm which was trying to determine the lookahead.
# However, with the LR0Generator this is no longer needed as we are
# generating deliberatly inconsistent parse table states, which are then
# properly fixed by adding lookahead information where needed, and without
# bugs!
# empties = empty_nt_set(grammar)
empties: typing.Dict[LenientNt, ReduceExprOrAccept] = {}
# Put all the productions in one big list, so each one has an index. We
# will use the indices in the action table (as reduce action payloads).
prods: typing.List[Prod] = []
prods_with_indexes_by_nt: \
typing.DefaultDict[LenientNt, typing.List[typing.Tuple[int, typing.List[Element]]]] = \
collections.defaultdict(list)
for nt, nt_def in grammar.nonterminals.items():
assert isinstance(nt, Nt)
prods_expanded = []
for prod_index, p in enumerate(nt_def.rhs_list):
# Aggravatingly, a reduce-expression that's an int is not
# simply an offset into p.body. It only counts "concrete"
# elements. Make a mapping for converting reduce-expressions to
# offsets.
reduce_expr_to_offset = [
i
for i, e in enumerate(p.body)
if is_concrete_element(e)
]
for pair in expand_optional_symbols_in_rhs(p.body, grammar, empties):
expanded_rhs, removals = pair
Expr = typing.TypeVar("Expr", ReduceExpr, ReduceExprOrAccept)
def adjust_reduce_expr(expr: Expr) -> Expr:
if isinstance(expr, int):
i = reduce_expr_to_offset[expr]
if i in removals:
return removals[i]
was_optional = isinstance(p.body[i], Optional)
expr -= sum(1 for r in removals if r < i)
if was_optional:
return Some(expr)
else:
return expr
elif expr is None:
return None
elif isinstance(expr, Some):
return Some(adjust_reduce_expr(expr.inner))
elif isinstance(expr, CallMethod):
return dataclasses.replace(
expr,
args=tuple(adjust_reduce_expr(arg)
for arg in expr.args)
)
elif expr == 'accept':
# doesn't need to be adjusted because 'accept' isn't
# turned into code downstream.
return 'accept'
else:
raise TypeError(
"internal error: unrecognized element {!r}"
.format(expr))
adjusted_reducer = adjust_reduce_expr(p.reducer)
prods_expanded.append(
Production(body=expanded_rhs,
reducer=adjusted_reducer))
prods.append(Prod(nt, prod_index, expanded_rhs,
adjusted_reducer))
prods_with_indexes_by_nt[nt].append(
(len(prods) - 1, expanded_rhs))
expanded_grammar[nt] = nt_def.with_rhs_list(prods_expanded)
return (grammar.with_nonterminals(expanded_grammar),
prods,
prods_with_indexes_by_nt)
class CanonicalGrammar:
__slots__ = ["prods", "prods_with_indexes_by_nt", "grammar"]
prods: typing.List[Prod]
prods_with_indexes_by_nt: typing.Mapping[
LenientNt,
typing.List[typing.Tuple[int, typing.List[Element]]]]
grammar: Grammar
def __init__(self, grammar: Grammar) -> None:
# Step by step, we check the grammar and lower it to a more primitive form.
grammar = expand_parameterized_nonterminals(grammar)
check_cycle_free(grammar)
# check_lookahead_rules(grammar)
check_no_line_terminator_here(grammar)
grammar, prods, prods_with_indexes_by_nt = \
expand_all_optional_elements(grammar)
self.prods = prods
self.prods_with_indexes_by_nt = prods_with_indexes_by_nt
self.grammar = grammar