# (Removed page chrome accidentally captured with this file: "Source code",
# "Revision control", "Copy as Markdown", "Other Tools" — viewer UI text,
# not part of the module.)
""" Data structures for representing grammars. """
from __future__ import annotations
# mypy: disallow-untyped-defs, disallow-incomplete-defs, disallow-untyped-calls
import copy
import typing
import dataclasses
from dataclasses import dataclass
from .ordered import OrderedSet, OrderedFrozenSet
from . import types
# *** What is a grammar? ******************************************************
#
# A grammar is a dictionary mapping nonterminal names to lists of right-hand
# sides. Each right-hand side (also called a "production") is a list whose
# elements can include terminals, nonterminals, Optional elements, LookaheadRules,
# and NoLineTerminatorHere.
#
# The most common elements are terminals and nonterminals, so a grammar usually
# looks something like this:
def example_grammar() -> Grammar:
    """Build and return a tiny arithmetic-expression grammar.

    Demonstrates the "list-of-lists-of-strings" input form that
    Grammar.__init__ accepts and normalizes.
    """
    productions: typing.Dict[typing.Union[str, InitNt, Nt], LenientNtDef] = {
        'expr': [
            ['term'],
            ['expr', '+', 'term'],
            ['expr', '-', 'term'],
        ],
        'term': [
            ['unary'],
            ['term', '*', 'unary'],
            ['term', '/', 'unary'],
        ],
        'unary': [
            ['prim'],
            ['-', 'unary'],
        ],
        'prim': [
            ['NUM'],
            ['VAR'],
            ['(', 'expr', ')'],
        ],
    }
    # `goal_nts` names the nonterminals we actually want to parse; everything
    # else exists only as a building block for them. `variable_terminals` are
    # token classes that carry data (any number, any identifier) rather than
    # a single fixed spelling.
    return Grammar(productions, goal_nts=['expr'], variable_terminals=['NUM', 'VAR'])
# A (parameter_name, value) pair guarding a Production: the production is
# live only in grammar variants where the named parameter has that value.
Condition = typing.Tuple[str, bool]
# A production consists of a left side, an optional condition, a right side,
# and a reducer. A `Production` object includes everything except the left
# side. Incorporating reducers lets us transform a grammar while preserving
# behavior.
#
# The production `expr ::= term` is represented by
# `Production(["term"], 0)`.
#
# The production `expr ::= expr + term => add($0, $2)` is represented by
# `Production(["expr", "+", "term"], CallMethod("add", (0, 2)))`.
#
@dataclass
class Production:
    """One right-hand side of a grammar rule, together with its reducer.

    The left side (the nonterminal being defined) is *not* stored here; a
    Production lives in the rhs_list of the NtDef for its left side.
    """

    __slots__ = ['body', 'reducer', 'condition']

    # The sequence of Elements (terminals, nonterminals, lookahead rules,
    # etc.) this production matches.
    body: typing.List[Element]
    # Reduce expression evaluated when the production matches, or the special
    # string 'accept' (init nonterminals only).
    reducer: ReduceExprOrAccept
    # Optional (parameter, value) guard; None means unconditional.
    condition: typing.Optional[Condition]

    # Hand-written __init__ so that `condition` is keyword-only; @dataclass
    # keeps a user-defined __init__ instead of generating one.
    def __init__(self,
                 body: typing.List[Element],
                 reducer: ReduceExprOrAccept,
                 *,
                 condition: typing.Optional[Condition] = None):
        self.body = body
        self.reducer = reducer
        self.condition = condition

    def __repr__(self) -> str:
        # Omit `condition` from the repr when it is None, for brevity.
        if self.condition is None:
            return "Production({!r}, reducer={!r})".format(self.body, self.reducer)
        else:
            return ("Production({!r}, reducer={!r}, condition={!r})"
                    .format(self.body, self.reducer, self.condition))

    def copy_with(self, **kwargs: typing.Any) -> Production:
        """Return a copy of self with the given fields replaced.

        dataclasses.replace() re-invokes __init__ with the updated field
        values, so the keyword-only `condition` is handled correctly.
        """
        return dataclasses.replace(self, **kwargs)
# *** Reduce expressions ******************************************************
#
# Reduce expressions ("reducers" for short) are a little language used to
# specify what happens when a production is matched. A reduce expression is
# one of these:
#
# * An integer evaluates to one of the syntactic components of the
# production. For example, if the production we're reducing is
# `sum ::= sum + term`, the integer `0` evaluates the `sum` to the left of
# the plus sign, and `2` means the `term` on the right. (The integer
# `1` would refer to the `+` token itself, but that's not super useful.)
#
# These integers are not quite indexes into `production.body`, because
# LookaheadRule, ErrorSymbol, and NoLineTerminatorHere elements don't
# count: in the production `stmt ::= [lookahead != "let"] expr ";"`, `0` is
# the expr, and `1` is the semicolon token. See `is_concrete_element(e)`.
#
# * CallMethod objects pass values to a builder method and return the result.
# The `args` are nested reduce expressions.
#
# * None is an expression used as a placeholder when an optional symbol is
# omitted.
#
# * Some(expr) is used when an optional symbol is found and parsed.
# In Python, this just expands to the same thing as `expr`, but in Rust
# this expands to a use of `Option::Some()`.
#
# In addition, the special reducer 'accept' means stop parsing. This is
# used only in productions for init nonterminals, created automatically by
# Grammar.__init__(). It's not a reduce expression, so it can't be nested.
@dataclass(frozen=True)
class CallMethod:
    """Express a method call, and give it a given set of arguments. A trait is
    added as the parser should implement this trait to call this method."""

    # Name of the builder method to call.
    method: str
    # Nested reduce expressions supplying the arguments.
    args: typing.Tuple[ReduceExpr, ...]
    # Trait the parser must implement to provide `method`.
    trait: types.Type = types.Type("AstBuilder")
    # True if the method can fail (rendered with a trailing '?' in Rust).
    fallible: bool = False
@dataclass(frozen=True)
class Some:
    """Reduce expression wrapping the value of an optional symbol that was
    actually present (Rust `Option::Some`; a no-op in Python)."""

    # The wrapped reduce expression.
    inner: ReduceExpr
def expr_to_str(expr: ReduceExprOrAccept) -> str:
    """Format a reduce expression (or 'accept') as a human-readable string."""
    if isinstance(expr, int):
        return "${}".format(expr)
    if isinstance(expr, CallMethod):
        rendered_args = ', '.join(expr_to_str(arg) for arg in expr.args)
        suffix = '?' if expr.fallible else ''
        return "{}::{}({}){}".format(expr.trait, expr.method, rendered_args, suffix)
    if expr is None:
        return "None"
    if isinstance(expr, Some):
        return "Some({})".format(expr_to_str(expr.inner))
    if expr == "accept":
        return "<accept>"
    raise ValueError("unrecognized expression: {!r}".format(expr))
# Type parameter for Grammar.intern().
Internable = typing.TypeVar("Internable")

# Maps a synthetic terminal's name to the set of real terminals it stands for.
SyntheticTerminalsDict = typing.Dict[str, OrderedFrozenSet[str]]
class Grammar:
    """A collection of productions.

    * self.variable_terminals - OrderedFrozenSet[str] - Terminals that carry
      data, like (in JS) numeric literals and RegExps.

    * self.terminals - OrderedFrozenSet[str] - All terminals used in the
      language, including those in self.variable_terminals and
      self.synthetic_terminals.

    * self.synthetic_terminals - {str: OrderedFrozenSet[str]} - Maps terminals
      to sets of terminals. An entry `name: set` in this dictionary means
      that `name` is shorthand for "one of the terminals in `set`".

    * self.nonterminals - {LenientNt: NtDef} - Keys are either (str|InitNt), early
      in the pipeline, or Nt objects later on. Values are the NtDef objects
      that contain the actual Productions.

    * self.methods - {str: MethodType} - Type information for methods.

    * self.init_nts - [InitNt or Nt] - The list of all elements of
      self.nonterminals.keys() that are init nonterminals.

    * self.exec_modes - DefaultDict{str, OrderedSet[Type]} or None - Per-mode
      trait sets for generated backends (only stored and consumed by patch();
      semantics otherwise not visible here).

    * self.type_to_modes - {Type: [str]} or None - Reverse mapping from a
      trait's implementing type to the execution modes using it (used by
      patch(); semantics otherwise not visible here).

    * self._cache - {Any: Any} - Cache of immutable objects used by
      Grammar.intern().
    """

    variable_terminals: OrderedFrozenSet[str]
    terminals: OrderedFrozenSet[typing.Union[str, End]]
    synthetic_terminals: SyntheticTerminalsDict
    nonterminals: typing.Dict[LenientNt, NtDef]
    methods: typing.Dict[str, types.MethodType]
    init_nts: typing.List[typing.Union[Nt, InitNt]]
    exec_modes: typing.Optional[typing.DefaultDict[str, OrderedSet[types.Type]]]
    type_to_modes: typing.Optional[typing.Mapping[types.Type, typing.List[str]]]
    _cache: typing.Dict[typing.Any, typing.Any]
def __init__(
self,
nonterminals: typing.Mapping[LenientNt, LenientNtDef],
*,
goal_nts: typing.Optional[typing.Iterable[LenientNt]] = None,
variable_terminals: typing.Iterable[str] = (),
synthetic_terminals: SyntheticTerminalsDict = None,
method_types: typing.Optional[typing.Dict[str, types.MethodType]] = None,
exec_modes: typing.Optional[typing.DefaultDict[str, OrderedSet[types.Type]]] = None,
type_to_modes: typing.Optional[typing.Mapping[types.Type, typing.List[str]]] = None):
# This constructor supports passing in a sort of jumbled blob of
# strings, lists, and actual objects, and normalizes it all to a more
# typeful structure. Being able to interpret simple
# "list-of-lists-of-strings" input is super useful for tests.
#
# We don't check here that the grammar is LR, that it's cycle-free, or
# any other nice properties.
# Copy/infer the arguments.
my_nonterminals: typing.Dict[LenientNt, LenientNtDef] = dict(nonterminals.items())
if goal_nts is None:
# Default to the first nonterminal in the dictionary.
my_goal_nts = []
for name in my_nonterminals:
my_goal_nts.append(name)
break
else:
my_goal_nts = list(goal_nts)
self.variable_terminals = OrderedFrozenSet(variable_terminals)
if synthetic_terminals is None:
synthetic_terminals = {}
else:
synthetic_terminals = {
name: OrderedFrozenSet(set)
for name, set in synthetic_terminals.items()
}
for synthetic_key, values in synthetic_terminals.items():
if synthetic_key in my_nonterminals:
raise ValueError(
"invalid grammar: {!r} is both a terminal and a nonterminal"
.format(synthetic_key))
for t in values:
if t in my_nonterminals:
raise ValueError(
"invalid grammar: {!r} is both ".format(t)
+ "a representation of a synthetic terminal and a nonterminal")
if t in synthetic_terminals:
# Nested synthetic terminals. Throw, for now. (This
# should pretty much just work, except expand_terminal
# doesn't support it; and we don't check for cycles
# where a synthetic terminal includes itself directly
# or indirectly.)
raise ValueError(
"unsupported: synthetic terminals can't include other "
"synthetic terminals; {!r} includes {!r}"
.format(synthetic_key, t))
# self.synthetic_terminals = synthetic_terminals
self.synthetic_terminals = {}
keys_are_nt = isinstance(next(iter(my_nonterminals)), Nt)
key_type: typing.Union[typing.Type, typing.Tuple[typing.Type, ...]]
key_type = Nt if keys_are_nt else (str, InitNt)
self._cache = {}
# Gather some information just by looking at keys (without examining
# every production).
#
# str_to_nt maps the name of each non-parameterized
# nonterminal to `Nt(name)`, a cache.
str_to_nt: typing.Dict[typing.Union[str, InitNt], Nt] = {}
# nt_params lists the names of each nonterminal's parameters (empty
# tuple for non-parameterized nts).
nt_params: typing.Dict[typing.Union[str, InitNt], typing.Tuple[str, ...]] = {}
for key in my_nonterminals:
if not isinstance(key, key_type):
raise ValueError(
"invalid grammar: conflicting key types in nonterminals dict - "
"expected either all str or all Nt, got {!r}"
.format(key.__class__.__name__))
nt_name: typing.Union[str, InitNt]
param_names: typing.Tuple[str, ...]
if keys_are_nt:
assert isinstance(key, Nt)
nt_name = key.name
param_names = tuple(name for name, value in key.args)
else:
assert isinstance(key, (str, InitNt))
nt_name = key
param_names = ()
my_nt = my_nonterminals[key]
if isinstance(my_nt, NtDef):
param_names = tuple(my_nt.params)
if nt_name not in nt_params:
nt_params[nt_name] = param_names
else:
if nt_params[nt_name] != param_names:
raise ValueError(
"conflicting parameter name lists for nt {!r}: "
"both {!r} and {!r}"
.format(nt_name, nt_params[nt_name], param_names))
if param_names == () and nt_name not in str_to_nt:
str_to_nt[nt_name] = self.intern(Nt(nt_name))
# Validate, desugar, and copy the grammar. As a side effect, calling
# validate_element on every element of the grammar populates
# all_terminals.
all_terminals: OrderedSet[typing.Union[str, End]] = OrderedSet(self.variable_terminals)
all_terminals.add(End())
def note_terminal(t: str) -> None:
"""Add t (and all representations of it, if synthetic) to all_terminals."""
if t not in all_terminals:
all_terminals.add(t)
if t in self.synthetic_terminals:
for k in self.synthetic_terminals[t]:
note_terminal(k)
# Note: i and j are normally list indexes, but they are sometimes the
# special string '?'. It's OK because they are used only in error
# messages.
def validate_element(
nt: LenientNt,
i: typing.Union[int, str],
j: typing.Union[int, str],
e: Element,
context_params: typing.Tuple[str, ...]
) -> Element:
if isinstance(e, str):
if e in nt_params:
if nt_params[e] != ():
raise ValueError(
"invalid grammar: missing parameters for {!r} "
"in production `grammar[{!r}][{}][{}].inner`: {!r}"
.format(e, nt, i, j, nt_params[e]))
return str_to_nt[e]
else:
note_terminal(e)
return e
elif isinstance(e, Optional):
if not isinstance(e.inner, (str, Nt)):
raise TypeError(
"invalid grammar: unrecognized element "
"in production `grammar[{!r}][{}][{}].inner`: {!r}"
.format(nt, i, j, e.inner))
inner = validate_element(nt, i, j, e.inner, context_params)
return self.intern(Optional(inner))
elif isinstance(e, Literal):
if not isinstance(e.text, str):
raise TypeError(
"invalid grammar: unrecognized element "
"in production `grammar[{!r}][{}][{}].text`: {!r}"
.format(nt, i, j, e.text))
return self.intern(e)
elif isinstance(e, UnicodeCategory):
if not isinstance(e.cat_prefix, str):
raise TypeError(
"invalid grammar: unrecognized element "
"in production `grammar[{!r}][{}][{}].cat_prefix`: {!r}"
.format(nt, i, j, e.cat_prefix))
return self.intern(e)
elif isinstance(e, Exclude):
if not isinstance(e.inner, (str, Nt)):
raise TypeError(
"invalid grammar: unrecognized element "
"in production `grammar[{!r}][{}][{}].inner`: {!r}"
.format(nt, i, j, e.inner))
exclusion_list = []
for value in e.exclusion_list:
if not isinstance(value, (str, Nt)):
raise TypeError(
"invalid grammar: unrecognized element "
"in production `grammar[{!r}][{}][{}].exclusion_list`: {!r}"
.format(nt, i, j, value))
value = validate_element(nt, i, j, value, context_params)
exclusion_list.append(value)
inner = validate_element(nt, i, j, e.inner, context_params)
return self.intern(Exclude(inner, tuple(exclusion_list)))
elif isinstance(e, Nt):
# Either the application or the original parameterized
# production must be present in the dictionary.
if e not in my_nonterminals and e.name not in my_nonterminals:
raise ValueError(
"invalid grammar: unrecognized nonterminal "
"in production `grammar[{!r}][{}][{}]`: {!r}"
.format(nt, i, j, e.name))
args = tuple(pair[0] for pair in e.args)
if e.name in nt_params and args != nt_params[e.name]:
raise ValueError(
"invalid grammar: wrong arguments passed to {!r} "
"in production `grammar[{!r}][{}][{}]`: "
"passed {!r}, expected {!r}"
.format(e.name, nt, i, j,
args, nt_params[e.name]))
for param_name, arg_expr in e.args:
if isinstance(arg_expr, Var):
if arg_expr.name not in context_params:
raise ValueError(
"invalid grammar: undefined variable {!r} "
"in production `grammar[{!r}][{}][{}]`"
.format(arg_expr.name, nt, i, j))
return self.intern(e)
elif isinstance(e, (LookaheadRule, End, ErrorSymbol)):
return self.intern(e)
elif e is NoLineTerminatorHere:
return e
elif isinstance(e, CallMethod):
return self.intern(e)
else:
raise TypeError(
"invalid grammar: unrecognized element in production "
"`grammar[{!r}][{}][{}]`: {!r}"
.format(nt, i, j, e))
assert False, "unreachable"
def check_reduce_expr(
nt: LenientNt,
i: int,
rhs: Production,
expr: ReduceExprOrAccept) -> None:
if isinstance(expr, int):
concrete_len = sum(1 for e in rhs.body
if is_concrete_element(e))
if not (0 <= expr < concrete_len):
raise ValueError(
"invalid grammar: element number {} out of range for "
"production {!r} in grammar[{!r}][{}].reducer ({!r})"
.format(expr, nt, rhs.body, i, rhs.reducer))
elif isinstance(expr, CallMethod):
if not isinstance(expr.method, str):
raise TypeError(
"invalid grammar: method names must be strings, "
"not {!r}, in grammar[{!r}[{}].reducer"
.format(expr.method, nt, i))
if not expr.method.isidentifier():
name, space, pn = expr.method.partition(' ')
if space == ' ' and name.isidentifier() and pn.isdigit():
pass
else:
raise ValueError(
"invalid grammar: invalid method name {!r} "
"(not an identifier), in grammar[{!r}[{}].reducer"
.format(expr.method, nt, i))
for arg_expr in expr.args:
check_reduce_expr(nt, i, rhs, arg_expr)
elif expr is None:
pass
elif isinstance(expr, Some):
check_reduce_expr(nt, i, rhs, expr.inner)
else:
raise TypeError(
"invalid grammar: unrecognized reduce expression {!r} "
"in grammar[{!r}][{}].reducer"
.format(expr, nt, i))
def copy_rhs(
nt: LenientNt,
i: int,
sole_production: bool,
rhs: LenientProduction,
context_params: typing.Tuple[str, ...]) -> Production:
if isinstance(rhs, list):
# Bare list, no reducer. Desugar to a Production, inferring a
# reasonable default reducer.
nargs = sum(1 for e in rhs if is_concrete_element(e))
reducer: ReduceExpr
if len(rhs) == 1 and nargs == 1:
reducer = 0 # don't call a method, just propagate the value
else:
# Call a method named after the production. If the
# nonterminal has exactly one production, there's no need
# to include the production index `i` to the method name.
if sole_production:
method = str(nt)
else:
method = '{}_{}'.format(nt, i)
reducer = CallMethod(method, tuple(range(nargs)))
rhs = Production(rhs, reducer)
if not isinstance(rhs, Production):
raise TypeError(
"invalid grammar: grammar[{!r}][{}] should be "
"a Production or list of grammar symbols, not {!r}"
.format(nt, i, rhs))
if rhs.condition is not None:
param, value = rhs.condition
if param not in context_params:
raise TypeError(
"invalid grammar: undefined parameter {!r} "
"in conditional for grammar[{!r}][{}]"
.format(param, nt, i))
if rhs.reducer != 'accept':
check_reduce_expr(nt, i, rhs, rhs.reducer)
assert isinstance(rhs.body, list)
return rhs.copy_with(body=[
validate_element(nt, i, j, e, context_params)
for j, e in enumerate(rhs.body)
])
def copy_nt_def(
nt: LenientNt,
nt_def: typing.Union[NtDef, typing.List[LenientProduction]],
) -> NtDef:
rhs_list: typing.Sequence[LenientProduction]
if isinstance(nt_def, NtDef):
for i, param in enumerate(nt_def.params):
if not isinstance(param, str):
raise TypeError(
"invalid grammar: parameter {} of {} should be "
"a string, not {!r}"
.format(i + 1, nt, param))
params = nt_def.params
rhs_list = nt_def.rhs_list
ty = nt_def.type
else:
params = ()
rhs_list = nt_def
ty = None
if not isinstance(rhs_list, list):
raise TypeError(
"invalid grammar: grammar[{!r}] should be either a "
"list of right-hand sides or NtDef, not {!r}"
.format(nt, type(rhs_list).__name__))
sole_production = len(rhs_list) == 1
productions = [copy_rhs(nt, i, sole_production, rhs, params)
for i, rhs in enumerate(rhs_list)]
return NtDef(params, productions, ty)
def check_nt_key(nt: LenientNt) -> None:
if isinstance(nt, str):
if not nt.isidentifier():
raise ValueError(
"invalid grammar: nonterminal names must be identifiers, not {!r}"
.format(nt))
if nt in self.variable_terminals or nt in self.synthetic_terminals:
raise TypeError(
"invalid grammar: {!r} is both a nonterminal and a variable terminal"
.format(nt))
elif isinstance(nt, Nt):
assert keys_are_nt # checked earlier
if not (isinstance(nt.name, (str, InitNt))
and isinstance(nt.args, tuple)):
raise TypeError(
"invalid grammar: expected str or Nt(name=str, "
"args=tuple) keys in nonterminals dict, got {!r}"
.format(nt))
check_nt_key(nt.name)
for pair in nt.args:
if (not isinstance(pair, tuple)
or len(pair) != 2
or not isinstance(pair[0], str)
or not isinstance(pair[1], bool)):
raise TypeError(
"invalid grammar: expected tuple((str, bool)) args, got {!r}"
.format(nt))
elif isinstance(nt, InitNt):
# Users don't include init nonterminals when initially creating
# a Grammar. They are automatically added below. But if this
# Grammar is being created by hacking on a previous Grammar, it
# will already have them.
if not isinstance(nt.goal, Nt):
raise TypeError(
"invalid grammar: InitNt.goal should be a nonterminal, "
"got {!r}"
.format(nt))
# nt.goal is a "use", not a "def". Check it like a use.
# Bogus question marks appear in error messages :-|
validate_element(nt, '?', '?', nt.goal, ())
if nt.goal not in my_goal_nts:
raise TypeError(
"invalid grammar: nonterminal referenced by InitNt "
"is not in the list of goals: {!r}"
.format(nt))
else:
raise TypeError(
"invalid grammar: expected string keys in nonterminals dict, got {!r}"
.format(nt))
def validate_nt(
nt: LenientNt,
nt_def: LenientNtDef
) -> typing.Tuple[LenientNt, NtDef]:
check_nt_key(nt)
if isinstance(nt, InitNt):
# Check the form of init productions. Initially these look like
# [[goal]], but after the pipeline goes to work, they can be
# [[Optional(goal)]] or [[], [goal]].
if not isinstance(nt_def, NtDef):
raise TypeError(
"invalid grammar: key {!r} must map to "
"value of type NtDef, not {!r}"
.format(nt, nt_def))
rhs_list = nt_def.rhs_list
g = nt.goal
if (rhs_list != [Production([g], 0),
Production([Nt(nt, ()), End()], 'accept')]
and rhs_list != [Production([Optional(g)], 0),
Production([Nt(nt, ()), End()], 'accept')]
and rhs_list != [Production([End()], 'accept'),
Production([g, End()], 'accept'),
Production([Nt(nt, ()), End()], 'accept')]):
raise ValueError(
"invalid grammar: grammar[{!r}] is not one of "
"the expected forms: got {!r}"
.format(nt, rhs_list))
return nt, copy_nt_def(nt, nt_def)
self.nonterminals = {}
for nt1, nt_def1 in my_nonterminals.items():
nt, nt_def = validate_nt(nt1, nt_def1)
self.nonterminals[nt] = nt_def
for syn_term_name, t_set in synthetic_terminals.items():
nt_def = NtDef((syn_term_name,), [Production([e], 0) for e in t_set], None)
nt, nt_def = validate_nt(syn_term_name, nt_def)
self.nonterminals[nt] = nt_def
# Remove synthetic terminals from the list of terminals.
all_terminals.remove(syn_term_name)
self.terminals = OrderedFrozenSet(all_terminals)
# Check types of reduce expressions and infer method types. But if the
# caller passed in precalculated type info, skip it -- otherwise we
# would redo type checking many times as we make minor changes to the
# Grammar along the pipeline.
if method_types is None:
types.infer_types(self)
else:
for nt, nt_def in self.nonterminals.items():
assert isinstance(nt_def, NtDef)
assert isinstance(nt_def.type, types.Type)
self.methods = method_types
# Synthesize "init" nonterminals.
self.init_nts = []
for goal in my_goal_nts:
# Convert str goals to Nt objects and validate.
if isinstance(goal, str):
ok = goal in str_to_nt
if ok:
goal = str_to_nt[goal]
elif isinstance(goal, Nt):
if keys_are_nt:
ok = goal in my_nonterminals
else:
ok = goal.name in my_nonterminals
if not ok:
raise ValueError(
"goal nonterminal {!r} is undefined".format(goal))
assert isinstance(goal, Nt)
# Weird, but the key of an init nonterminal really is
# `Nt(InitNt(Nt(goal_name, goal_args)), ())`. It takes no arguments,
# but it refers to a goal that might take arguments.
init_nt = InitNt(goal)
init_key: LenientNt = init_nt
goal_nt = Nt(init_nt, ())
if keys_are_nt:
init_key = goal_nt
if init_key not in self.nonterminals:
self.nonterminals[init_key] = NtDef(
(),
[Production([goal], 0),
Production([goal_nt, End()], 'accept')],
types.NoReturnType)
self.init_nts.append(goal_nt)
# Add the various execution backends which would rely on the same parse table.
self.exec_modes = exec_modes
self.type_to_modes = type_to_modes
    # The argument is a list of extension.GrammarExtensions. The annotation is
    # vague because this module does not import the extension module. It would
    # be a cyclic dependency.
    def patch(self, extensions: typing.List) -> None:
        """Apply grammar extensions in place.

        For each extension: register its target trait under every execution
        mode that uses the trait's implementing type, then let the extension
        rewrite a working copy of the nonterminals dict. The copy is swapped
        in only after all extensions have run. No-op for an empty list.
        """
        # Both maps must have been supplied to __init__ for patching to work.
        assert self.type_to_modes is not None
        assert self.exec_modes is not None
        if extensions == []:
            return
        # Copy of nonterminals which would be mutated by the patches.
        nonterminals = copy.copy(self.nonterminals)
        for ext in extensions:
            # Add the given trait to the execution mode, depending on which
            # type it got implemented for.
            for mode in self.type_to_modes[ext.target.for_type]:
                self.exec_modes[mode].add(ext.target.trait)
            # Apply grammar transformations.
            ext.apply_patch(self, nonterminals)
        # Replace with the modified version of nonterminals
        self.nonterminals = nonterminals
def intern(self, obj: Internable) -> Internable:
"""Return a shared copy of the immutable object `obj`.
This saves memory and consistent use allows code to use `is` for
equality testing.
"""
try:
return self._cache[obj]
except KeyError:
self._cache[obj] = obj
return obj
    # Terminals are tokens that must appear verbatim in the input wherever they
    # appear in the grammar, like the operators '+' '-' '*' '/' and brackets
    # '(' ')' in the example grammar.
    def is_terminal(self, element: object) -> bool:
        """True if `element` is a terminal symbol.

        Uses `type(...) is str` rather than isinstance — presumably so str
        subclasses used elsewhere don't count as terminals; confirm before
        relaxing.
        """
        return type(element) is str
def expand_set_of_terminals(
self,
terminals: typing.Iterable[typing.Union[str, None, ErrorSymbol]]
) -> OrderedSet[typing.Union[str, None, ErrorSymbol]]:
"""Copy a set of terminals, replacing any synthetic terminals with their representations.
Returns a new OrderedSet.
terminals - an iterable of terminals and/or other unique elements like
None or ErrorSymbol.
"""
result: OrderedSet[typing.Union[str, None, ErrorSymbol]] = OrderedSet()
for t in terminals:
if isinstance(t, str) and t in self.synthetic_terminals:
result |= self.expand_set_of_terminals(self.synthetic_terminals[t])
else:
result.add(t)
return result
    def goals(self) -> typing.List[Nt]:
        """Return a list of this grammar's goal nonterminals.

        Each element of self.init_nts is Nt(InitNt(goal), ()); this unwraps
        the goal. (mypy can't see that .name is an InitNt here.)
        """
        return [init_nt.name.goal for init_nt in self.init_nts]  # type: ignore
    def with_nonterminals(
            self,
            nonterminals: typing.Mapping[LenientNt, LenientNtDef]
    ) -> Grammar:
        """Return a copy of self with the same attributes except different nonterminals.

        Passing self.methods as method_types skips re-running type inference
        in the new Grammar's constructor, so the new defs must already carry
        types — hence the assertions below.
        """
        if self.methods is not None:
            for nt_def in nonterminals.values():
                assert isinstance(nt_def, NtDef)
                assert nt_def.type is not None
        return Grammar(
            nonterminals,
            goal_nts=self.goals(),
            variable_terminals=self.variable_terminals,
            synthetic_terminals=self.synthetic_terminals,
            method_types=self.methods,
            exec_modes=self.exec_modes,
            type_to_modes=self.type_to_modes)
    # === A few methods for dumping pieces of grammar.

    def element_to_str(self, e: Element) -> str:
        """Render a single grammar element as it appears in dumps.

        The isinstance chain is order-sensitive: Nt must be tested before the
        terminal check, and specific element classes before the generic
        fallback.
        """
        if isinstance(e, Nt):
            return e.pretty()
        elif self.is_terminal(e):
            assert isinstance(e, str)
            if e in self.variable_terminals or e in self.synthetic_terminals:
                # Token classes (NUM, VAR, ...) are shown bare, not quoted.
                return e
            # Literal terminals are shown quoted; repr()[1:-1] yields the
            # escaped body without Python's surrounding quotes.
            return '"' + repr(e)[1:-1] + '"'
        elif isinstance(e, Optional):
            return self.element_to_str(e.inner) + "?"
        elif isinstance(e, LookaheadRule):
            if len(e.set) == 1:
                op = "==" if e.positive else "!="
                s = repr(list(e.set)[0])
            else:
                op = "in" if e.positive else "not in"
                s = '{' + repr(list(e.set))[1:-1] + '}'
            return "[lookahead {} {}]".format(op, s)
        elif isinstance(e, End):
            return "<END>"
        elif e is NoLineTerminatorHere:
            return "[no LineTerminator here]"
        elif isinstance(e, CallMethod):
            return "{{ {} }}".format(expr_to_str(e))
        else:
            return str(e)
def symbols_to_str(self, rhs: typing.Iterable[Element]) -> str:
return " ".join(self.element_to_str(e) for e in rhs)
    def rhs_to_str(self, rhs: LenientProduction) -> str:
        """Render a right-hand side, including a `#[if ...]` condition prefix
        when the Production is conditional. Accepts either a Production or a
        bare list of symbols."""
        if isinstance(rhs, Production):
            if rhs.condition is None:
                prefix = ''
            else:
                param, value = rhs.condition
                if value is True:
                    condition = "+" + param
                elif value is False:
                    condition = "~" + param
                else:
                    # Conditions are nominally booleans (see Condition), but
                    # render any other value generically just in case.
                    condition = "{} == {!r}".format(param, value)
                prefix = "#[if {}] ".format(condition)
            # Recurse on the bare body list (hits the branches below).
            return prefix + self.rhs_to_str(rhs.body)
        elif len(rhs) == 0:
            return "[empty]"
        else:
            return self.symbols_to_str(rhs)
    def nt_to_str(self, nt: LenientNt) -> str:
        """Render a nonterminal key; Nt objects get their pretty form,
        str/InitNt keys fall back to str()."""
        if isinstance(nt, Nt):
            return self.element_to_str(nt)
        else:
            return str(nt)
    def production_to_str(
            self,
            nt: LenientNt,
            rhs: LenientProduction,
            *reducer: ReduceExpr
    ) -> str:
        """Render a whole production as "nt ::= rhs [=> reducer]...".

        The reducer may be embedded in `rhs` (a Production) or passed
        separately in `*reducer`.
        """
        # As we have two ways of representing productions at the moment, just
        # take multiple arguments :(
        return "{} ::= {}{}".format(
            self.nt_to_str(nt),
            self.rhs_to_str(rhs),
            "".join(" => " + expr_to_str(expr) for expr in reducer))
    # The type of `item` is `lr0.LRItem`. No annotation because this module
    # does not import `lr0`. It would be a cyclic dependency.
    def lr_item_to_str(self, prods: typing.List, item: typing.Any) -> str:
        """Render one LR item: the production with a middle dot at the current
        offset, optional lookahead after ">>", and the follow set in braces."""
        prod = prods[item.prod_index]
        if item.lookahead is None:
            la = []
        else:
            la = [self.element_to_str(item.lookahead)]
        return "{} ::= {} >> {{{}}}".format(
            self.element_to_str(prod.nt),
            # Symbols before the dot, the dot itself, any lookahead, then the
            # symbols still to be matched.
            " ".join([self.element_to_str(e) for e in prod.rhs[:item.offset]]
                     + ["\N{MIDDLE DOT}"]
                     + la
                     + [self.element_to_str(e) for e in prod.rhs[item.offset:]]),
            # "$" marks end-of-input in the follow set.
            ", ".join(
                "$" if t is None else self.element_to_str(t)
                for t in item.followed_by)
        )
def item_set_to_str(
self,
prods: typing.List,
item_set: OrderedFrozenSet
) -> str:
return "{{{}}}".format(
", ".join(self.lr_item_to_str(prods, item) for item in item_set)
)
    def expand_terminal(self, t: str) -> OrderedFrozenSet[str]:
        """Return the set of concrete terminals `t` can stand for: its
        representation set if `t` is synthetic, else the singleton {t}.

        NOTE(review): an *empty* synthetic representation set is falsy and
        would fall through to {t} — confirm empty sets cannot occur here.
        """
        return self.synthetic_terminals.get(t) or OrderedFrozenSet([t])
    def compatible_elements(self, e1: Element, e2: Element) -> bool:
        """True if `e1` and `e2` could match the same token: either equal, or
        both terminals whose expansions (synthetic representation sets)
        overlap."""
        # "type: ignore" because mypy doesn't know that `self.is_terminal(e1)`
        # means `e1` is a terminal, and thus `self.expand_terminal(e1)` is OK.
        return (e1 == e2
                or (self.is_terminal(e1)
                    and self.is_terminal(e2)
                    and len(self.expand_terminal(e1)      # type: ignore
                            & self.expand_terminal(e2)) > 0))  # type: ignore
def compatible_sequences(
self,
seq1: typing.Sequence[Element],
seq2: typing.Sequence[Element]) -> bool:
"""True if the two sequences could be the same terminals."""
return (len(seq1) == len(seq1)
and all(self.compatible_elements(e1, e2) for e1, e2 in zip(seq1, seq2)))
    def dump(self) -> None:
        """Print the whole grammar to stdout in readable BNF-ish form."""
        for nt, nt_def in self.nonterminals.items():
            left_side = self.nt_to_str(nt)
            if nt_def.params:
                # Parameterized nonterminal: show its parameter list.
                left_side += "[" + ", ".join(nt_def.params) + "]"
            print(left_side + " ::=")
            for rhs in nt_def.rhs_list:
                print(" ", self.rhs_to_str(rhs))
            print()
    def dump_type_info(self) -> None:
        """Print inferred types: each nonterminal's type, then each builder
        method's signature."""
        for nt, nt_def in self.nonterminals.items():
            print(nt, nt_def.type)
        for name, mty in self.methods.items():
            print("fn {}({}) -> {}"
                  .format(name,
                          ", ".join(str(ty) for ty in mty.argument_types),
                          str(mty.return_type)))
def is_shifted_element(self, e: Element) -> bool:
if isinstance(e, Nt):
return True
elif self.is_terminal(e):
return True
elif isinstance(e, Optional):
return True
elif isinstance(e, LookaheadRule):
return False
elif isinstance(e, End):
return True
elif e is NoLineTerminatorHere:
return True
return False
@dataclass(frozen=True)
class InitNt:
    """InitNt(goal) is the name of the init nonterminal for the given goal.

    One init nonterminal is created internally for each goal symbol in the grammar.

    The idea is to have a nonterminal that the user has no control over, that is
    never used in any production, but *only* as an entry point for the grammar,
    that always has a single production "init_nt ::= goal_nt". This predictable
    structure makes it easier to get into and out of parsing at run time.

    When an init nonterminal is matched, we take the "accept" action rather than
    a "reduce" action.
    """

    # The goal nonterminal this entry point leads to.
    goal: Nt
# *** Elements ****************************************************************
#
# Elements are the things that can appear in the .body list of a Production:
#
# * Strings represent terminals (see `Grammar.is_terminal`)
#
# * `Nt` objects refer to nonterminals.
#
# * `Optional` objects represent optional elements.
#
# * `LookaheadRule` objects are like lookahead assertions in regular
# expressions.
#
# * The `NoLineTerminatorHere` singleton object can appear between two other
# symbols to rule out line breaks between them.
#
# * `ErrorSymbol` objects never match anything produced by the lexer. Instead
# they match an ErrorToken that's artificially injected into the token
# stream at runtime, by the parser itself, just before a token that does
# not match anything else.
def is_concrete_element(e: Element) -> bool:
    """True if parsing the element `e` produces a value.

    A production's concrete elements can be used in reduce expressions.
    LookaheadRule, ErrorSymbol, and NoLineTerminatorHere match no input and
    so are excluded (they don't count toward reducer element indexes).
    """
    return not isinstance(e, (LookaheadRule, ErrorSymbol, NoLineTerminatorHereClass))
# Nonterminals in the ECMAScript grammar can be parameterized; NtParameter is
# the type of the parameters.
#
# For example, `BindingIdentifier[?Yield, ?Await]` is represented as
# `Nt('BindingIdentifier', (('Yield', Var('Yield')), ('Await', Var('Await'))))`.
#
# A nonterminal-parameter-expression is represented by either a Var object or
# the actual value, a boolean. (In theory, parameters don't *have* to be
# boolean; all the code would probably work for anything hashable. In practice,
# all parameters in the ECMAScript grammar are boolean.)
# Value bound to a nonterminal parameter: a bool or a Var in practice, but
# anything hashable is accepted (see the comment above).
NtParameter = typing.Hashable
class Nt:
    """Nt(name, ((param0, arg0), ...)) - An invocation of a nonterminal.

    Nonterminals are like lambdas: each one in a grammar is defined by an
    NtDef with 0 or more parameters. Parameter names `param0...` are strings;
    the actual arguments `arg0...` are NtParameters (see above).

    Instances hash and compare by (name, args), so they can be used as
    dictionary keys.
    """

    __slots__ = ['name', 'args']

    name: typing.Union[str, InitNt]
    args: typing.Tuple[typing.Tuple[str, NtParameter], ...]

    def __init__(self,
                 name: typing.Union[str, InitNt],
                 args: typing.Tuple[typing.Tuple[str, NtParameter], ...] = ()):
        self.name = name
        self.args = args

    def __hash__(self) -> int:
        # 'nt' tag keeps Nt hashes distinct from plain (name, args) tuples.
        return hash(('nt', self.name, self.args))

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Nt):
            return False
        return self.name == other.name and self.args == other.args

    def __repr__(self) -> str:
        if not self.args:
            return 'Nt({!r})'.format(self.name)
        return 'Nt({!r}, {!r})'.format(self.name, self.args)

    def pretty(self) -> str:
        """Unique version of this Nt to use in the Python runtime.

        Also used in debug/verbose output.
        """
        def show_arg(param: str, val: NtParameter) -> str:
            # +P / ~P for literal booleans, ?P for pass-through variables,
            # P=value otherwise.
            if val is True:
                return '+' + param
            if val is False:
                return '~' + param
            if isinstance(val, Var):
                if val.name == param:
                    return '?' + val.name
                return param + "=" + val.name
            return param + "=" + repr(val)

        if isinstance(self.name, InitNt):
            return "Start_" + self.name.goal.pretty()
        if not self.args:
            return self.name
        return "{}[{}]".format(
            self.name,
            ", ".join(show_arg(param, val) for param, val in self.args))
@dataclass(frozen=True)
class Optional:
    """Optional(nt) matches either nothing or the given nt.

    Optional elements are expanded out before states are calculated, so the
    core of the algorithm never sees them.
    """

    # The element that may or may not appear.
    inner: Element
@dataclass(frozen=True)
class Literal:
    """Literal(str) matches a sequence of characters.

    Literal elements are sequences of characters which are expected to appear
    verbatim in the input.
    """

    # The exact character sequence to match.
    text: str
@dataclass(frozen=True)
class UnicodeCategory:
    """UnicodeCategory(str) matches any character with a category matching
    the cat_prefix.

    UnicodeCategory elements are a set of literal elements which correspond to
    a given unicode cat_prefix.
    """

    # Prefix of the Unicode general category to match against.
    cat_prefix: str
@dataclass(frozen=True)
class LookaheadRule:
    """LookaheadRule(set, pos) imposes a lookahead restriction on whatever follows.

    It never consumes any tokens itself. Instead, the right-hand side

        [LookaheadRule(frozenset(['a', 'b']), False), 'Thing']

    matches a Thing that does not start with the token `a` or `b`.
    """

    # The terminals this restriction refers to.
    set: typing.FrozenSet[str]
    # True: only terminals in `set` may follow.
    # False: any terminal *except* those in `set` may follow.
    positive: bool
# A lookahead restriction really just specifies a set of allowed terminals.
#
# - No lookahead restriction at all is equivalent to a rule specifying all terminals.
#
# - A positive lookahead restriction explicitly lists all allowed tokens.
#
# - A negative lookahead restriction instead specifies the set of all tokens
# except a few.
#
def lookahead_contains(rule: typing.Optional[LookaheadRule], t: str) -> bool:
    """True if the given lookahead restriction `rule` allows the terminal `t`."""
    if rule is None:
        # No restriction at all: every terminal is allowed.
        return True
    if rule.positive:
        return t in rule.set
    return t not in rule.set
def lookahead_intersect(
        a: typing.Optional[LookaheadRule],
        b: typing.Optional[LookaheadRule]
) -> typing.Optional[LookaheadRule]:
    """Returns a single rule enforcing both `a` and `b`, allowing only terminals that pass both."""
    # None means "no restriction", so the other rule wins unchanged.
    if a is None:
        return b
    if b is None:
        return a
    # Both restrictions present: combine the allowed sets.
    if a.positive and b.positive:
        return LookaheadRule(a.set & b.set, True)
    if a.positive:
        # b excludes its set, so drop those from a's allowed set.
        return LookaheadRule(a.set - b.set, True)
    if b.positive:
        # a excludes its set, so drop those from b's allowed set.
        return LookaheadRule(b.set - a.set, True)
    # Both negative: exclude the union.
    return LookaheadRule(a.set | b.set, False)
class NoLineTerminatorHereClass:
    """Class of the NoLineTerminatorHere singleton defined just below."""

    def __str__(self) -> str:
        return 'NoLineTerminatorHere'


# Singleton element; placed between two symbols it rules out line breaks
# between them (see the module comment above).
NoLineTerminatorHere = NoLineTerminatorHereClass()
# NOTE(review): this comment appears to belong with `Optional`, defined
# above — Optional elements are expanded out before states are calculated,
# so the core of the algorithm never sees them.
@dataclass(frozen=True)
class Exclude:
    """Exclude(nt1, nt2) matches if nt1 matches and nt2 does not."""

    # The element that must match.
    inner: Element
    # Elements that must all fail to match for this to succeed.
    exclusion_list: typing.Tuple[Element, ...]
# End. This is used to represent the terminal which is infinitely produced by
# the lexer when input end is reached.
@dataclass(frozen=True)
class End:
    """End() represents the end of the input content."""
# Special grammar symbol that can be consumed to handle a syntax error.
#
# The error code is passed to an error-handling routine at run time which
# decides if the error is recoverable or not.
@dataclass(frozen=True)
class ErrorSymbol:
    """Special grammar symbol that can be consumed to handle a syntax error."""

    # Identifies this recovery point; handed to the error routine at run time.
    error_code: int
# Element: the type of a single right-hand-side symbol in a production.
# Plain strings name terminals or (in the lenient input format accepted by
# Grammar) nonterminals; see the module comment at the top of the file.
Element = typing.Union[
    str,
    Optional,
    Literal,
    UnicodeCategory,
    Exclude,
    Nt,
    LookaheadRule,
    End,
    ErrorSymbol,
    NoLineTerminatorHereClass,
    CallMethod]
@dataclass
class NtDef:
    """Definition of a nonterminal.

    Instances have three attributes:

    .params - Tuple of strings, the names of the parameters.

    .rhs_list - List of Production objects. Arguments to Nt elements in the
        productions can be Var(s) where `s in params`, indicating that
        parameter should be passed through unchanged.

    .type - The type of runtime value produced by parsing an instance of this
        nonterminal, or None.

    An NtDef is a sort of lambda.

    Some languages have constructs that are allowed or disallowed in
    particular situations. For example, in many languages `return` statements
    are allowed only inside functions or methods. The ECMAScript standard
    (5.1.5 "Grammar Notation") offers this example of the notation it uses to
    specify this sort of thing:

        StatementList [Return] :
            [+Return] ReturnStatement
            ExpressionStatement

    This is an abbreviation for:

        StatementList :
            ExpressionStatement

        StatementList_Return :
            ReturnStatement
            ExpressionStatement

    We offer NtDef.params as a way of representing this in our system.

        "StatementList": NtDef(("Return",), [
            Production(["ReturnStatement"], condition=("Return", True)),
            ["ExpressionStatement"],
        ], None),

    This is an abbreviation for:

        "StatementList_0": [
            ["ExpressionStatement"],
        ],
        "StatementList_1": [
            ["ReturnStatement"],
            ["ExpressionStatement"],
        ],
    """

    __slots__ = ['params', 'rhs_list', 'type']

    # Names of this nonterminal's parameters (empty tuple if unparameterized).
    params: typing.Tuple[str, ...]
    # The productions (right-hand sides) of this nonterminal.
    rhs_list: typing.List[Production]
    # Type of runtime value produced by parsing this nonterminal, or None.
    type: typing.Optional[types.Type]

    def with_rhs_list(self, new_rhs_list: typing.List[Production]) -> NtDef:
        # Return a copy of this NtDef, identical except for rhs_list.
        return dataclasses.replace(self, rhs_list=new_rhs_list)
@dataclass(frozen=True)
class Var:
    """Var(name) represents the run-time value of the parameter with the given name."""

    # Name of the nonterminal parameter being referenced.
    name: str
# A reduce expression computes a production's value. An int refers to one of
# the production's concrete elements by index (the `$0`, `$2` in the module
# comment above); CallMethod and Some wrap nested expressions; None is the
# literal value None.
ReduceExpr = typing.Union[int, CallMethod, None, Some]

# NOTE(review): the extra `str` alternative is presumably the "accept" action
# suggested by the alias name — confirm at use sites.
ReduceExprOrAccept = typing.Union[ReduceExpr, str]

# The Grammar constructor is very lax about the types you pass to it. It can
# accept a `Dict[str, List[List[str]]]`, for example; it copies the data into
# NtDef and Production objects.
#
# The `Lenient` types below are the relaxed types that Grammar() actually
# accepts.
LenientNt = typing.Union[Nt, str, InitNt]
LenientProduction = typing.Union[Production, typing.List[Element]]
LenientNtDef = typing.Union[NtDef, typing.List[LenientProduction]]