"""Parse a grammar written in ECMArkup."""
from __future__ import annotations
# mypy: no-implicit-optional
import os
import collections
from typing import Dict, Iterable, Optional, Tuple
from jsparagus import parse_pgen, gen, grammar, extension, types
from jsparagus.lexer import LexicalGrammar
from jsparagus.ordered import OrderedSet, OrderedFrozenSet
ESGrammarLexer = LexicalGrammar(
# the operators and keywords:
"[ ] { } , ~ + ? <! = == != => ( ) @ < > ' ; "
"but empty here lookahead no not of one or returns through Some None impl for let",
NL="\n",
# any number of colons together
EQ=r':+',
# terminals of the ES grammar, quoted with backticks
T=r'`[^` \n]+`|```',
# also terminals, denoting control characters
CHR=r'<[A-Z ]+>|U\+[0-9A-F]{4}',
# nonterminals/types that will be followed by parameters
NTCALL=r'[A-Za-z]\w*(?=[\[<])',
# nonterminals (also, boolean parameters and type names)
NT=r'[A-Za-z]\w*',
# nonterminals wrapped in vertical bars for no apparent reason
NTALT=r'\|[A-Z]\w+\|',
# the spec also gives a few productions names
PRODID=r'#[A-Za-z]\w*',
# prose not wrapped in square brackets
# To avoid conflict with the `>` token, this is recognized only after a space.
PROSE=r'(?<= )>[^\n]*',
# prose wrapped in square brackets
WPROSE=r'\[>[^]]*\]',
# expression denoting a matched terminal or nonterminal
MATCH_REF=r'\$(?:0|[1-9][0-9]*)',
# Rust-style line comments, used in grammar extension (EDSL) code
RUSTCOMMENT=r'//.*\n',
)
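# A rough, self-contained illustration (not used by the parser) of what a few
# of the token classes above are intended to match. The regular expressions
# are repeated literally from the definitions above.
def _example_token_classes():
    import re
    assert re.fullmatch(r'`[^` \n]+`|```', '`var`')             # T: a quoted terminal
    assert re.fullmatch(r'<[A-Z ]+>|U\+[0-9A-F]{4}', 'U+2028')  # CHR: a code point
    assert re.fullmatch(r'\$(?:0|[1-9][0-9]*)', '$3')           # MATCH_REF: a match reference
    # NTCALL matches a name only when it is followed by '[' or '<'.
    assert re.match(r'[A-Za-z]\w*(?=[\[<])', 'Statement[Yield]').group() == 'Statement'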
ESGrammarParser = gen.compile(
parse_pgen.load_grammar(
os.path.join(os.path.dirname(__file__), "esgrammar.pgen")))
SIGIL_FALSE = '~'
SIGIL_TRUE = '+'
# Abbreviations for single-character terminals, used in the lexical grammar.
ECMASCRIPT_CODE_POINTS = {
'<ZWNJ>': grammar.Literal('\u200c'),
'<ZWJ>': grammar.Literal('\u200d'),
'<ZWNBSP>': grammar.Literal('\ufeff'),
'<TAB>': grammar.Literal('\t'),
'<VT>': grammar.Literal('\u000b'),
'<FF>': grammar.Literal('\u000c'),
'<SP>': grammar.Literal(' '),
'<NBSP>': grammar.Literal('\u00a0'),
# <ZWNBSP> already defined above
'<USP>': grammar.UnicodeCategory('Zs'),
'<LF>': grammar.Literal('\u000a'),
'<CR>': grammar.Literal('\u000d'),
'<LS>': grammar.Literal('\u2028'),
'<PS>': grammar.Literal('\u2029'),
}
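# Illustrative sketch only: spec rules written with `<TAB>` or `<USP>` resolve
# to a concrete matcher here, either a single-character Literal or a Unicode
# general-category matcher.
def _example_code_point_abbreviation():
    tab = ECMASCRIPT_CODE_POINTS['<TAB>']   # grammar.Literal('\t')
    usp = ECMASCRIPT_CODE_POINTS['<USP>']   # grammar.UnicodeCategory('Zs')
    return tab, usp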
class ESGrammarBuilder:
def __init__(self, terminal_names):
# Names of terminals that are written as nonterminals in the grammar.
# For example, "BooleanLiteral" is a terminal name when parsing the
# syntactic grammar.
if terminal_names is None:
terminal_names = frozenset()
self.terminal_names = frozenset(terminal_names)
self.reset()
def reset(self):
self.lexer = None
# This is how full parsing and lazy parsing are implemented: with
# different traits.
#
# This field holds the Rust trait on which generated methods are called.
# When a CallMethod is generated, it is assumed to be a function of
# this trait. The Rust backend uses the trait to generate multiple
# backends, each implementing a different set of traits. Recording the
# trait on each function call makes it possible to filter function
# calls at code-generation time.
#
# This field is updated by `rust_param_impl`, which is used in grammar
# extensions and is visited before any CallMethod is produced.
self.method_trait = "AstBuilder"
def rust_edsl(self, impl, grammar):
return extension.GrammarExtension(impl, grammar, self.lexer.filename)
def rust_param_impl(self, trait, for_type, param):
self.method_trait = trait
return extension.ImplFor(param, trait, for_type)
def rust_impl(self, trait, impl_type):
return self.rust_param_impl(trait, impl_type, [])
def rust_nt_def(self, lhs, rhs_line):
# For now, focus only on the syntactic grammar, and assume that every
# rule patches an existing grammar production by adding code.
return extension.ExtPatch(self.nt_def(None, lhs, ':', [rhs_line]))
def rust_rhs_line(self, symbols):
return self.rhs_line(None, symbols, None, None)
def rust_expr(self, expr):
assert isinstance(expr, grammar.CallMethod)
return expr
def empty(self):
return []
def single(self, x):
return [x]
def append(self, x, y):
return x + [y]
def concat(self, x, y):
return x + y
def blank_line(self):
return []
def nt_def_to_list(self, nt_def):
return [nt_def]
def to_production(self, lhs, i, rhs, is_sole_production):
"""Wrap a list of grammar symbols `rhs` in a Production object."""
body, reducer, condition = rhs
if reducer is None:
reducer = self.default_reducer(lhs, i, body, is_sole_production)
return grammar.Production(body, reducer, condition=condition)
def default_reducer(self, lhs, i, body, is_sole_production):
assert isinstance(lhs, grammar.Nt)
nt_name = lhs.name
nargs = sum(1 for e in body if grammar.is_concrete_element(e))
if is_sole_production:
method_name = nt_name
else:
method_name = '{} {}'.format(nt_name, i)
return self.expr_call(method_name, tuple(range(nargs)), None)
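# For example (illustrative only): if `Statement` has three right-hand sides
# and no explicit reducers, the autogenerated reducers call the methods
# "Statement 0", "Statement 1", and "Statement 2"; a nonterminal with a single
# production calls plain "Statement".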
def needs_asi(self, lhs, p):
"""True if p is a production in which ASI can happen."""
# The purpose of the fake ForLexicalDeclaration production is to have a
# copy of LexicalDeclaration that does not trigger ASI.
#
# Two productions have body == [";"] -- one for EmptyStatement and one
# for ClassMember. Neither should trigger ASI.
#
# The only other semicolons that should not trigger ASI are the ones in
# `for` statement productions, which happen to be exactly those
# semicolons that are not at the end of a production.
return (not (isinstance(lhs, grammar.Nt)
and lhs.name == 'ForLexicalDeclaration')
and len(p.body) > 1
and p.body[-1] == ';')
def apply_asi(self, p, reducer_was_autogenerated):
"""Return two rules based on p, so that ASI can be applied."""
assert isinstance(p.reducer, grammar.CallMethod)
if reducer_was_autogenerated:
# Don't pass the semicolon to the method.
reducer = self.expr_call(p.reducer.method,
p.reducer.args[:-1],
None)
else:
reducer = p.reducer
# Except for do-while loops, check at runtime that ASI occurs only at
# the end of a line.
if (len(p.body) == 7
and p.body[0] == 'do'
and p.body[2] == 'while'
and p.body[3] == '('
and p.body[5] == ')'
and p.body[6] == ';'):
code = "do_while_asi"
else:
code = "asi"
return [
# The preferred production, with the semicolon in.
p.copy_with(body=p.body[:],
reducer=reducer),
# The fallback production, performing ASI.
p.copy_with(body=p.body[:-1] + [grammar.ErrorSymbol(code)],
reducer=reducer),
]
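# Illustration (roughly): for a production ending in `;`, such as
# `VariableStatement : `var` VariableDeclarationList `;``, this yields the
# preferred production with the literal `;` plus a fallback production ending
# in ErrorSymbol("asi"), letting the parser insert a virtual semicolon during
# error recovery; do-while statements use ErrorSymbol("do_while_asi") instead.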
def expand_lexical_rhs(self, rhs):
body, reducer, condition = rhs
out = []
for e in body:
if isinstance(e, str):
# The terminal symbols of the lexical grammar are characters, so
# add each character of this string as a separate element.
out += [grammar.Literal(ch) for ch in e]
else:
out.append(e)
return [out, reducer, condition]
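# Example: a lexical-grammar terminal written as `!==` expands to
# [Literal('!'), Literal('='), Literal('=')], since the terminal symbols of
# the lexical grammar are single characters.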
def nt_def(self, nt_type, lhs, eq, rhs_list):
has_sole_production = (len(rhs_list) == 1)
production_list = []
for i, rhs in enumerate(rhs_list):
if eq == ':':
# Syntactic grammar. A hack is needed for ASI.
reducer_was_autogenerated = rhs[1] is None
p = self.to_production(lhs, i, rhs, has_sole_production)
if self.needs_asi(lhs, p):
production_list += self.apply_asi(p, reducer_was_autogenerated)
else:
production_list.append(p)
elif eq == '::':
# Lexical grammar. A hack is needed to replace multicharacter
# terminals like `!==` with sequences of single-character terminals.
rhs = self.expand_lexical_rhs(rhs)
p = self.to_production(lhs, i, rhs, has_sole_production)
production_list.append(p)
return (lhs.name, eq, grammar.NtDef(lhs.args, production_list, nt_type))
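# The result is a (name, eq, NtDef) triple; finish_grammar() later uses `eq`
# (":" vs. "::") to decide whether the definition belongs to the syntactic or
# the lexical grammar.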
def nt_def_one_of(self, nt_type, nt_lhs, eq, terminals):
return self.nt_def(nt_type, nt_lhs, eq, [([t], None, None) for t in terminals])
def nt_lhs_no_params(self, name):
return grammar.Nt(name, ())
def nt_lhs_with_params(self, name, params):
return grammar.Nt(name, tuple(params))
def simple_type(self, name):
return types.Type(name)
def lifetime_type(self, name):
return types.Lifetime(name)
def parameterized_type(self, name, args):
return types.Type(name, tuple(args))
def t_list_line(self, terminals):
return terminals
def terminal(self, t):
assert t[0] == "`"
assert t[-1] == "`"
return t[1:-1]
def terminal_chr(self, chr):
raise ValueError("FAILED: %r" % chr)
def rhs_line(self, ifdef, rhs, reducer, _prodid):
return (rhs, reducer, ifdef)
def rhs_line_prose(self, prose):
return ([prose], None, None)
def empty_rhs(self):
return []
def expr_match_ref(self, token):
assert token.startswith('$')
return int(token[1:])
def expr_call(self, method, args, fallible):
# NOTE: Currently "AstBuilder" functions are made fallible using the
# fallible_methods taken from some Rust code which extract this
# information to produce a JSON file.
if self.method_trait == "AstBuilder":
fallible = None
return grammar.CallMethod(method, args or (), types.Type(self.method_trait),
fallible is not None)
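# Example: while building an `impl ... for AstBuilder` (the default trait),
# `fallible` is cleared above, so the CallMethod is emitted as infallible and
# fallibility is decided later from fallible_methods; for any other trait the
# `fallible` argument is honored as given.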
def expr_some(self, expr):
return grammar.Some(expr)
def expr_none(self):
return None
def ifdef(self, value, nt):
return nt, value
def optional(self, nt):
return grammar.Optional(nt)
def but_not(self, nt, exclusion):
_, exclusion = exclusion
return grammar.Exclude(nt, [exclusion])
# return ('-', nt, exclusion)
def but_not_one_of(self, nt, exclusion_list):
exclusion_list = [exclusion for _, exclusion in exclusion_list]
return grammar.Exclude(nt, exclusion_list)
# return ('-', nt, exclusion_list)
def no_line_terminator_here(self, lt):
if lt not in ('LineTerminator', '|LineTerminator|'):
raise ValueError("unrecognized directive " + repr("[no " + lt + " here]"))
return grammar.NoLineTerminatorHere
def nonterminal(self, name):
if name in self.terminal_names:
return name
return grammar.Nt(name, ())
def nonterminal_apply(self, name, args):
if name in self.terminal_names:
raise ValueError("parameters applied to terminal {!r}".format(name))
if len(set(k for k, expr in args)) != len(args):
raise ValueError("parameter passed multiple times")
return grammar.Nt(name, tuple(args))
def arg_expr(self, sigil, argname):
if sigil == '?':
return (argname, grammar.Var(argname))
else:
return (argname, sigil)
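# Examples (illustrative): `+In` yields ('In', True), `~Yield` yields
# ('Yield', False), and `?Await` yields ('Await', Var('Await')), forwarding
# the enclosing production's Await parameter.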
def sigil_false(self):
return False
def sigil_true(self):
return True
def exclusion_terminal(self, t):
return ("t", t)
def exclusion_nonterminal(self, nt):
return ("nt", nt)
def exclusion_chr_range(self, c1, c2):
return ("range", c1, c2)
def la_eq(self, t):
return grammar.LookaheadRule(OrderedFrozenSet([t]), True)
def la_ne(self, t):
return grammar.LookaheadRule(OrderedFrozenSet([t]), False)
def la_not_in_nonterminal(self, nt):
return grammar.LookaheadRule(OrderedFrozenSet([nt]), False)
def la_not_in_set(self, lookahead_exclusions):
if all(len(excl) == 1 for excl in lookahead_exclusions):
return grammar.LookaheadRule(
OrderedFrozenSet(excl[0] for excl in lookahead_exclusions),
False)
raise ValueError("unsupported: lookahead > 1 token, {!r}"
.format(lookahead_exclusions))
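# Roughly: a restriction like `[lookahead <! {`let`, `class`}]` becomes
# LookaheadRule(OrderedFrozenSet(['let', 'class']), False); exclusions that
# are sequences of more than one token are rejected above.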
def chr(self, t):
assert t[0] == "<" or t[0] == 'U'
if t[0] == "<":
assert t[-1] == ">"
if t not in ECMASCRIPT_CODE_POINTS:
raise ValueError("unrecognized character abbreviation {!r}".format(t))
return ECMASCRIPT_CODE_POINTS[t]
else:
assert t[1] == "+"
return grammar.Literal(chr(int(t[2:], base=16)))
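# Sketch of how a few of the builder callbacks behave in isolation. In real
# use the generated ESGrammarParser invokes these methods as reduce actions;
# this helper exists purely as an illustration and is not called anywhere.
def _example_builder_callbacks():
    builder = ESGrammarBuilder(terminal_names=frozenset())
    assert builder.terminal('`let`') == 'let'     # backticks are stripped
    assert builder.expr_match_ref('$2') == 2      # $n refers to stack slot n
    assert builder.sigil_true() is True and builder.sigil_false() is False
    return builder.optional(grammar.Nt('Expression', ()))  # Expression?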
def finish_grammar(nt_defs, goals, variable_terminals, synthetic_terminals,
single_grammar=True, extensions=[]):
nt_grammars = {}
for nt_name, eq, _ in nt_defs:
if nt_name in nt_grammars:
raise ValueError(
"duplicate definitions for nonterminal {!r}"
.format(nt_name))
nt_grammars[nt_name] = eq
# Figure out which grammar we were trying to get (":" for syntactic,
# "::" for lexical) based on the goal symbols.
goals = list(goals)
if len(goals) == 0:
raise ValueError("no goal nonterminals specified")
if single_grammar:
selected_grammars = set(nt_grammars[goal] for goal in goals)
assert len(selected_grammars) != 0
if len(selected_grammars) > 1:
raise ValueError(
"all goal nonterminals must be part of the same grammar; "
"got {!r} (matching these grammars: {!r})"
.format(set(goals), set(selected_grammars)))
[selected_grammar] = selected_grammars
terminal_set = set()
def hack_production(p):
for i, e in enumerate(p.body):
if isinstance(e, str) and e[:1] == "`":
if len(e) < 3 or e[-1:] != "`":
raise ValueError(
"Unrecognized grammar symbol: {!r} (in {!r})"
.format(e, p))
p.body[i] = token = e[1:-1]
terminal_set.add(token)
nonterminals = {}
for nt_name, eq, rhs_list_or_lambda in nt_defs:
if single_grammar and eq != selected_grammar:
continue
if isinstance(rhs_list_or_lambda, grammar.NtDef):
nonterminals[nt_name] = rhs_list_or_lambda
else:
rhs_list = rhs_list_or_lambda
for p in rhs_list:
if not isinstance(p, grammar.Production):
raise ValueError(
"invalid grammar: ifdef in non-function-call context")
hack_production(p)
if nt_name in nonterminals:
raise ValueError(
"unsupported: multiple definitions for nt " + nt_name)
nonterminals[nt_name] = rhs_list
for t in terminal_set:
if t in nonterminals:
raise ValueError(
"grammar contains both a terminal `{}` and nonterminal {}"
.format(t, t))
# Register the execution modes used to generate the various functions
# needed for syntax parsing and full parsing.
exec_modes = collections.defaultdict(OrderedSet)
noop_parser = types.Type("ParserTrait", (types.Lifetime("alloc"), types.UnitType))
token_parser = types.Type("ParserTrait", (
types.Lifetime("alloc"), types.Type("StackValue", (types.Lifetime("alloc"),))))
ast_builder = types.Type("AstBuilderDelegate", (types.Lifetime("alloc"),))
# Full parsing takes tokens as input and builds an AST.
exec_modes["full_actions"].extend([token_parser, ast_builder])
# Syntax parsing takes tokens as input but skips building the AST.
# TODO: The syntax parser is commented out for now, as we need something to
# produce when we cannot call the AstBuilder to produce the values.
# No-op parsing is used for the simulator, which is currently used to query
# whether the incremental input can end here and whether a state can accept
# certain kinds of tokens.
exec_modes["noop_actions"].add(noop_parser)
# Extensions use Rust-like type names to specify which kind of parser they
# apply to; this map converts those type names to the corresponding
# execution modes.
full_parser = types.Type("FullParser")
syntax_parser = types.Type("SyntaxParser")
noop_parser = types.Type("NoopParser")
type_to_modes = {
noop_parser: ["noop_actions", "full_actions"],
syntax_parser: ["full_actions"],
full_parser: ["full_actions"],
}
result = grammar.Grammar(
nonterminals,
goal_nts=goals,
variable_terminals=variable_terminals,
synthetic_terminals=synthetic_terminals,
exec_modes=exec_modes,
type_to_modes=type_to_modes)
result.patch(extensions)
return result
def parse_esgrammar(
text: str,
*,
filename: Optional[str] = None,
extensions: Iterable[Tuple[os.PathLike, int, str]] = (),
goals: Optional[Iterable[str]] = None,
terminal_names: Iterable[str] = (),
synthetic_terminals: Optional[Dict[str, OrderedSet[str]]] = None,
single_grammar: bool = True
) -> grammar.Grammar:
if not text.endswith("\n\n"):
# Horrible hack: add a blank line at the end of the document so that
# the esgrammar grammar can use newlines as delimiters. :-P
text += "\n"
terminal_names = frozenset(terminal_names)
if synthetic_terminals is None:
synthetic_terminals = {}
builder = ESGrammarBuilder(terminal_names)
parser = ESGrammarParser(builder=builder, goal="grammar")
lexer = ESGrammarLexer(parser, filename=filename)
lexer.write(text)
nt_defs = lexer.close()
grammar_extensions = []
for ext_filename, start_lineno, content in extensions:
builder.reset()
parser = ESGrammarParser(builder=builder, goal="rust_edsl")
lexer = ESGrammarLexer(parser, filename=ext_filename)
builder.lexer = lexer
lexer.start_lineno = start_lineno
lexer.write(content)
result = lexer.close()
grammar_extensions.append(result)
if goals is None:
# Default to the first nonterminal in the input.
goals = [nt_defs[0][0]]
return finish_grammar(
nt_defs,
goals=goals,
variable_terminals=terminal_names - frozenset(synthetic_terminals),
synthetic_terminals=synthetic_terminals,
single_grammar=single_grammar,
extensions=grammar_extensions)
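# Example usage (a sketch, not exercised in this module): the grammar text
# below is a made-up fragment in the spec's style; the exact surface syntax
# accepted is defined by esgrammar.pgen, so treat this only as an illustration
# of the entry point's signature.
def _example_parse_esgrammar():
    source = (
        "Program :\n"
        "  StatementList\n"
        "\n"
        "StatementList :\n"
        "  `;`\n"
        "  StatementList `;`\n"
    )
    return parse_esgrammar(source, filename="<example>", goals=["Program"])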