Source code

Revision control

Copy as Markdown

Other Tools

""" Functions for loading the ECMAScript lexical and syntactic grammars. """
from jsparagus.ordered import OrderedSet, OrderedFrozenSet
from jsparagus import gen, grammar
from .lexer import ECMASCRIPT_FULL_KEYWORDS, ECMASCRIPT_CONDITIONAL_KEYWORDS
from .parse_esgrammar import parse_esgrammar
# Synthetic terminals supplied to the parser when loading the lexical grammar.
ECMASCRIPT_LEXICAL_SYNTHETIC_TERMINALS: grammar.SyntheticTerminalsDict = {
    # In theory `SourceCharacter` is the set of all Unicode characters.
    # Building that set would take a lot of memory, and in practice the set
    # is not used, so it is left empty.
    'SourceCharacter': OrderedFrozenSet([]),
}
# Goal nonterminals passed to the parser when loading the lexical grammar.
ECMASCRIPT_LEXICAL_GOAL_NTS = [
    'WhiteSpace',
    'InputElementDiv',
    'InputElementRegExp',
]
def load_lexical_grammar(filename):
    """Read `filename` and build the ECMAScript lexical grammar from it.

    The file's text is handed to `parse_esgrammar` together with the lexical
    goal nonterminals and synthetic terminals declared in this module. The
    parsed grammar is then passed through
    `gen.expand_parameterized_nonterminals`, and that result is returned.
    """
    with open(filename) as source_file:
        source_text = source_file.read()

    # The synthetic-terminal table supplies both the sets themselves and,
    # through its keys, the terminal names.
    synthetic = ECMASCRIPT_LEXICAL_SYNTHETIC_TERMINALS
    parsed = parse_esgrammar(
        source_text,
        filename=filename,
        goals=ECMASCRIPT_LEXICAL_GOAL_NTS,
        synthetic_terminals=synthetic,
        terminal_names=synthetic.keys(),
    )
    return gen.expand_parameterized_nonterminals(parsed)
# Goal nonterminals passed to the parser when loading the syntactic grammar.
ECMASCRIPT_SYNTACTIC_GOAL_NTS = [
    'Script',
    'Module',
    # Currently disabled as goals:
    # 'FormalParameters',
    # 'FunctionBody',
]
# Identifiers need special care. A "synthetic terminal" is a shorthand symbol
# standing for any one terminal out of a set. *IdentifierName*, for example,
# stands for any token that looks like an identifier, keywords included.
#
# The members of these sets must be the terminal names the lexer produces.
# Apart from `Name`, the lexer output uses the terminal symbols of the
# syntactic grammar, some of which are nonterminals of the lexical grammar.
# In particular, the syntactic grammar uses `BooleanLiteral` rather than
# `true` and `false`, and `NullLiteral` rather than `null`.
ECMASCRIPT_SYNTHETIC_TERMINALS = {
    # Everything identifier-shaped: plain names, the literal terminals, and
    # all keywords. `true`, `false` and `null` are subtracted afterwards;
    # per the note above, the literal terminals are used in their place.
    'IdentifierName': OrderedSet([
        'Name',
        'BooleanLiteral',
        'NullLiteral',
        'NameWithEscape',
        *ECMASCRIPT_FULL_KEYWORDS,
        *ECMASCRIPT_CONDITIONAL_KEYWORDS,
    ]) - OrderedSet(['true', 'false', 'null']),

    # Plain names plus the conditional keywords only.
    'Identifier': OrderedSet([
        'Name',
        'NameWithEscape',
        *ECMASCRIPT_CONDITIONAL_KEYWORDS,
    ]),
}
# Nonterminals of the lexical grammar that the syntactic grammar treats as
# terminals.
ECMASCRIPT_TOKEN_NAMES = [
    'BooleanLiteral',
    'IdentifierName',
    'PrivateIdentifier',
    'NoSubstitutionTemplate',
    'NullLiteral',
    'NumericLiteral',
    'BigIntLiteral',
    'RegularExpressionLiteral',
    'StringLiteral',
    'TemplateHead',
    'TemplateMiddle',
    'TemplateTail',
]

# Every terminal, keywords excepted, that our (hand-coded) lexer produces.
#
# (The lexer's treatment of IdentifierName and friends is a slight variation
# on the spec. See `ECMASCRIPT_SYNTHETIC_TERMINALS` above.)
TERMINAL_NAMES_FOR_SYNTACTIC_GRAMMAR = [
    *ECMASCRIPT_TOKEN_NAMES,
    'Identifier',
    'Name',
]
def _extract_grammar_extensions(ext_filename):
    """Collect the bodies of the `grammar_extension!` macros in one file.

    A macro starts on a line beginning with `grammar_extension!` (which must
    end with `{`) and ends at the next line beginning with `}`. Returns a
    list of `(ext_filename, start_line, content)` tuples, where `start_line`
    is the 1-based line number of the first body line and `content` is the
    text of the lines in between. Macros with an empty body are skipped. A
    macro that is still open at end of file is dropped.
    """
    found = []
    # List of body lines while inside a macro, None while outside one. An
    # explicit sentinel is needed: testing the truthiness of the text
    # collected so far cannot tell "inside an empty macro" from "outside",
    # which made the closing brace of an empty macro go unnoticed and the
    # rest of the file get collected as grammar text.
    body_lines = None
    start_line = 0
    with open(ext_filename) as ext_file:
        for lineno, line in enumerate(ext_file):
            if line.startswith("grammar_extension!"):
                assert line.endswith("{\n"), (
                    f"{ext_filename}:{lineno + 1}: "
                    "grammar_extension! line must end with '{'")
                body_lines = []
                # +2: enumerate starts at 0, while the first line is 1.
                # Also, the first line added to the body is the next one.
                start_line = lineno + 2
                continue
            if body_lines is None:
                # Outside any macro: nothing to collect.
                continue
            if line.startswith("}"):
                if body_lines:
                    found.append(
                        (ext_filename, start_line, "".join(body_lines)))
                body_lines = None
                continue
            body_lines.append(line)
    return found


def load_syntactic_grammar(filename, extensions):
    """Load the ECMAScript syntactic grammar.

    `filename` is the path of the grammar file. `extensions` is an iterable
    of paths to files that are scanned for `grammar_extension!` macro
    blocks; the content of each block is passed to `parse_esgrammar` along
    with the file name and starting line number it came from.

    Returns the grammar produced by `parse_esgrammar`.
    """
    with open(filename) as f:
        grammar_text = f.read()

    extensions_content = []
    for ext_filename in extensions:
        extensions_content.extend(_extract_grammar_extensions(ext_filename))

    g = parse_esgrammar(
        grammar_text,
        filename=filename,
        extensions=extensions_content,
        goals=ECMASCRIPT_SYNTACTIC_GOAL_NTS,
        synthetic_terminals=ECMASCRIPT_SYNTHETIC_TERMINALS,
        terminal_names=TERMINAL_NAMES_FOR_SYNTACTIC_GRAMMAR)
    return g