# load_es_grammar.py

""" Functions for loading the ECMAScript lexical and syntactic grammars. """

from jsparagus.ordered import OrderedSet, OrderedFrozenSet

from jsparagus import gen, grammar

from .lexer import ECMASCRIPT_FULL_KEYWORDS, ECMASCRIPT_CONDITIONAL_KEYWORDS

from .parse_esgrammar import parse_esgrammar

# Synthetic terminals of the lexical grammar: each key is a shorthand symbol
# mapped to the set of terminals it stands for.
ECMASCRIPT_LEXICAL_SYNTHETIC_TERMINALS: grammar.SyntheticTerminalsDict = {
    # Theoretically, this should be the set of all Unicode characters, but that
    # would take a lot of memory, and in practice, the set is not used.
    'SourceCharacter': OrderedFrozenSet([]),
}

# Goal nonterminals passed to the parser when loading the lexical grammar.
ECMASCRIPT_LEXICAL_GOAL_NTS = [
    'WhiteSpace',
    'InputElementDiv',
    'InputElementRegExp',
]

def load_lexical_grammar(filename):
    """Load the ECMAScript lexical grammar.

    Reads the grammar text from `filename`, parses it with `parse_esgrammar`
    using the lexical goal nonterminals and synthetic terminals defined in
    this module, and returns the result of passing the parsed grammar through
    `gen.expand_parameterized_nonterminals`.
    """
    # Decode explicitly as UTF-8 so that loading does not depend on the
    # platform's locale encoding.
    with open(filename, encoding="utf-8") as f:
        grammar_text = f.read()

    g = parse_esgrammar(
        grammar_text,
        filename=filename,
        goals=ECMASCRIPT_LEXICAL_GOAL_NTS,
        synthetic_terminals=ECMASCRIPT_LEXICAL_SYNTHETIC_TERMINALS,
        # The only terminal names of the lexical grammar are the synthetic
        # ones declared above.
        terminal_names=ECMASCRIPT_LEXICAL_SYNTHETIC_TERMINALS.keys())
    return gen.expand_parameterized_nonterminals(g)

# Goal nonterminals passed to the parser when loading the syntactic grammar.
ECMASCRIPT_SYNTACTIC_GOAL_NTS = [
    'Script',
    'Module',
    # 'FormalParameters',
    # 'FunctionBody',
]

# Identifiers are complicated. A "synthetic terminal" is a shorthand symbol
# that stands for any one of a set of terminals. For example, *IdentifierName*
# stands for any token that looks like an identifier, including keywords.
#
# These sets must use the names of the terminals produced by the lexer.  Except
# for `Name`, our lexer output uses the terminal symbols of the syntactic
# grammar, which include some nonterminals of the lexical grammar. The
# syntactic grammar uses `BooleanLiteral`, not `true` and `false`; and it uses
# `NullLiteral` instead of `null`.
ECMASCRIPT_SYNTHETIC_TERMINALS = {
    # Every identifier-like token, keywords included; `true`, `false` and
    # `null` are removed from the set because `BooleanLiteral` and
    # `NullLiteral` are listed in their place.
    'IdentifierName': OrderedSet([
        'Name',
        'BooleanLiteral',
        'NullLiteral',
        'NameWithEscape',
        *ECMASCRIPT_FULL_KEYWORDS,
        *ECMASCRIPT_CONDITIONAL_KEYWORDS,
    ]) - OrderedSet(['true', 'false', 'null']),
    'Identifier': OrderedSet([
        'Name',
        'NameWithEscape',
        *ECMASCRIPT_CONDITIONAL_KEYWORDS,
    ]),
}

# Lexical nonterminals that are used as terminals in the syntactic grammar.
ECMASCRIPT_TOKEN_NAMES = [
    'BooleanLiteral',
    'IdentifierName',
    'PrivateIdentifier',
    'NoSubstitutionTemplate',
    'NullLiteral',
    'NumericLiteral',
    'BigIntLiteral',
    'RegularExpressionLiteral',
    'StringLiteral',
    'TemplateHead',
    'TemplateMiddle',
    'TemplateTail',
]

# List of all terminals, other than keywords, that our (hand-coded) lexer
# produces.
#
# (What our lexer implements for IdentifierName and friends is a slight
# variation on the spec. See `ECMASCRIPT_SYNTHETIC_TERMINALS` above.)
TERMINAL_NAMES_FOR_SYNTACTIC_GRAMMAR = ECMASCRIPT_TOKEN_NAMES + [
    'Identifier',
    'Name',
]

def _read_grammar_extensions(ext_filename):
    """Extract the bodies of all `grammar_extension!` macros in one file.

    A macro starts on a line beginning with `grammar_extension!` (which must
    end with `{`) and ends at the next line beginning with `}`.

    Returns a list of `(ext_filename, start_line, content)` tuples, where
    `start_line` is the 1-based line number of the first body line and
    `content` is the text of the lines between the opening and closing lines.
    Macros with an empty body are skipped.
    """
    found = []
    # Lines of the macro currently being collected; None outside a macro.
    body = None
    start_line = 0
    with open(ext_filename, encoding="utf-8") as ext_file:
        for lineno, line in enumerate(ext_file):
            if line.startswith("grammar_extension!"):
                assert line.endswith("{\n"), (
                    "grammar_extension! line must end with '{': "
                    f"{ext_filename}:{lineno + 1}")
                body = []
                # +2: enumerate starts at 0, while the first line is 1.
                # Also, the first line added to the body is the next one.
                start_line = lineno + 2
                continue

            if body is None:
                # Outside any macro: nothing to collect.
                continue

            if line.startswith("}"):
                # Test "inside a macro" with `is not None`, not truthiness:
                # an empty body must still be closed here, otherwise the
                # closing brace and everything after it would be collected
                # as content.
                if body:
                    found.append((ext_filename, start_line, "".join(body)))
                body = None
                continue

            body.append(line)
    return found


def load_syntactic_grammar(filename, extensions):
    """Load the ECMAScript syntactic grammar.

    `filename` is the path of the grammar file. `extensions` is an iterable
    of paths of files that are scanned for `grammar_extension!` macros; the
    content of each macro is handed to `parse_esgrammar` together with the
    file name and line number it came from.

    Returns the grammar produced by `parse_esgrammar`.
    """
    # Decode explicitly as UTF-8 so that loading does not depend on the
    # platform's locale encoding.
    with open(filename, encoding="utf-8") as f:
        grammar_text = f.read()

    extensions_content = []
    for ext_filename in extensions:
        extensions_content.extend(_read_grammar_extensions(ext_filename))

    g = parse_esgrammar(
        grammar_text,
        filename=filename,
        extensions=extensions_content,
        goals=ECMASCRIPT_SYNTACTIC_GOAL_NTS,
        synthetic_terminals=ECMASCRIPT_SYNTHETIC_TERMINALS,
        terminal_names=TERMINAL_NAMES_FOR_SYNTACTIC_GRAMMAR)
    return g