"""Vague approximation of an ECMAScript lexer.
A parser has two levels: the *lexer* scans the source text to produce tokens;
the *parser* consumes tokens and produces ASTs.
In a traditional design, the parser drives the process. It *pulls* one token at
a time from the lexer. However, for a parser that can accept arbitrary slabs of
data, scan them, then keep going, it makes more sense for the user to feed
those slabs to the lexer, which then *pushes* tokens to the parser. So that's
what we do.
Usage:
from js_parser.lexer import JSLexer
from js_parser.parser import JSParser
lexer = JSLexer(JSParser())
lexer.write(some_source_text)
lexer.write(some_more_source_text)
ast = lexer.close()
"""
import re
import jsparagus.lexer
def _get_punctuators():
punctuators = '''
&&= ||= ??=
{ ( ) [ ] . ... ; , < > <= >= == != === !== + - * % ** ++ --
<< >> >>> & | ^ ! ~ && || ? : = += -= *= %=
        **= <<= >>= >>>= &= |= ^= =>
'''.split()
return '|'.join(
re.escape(token)
for token in sorted(punctuators, key=len, reverse=True))
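# The longest-first ordering matters because Python's `re` alternation is
# leftmost-wins. A sketch (illustrative only, not shipped as a doctest):
#
#     >>> re.match(_get_punctuators(), '>>>=').group()
#     '>>>='
#
# Sorted shortest-first instead, '>' would win and '>>>=' would lex as four
# separate tokens.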
TOKEN_RE = re.compile(r'''(?x)
(?:
# WhiteSpace
[\ \t\v\r\n\u00a0\u2028\u2029\ufeff]
# SingleLineComment
| // [^\r\n\u2028\u2029]* (?= [\r\n\u2028\u2029] | \Z )
# MultiLineComment
| /\* (?: [^*] | \*+[^/] )* \*+/
)*
(
# Incomplete MultiLineComment
/\* (?: [^*] | \*+[^/] )* \**
| # Incomplete SingleLineComment
// [^\r\n\u2028\u2029]*
| # IdentifierName
(?: [$_A-Za-z] | \\ u [0-9A-Fa-f]{4} | \\ u \{ [0-9A-Fa-f]+ \})
(?: [$_0-9A-Za-z] | \\ u [0-9A-Fa-f]{4} | \\ u \{ [0-9A-Fa-f]+ \})*
| # NumericLiteral
[0-9][0-9A-Za-z]*(?:\.[0-9A-Za-z]*)?
| \.[0-9][0-9A-Za-z]*
| # Punctuator
<INSERT_PUNCTUATORS>
| # The slash special case
/
| # The curly brace special case
}
| # StringLiteral
'
# SingleStringCharacters
(?:
# SourceCharacter but not one of ' or \\ or LineTerminator
# but also allow LINE SEPARATOR or PARAGRAPH SEPARATOR
[^'\\\r\n]
| \\ [^0-9xu\r\n\u2028\u2029] # CharacterEscapeSequence
| \\ x [0-9A-Fa-f]{2} # HexEscapeSequence
| \\ u [0-9A-Fa-f]{4} # UnicodeEscapeSequence
| \\ u \{ [0-9A-Fa-f]+ \}
| \\\r\n? # LineContinuation
| \\[\n\u2028\u2029]
)*
'
| "
# DoubleStringCharacters
(?:
# SourceCharacter but not one of " or \\ or LineTerminator
# but also allow LINE SEPARATOR or PARAGRAPH SEPARATOR
[^"\\\r\n]
| \\ [^0-9xu\r\n\u2028\u2029] # CharacterEscapeSequence
| \\ x [0-9A-Fa-f]{2} # HexEscapeSequence
| \\ u [0-9A-Fa-f]{4} # UnicodeEscapeSequence
| \\ u \{ [0-9A-Fa-f]+ \}
| \\\r\n? # LineContinuation
| \\[\n\u2028\u2029]
)*
"
| # Template
` (?: [^`\\$] | \\. )* (?: \${ | ` )
| # illegal character or end of input (this branch matches no characters)
)
'''.replace("<INSERT_PUNCTUATORS>", _get_punctuators()))
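# How TOKEN_RE is used, as a sketch (hypothetical snippet mirroring _match
# below): the non-capturing prefix swallows WhiteSpace and comments, so
# group(1) is the token itself and start(1) is where it begins.
#
#     >>> TOKEN_RE.match('  // note\n  foo(', 0).group(1)
#     'foo'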
DIV_RE = re.compile(r'(/=?)')
REGEXP_RE = re.compile(r'''(?x)
(
/
(?:
# RegularExpressionFirstChar - implemented using
# RegularExpressionChars on the theory that we have already
# ruled out the possibility of a comment.
# RegularExpressionChars
(?:
# RegularExpressionNonTerminator but not one of \\ or / or [
[^/\\\[\r\n\u2028\u2029]
| # RegularExpressionBackslashSequence
\\ [^\r\n\u2028\u2029]
| # RegularExpressionClass
\[
# RegularExpressionClassChars
(?:
# RegularExpressionNonTerminator but not one of ] or \\
[^]\\\r\n\u2028\u2029]
| # RegularExpressionBackslashSequence
\\ [^\r\n\u2028\u2029]
)*
\]
)+
)
/
(?: \w* )
)
''')
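# DIV_RE and REGEXP_RE both start at a '/', and the right choice is not
# decidable from the text alone. A sketch of the ambiguity (resolved in
# JSLexer._match by asking the parser what it can accept):
#
#     y = a / b / c;     // both slashes are division: DIV_RE
#     y = /b/g.test(s);  // '/b/g' is one token: REGEXP_RE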
# Words that never match Identifier. (`await` and `yield` nonetheless
# conditionally match IdentifierReference, BindingIdentifier, and
# LabelIdentifier.)
#
# Technically the term for these is "reserved word", not "keyword", but
# whatever.
ECMASCRIPT_FULL_KEYWORDS = [
'await',
'break',
'case',
'catch',
'class',
'const',
'continue',
'debugger',
'default',
'delete',
'do',
'else',
'enum',
'export',
'extends',
    'false',
    'finally',
'for',
'function',
'if',
'import',
'in',
'instanceof',
'new',
'null',
'return',
'super',
'switch',
'this',
'throw',
'true',
'try',
'typeof',
'var',
'void',
'while',
'with',
'yield',
]
ECMASCRIPT_CONDITIONAL_KEYWORDS = [
# Words that are identifiers except in strict mode
'let', # this one is also banned at the beginning of an ExpressionStatement
'static',
'implements',
'interface',
'package',
'private',
'protected',
'public',
# Words that are always allowed as identifiers, but are also keywords in
# other contexts.
'as',
'async',
'from',
'get',
'of',
'set',
'target',
]
# Technically this set includes a reserved word that isn't currently being used
# as a keyword in the grammar: `enum`.
ALL_KEYWORDS = set(ECMASCRIPT_FULL_KEYWORDS + ECMASCRIPT_CONDITIONAL_KEYWORDS)
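# Note (illustrative): conditional keywords still reach the parser as their
# own terminals. `let` in `let x = 0;` and in sloppy-mode `var let;` is the
# same terminal 'let' here; the grammar's productions, not the lexer, decide
# whether it acts as a keyword or as an identifier in context.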
class JSLexer(jsparagus.lexer.FlatStringLexer):
"""Vague approximation of an ECMAScript lexer. """
def __init__(self, parser, filename=None):
super().__init__(parser, filename)
def _match(self, closing):
match = TOKEN_RE.match(self.src, self.point)
assert match is not None
if match.end() == len(self.src) and not closing:
# The current token runs right up against the end of the current
# chunk of source and thus might continue in the next chunk. Do not
# move self.point.
return None
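            # For example (sketch): after write('x +'), the trailing '+'
            # could be the start of '+=' or '++', so no '+' token is emitted
            # until more text (or close()) arrives.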
token = match.group(1)
if token == '':
# Whitespace followed by end of input or illegal character.
if match.end() == len(self.src):
# End of input. Success!
assert closing
self.point = match.end()
return None
else:
c = self.src[match.end()]
self.throw("unexpected character: {!r}".format(c))
c = token[0]
t = None
        # A '.' starts a NumericLiteral only when a digit follows; plain
        # '.' and '...' are punctuators.
        if c.isdigit() or (c == '.' and token[1:2].isdigit()):
t = 'NumericLiteral'
elif c.isalpha() or c in '$_':
if token in ALL_KEYWORDS: # TODO support strict mode
if token == 'null':
t = 'NullLiteral'
elif token in ('true', 'false'):
t = 'BooleanLiteral'
else:
t = token
else:
t = 'Name'
elif c == '/':
if token.startswith(('/*', '//')):
# Incomplete comment. (In non-closing mode, this is handled
# above, immediately after the match.)
assert match.end() == len(self.src)
assert closing
self.point = len(self.src)
self.throw("incomplete comment at end of source")
# We choose RegExp vs. division based on what the parser can
# accept, a literal implementation of the spec.
#
# To make this correct in combination with end-of-line ASI, make
# the parser rewind the lexer one token and ask for it again in
# that case, so that the lexer asks the can-accept question again.
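        # For example (sketch): in `a = b / c` the parser, having just seen
        # a complete expression, cannot accept a RegularExpressionLiteral,
        # so the '/' is division; in `a = /c/` it can, so REGEXP_RE is used.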
point = match.start(1)
if self.parser.can_accept_terminal(self, 'RegularExpressionLiteral'):
match = REGEXP_RE.match(self.src, point)
if match is None:
if closing:
self.throw("unterminated regexp literal")
else:
return None
token = 'RegularExpressionLiteral'
else:
match = DIV_RE.match(self.src, point)
token = match.group(1)
if not closing and match.end() == len(self.src):
# At the end of a chunk, `/a*b/` could be the start of
# `/a*b/g`, and `/` could be the start of `/=`.
return None
t = token
elif c == '`':
if token.endswith('`'):
t = 'NoSubstitutionTemplate'
else:
t = 'TemplateHead'
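            # E.g. (sketch): `done` (backtick to backtick) is a
            # NoSubstitutionTemplate, while `n = ${ begins a TemplateHead.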
elif c == '"' or c == "'":
t = 'StringLiteral'
elif c == '}':
# TODO: TemplateTail
t = token
elif c in '{()[];,~?:.<>=!+-*%&|^':
t = token
else:
assert False
self._current_match = match
self.previous_token_end = self.point
self.current_token_start = match.start(1)
self.point = match.end()
return t
    def take(self):
        """Return the source text of the current token."""
        return self._current_match.group(1)
def saw_line_terminator(self):
"""True if there's a LineTerminator before the current token."""
i = self.previous_token_end
j = self.current_token_start
ws_between = self.src[i:j]
return any(c in ws_between for c in '\r\n\u2028\u2029')
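        # E.g. (sketch): lexing 'return\nx', the slice between the 'return'
        # and 'x' tokens contains '\n', so this returns True and the parser
        # may apply automatic semicolon insertion.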
    def can_close(self):
        """True if the rest of the input is only whitespace and comments, and
        the parser is in a state where the input is allowed to end."""
        match = TOKEN_RE.match(self.src, self.point)
        return match.group(1) == '' and self.parser.can_close()
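

if __name__ == '__main__':
    # Smoke-test sketch mirroring the module docstring. It assumes the
    # generated js_parser.parser module is importable; nothing here is part
    # of the lexer's API.
    from js_parser.parser import JSParser
    lexer = JSLexer(JSParser())
    lexer.write('var x = ')   # a token may straddle this chunk boundary
    lexer.write('1;\n')
    print(lexer.close())      # prints the resulting AST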