""" Lexical analysis is the breaking of a string into tokens. """
import re
import linecache
from builtins import SyntaxError as BaseSyntaxError
class SyntaxError(BaseSyntaxError):
pass
class UnexpectedEndError(SyntaxError):
pass


class LexicalGrammar:
    """Quick and dirty lexer implementation.

    In order to support multi-part lexing (multiple calls to .write()),
    both 1. the `ignore` regular expression; and 2. the union of the family of
    regular expressions given by `tokens` and `regexps`; must have the
    following property: if they match a string s, they also match every prefix
    of that string.

    This requirement is not enforced by assertions; if it's not met, the
    tokenizer will just have bugs when sent multiple chunks of data.
    """

    def __init__(self, tokens, ignore=r'[ \t]*', **regexps):
        def token_to_re(token):
            s = re.escape(token)
            if s.isalpha():
                s += r'\b'
            return s

        token_list = sorted(tokens.split(), key=len, reverse=True)
        self.ignore_re = re.compile(ignore)
        self.token_re = re.compile("|".join(token_to_re(token) for token in token_list))
        self.parser_pairs = [(k, re.compile(v)) for k, v in regexps.items()]

    def __call__(self, parser, filename=None):
        return Tokenizer(self, parser, filename)
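

# A note on the prefix requirement above, with hypothetical patterns (not part
# of the original module): a regexp such as r'[0-9]+' qualifies, because every
# nonempty prefix of a match like "123" ("1", "12") is itself a match. A
# quoted-string pattern such as r'"[^"]*"' does not qualify: the prefix '"ab'
# of a match lacks the closing quote and no longer matches, so a string split
# across two .write() calls would be mis-tokenized. See the runnable sketch at
# the bottom of this file for how chunked input is handled in practice.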


class FlatStringLexer:
    def __init__(self, parser, filename=None):
        self.parser = parser
        self.src = ''
        self.previous_token_end = 0
        self.current_token_start = 0
        self.start_lineno = 1
        self.start_column = 0
        self.point = 0
        self.filename = filename
        self.closed = False

    def write(self, text):
        assert not self.closed
        self.src += text
        self._drain()

    def close(self):
        assert not self.closed
        self.closed = True
        self._drain()
        assert self.src == ''
        return self.parser.close(self)
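
    # The `parser` object's interface, as inferred from the calls in this
    # class and in _drain() below: it must provide write_terminal(lexer,
    # terminal_id), called once per token, and close(lexer), whose return
    # value becomes the return value of close() above. A minimal example
    # parser is sketched at the bottom of this file.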

    def _drain(self):
        assert self.previous_token_end == 0
        assert self.current_token_start == 0
        assert self.point == 0
        closing = self.closed

        terminal_id = self._match(closing)
        while terminal_id is not None:
            self.parser.write_terminal(self, terminal_id)
            terminal_id = self._match(closing)

        # Update position info.
        discarded_text = self.src[:self.point]
        newline_count = discarded_text.count('\n')
        self.start_lineno += newline_count
        if newline_count > 0:
            # Column of self.point within its line: characters past the last
            # newline in the discarded text (columns are 0-based, matching
            # current_token_position below).
            self.start_column = self.point - (discarded_text.rindex('\n') + 1)
        else:
            self.start_column += self.point

        # Drop the parsed text and reset counters. Note that setting
        # self.previous_token_end to 0 really is correct. Setting
        # self.current_token_start to 0 is as good as anything else, because
        # there is no current token.
        self.src = self.src[self.point:]
        self.point = 0
        self.previous_token_end = 0
        self.current_token_start = 0

    def current_token_position(self):
        src_pre = self.src[:self.current_token_start]
        lineno = self.start_lineno + src_pre.count("\n")
        if '\n' in src_pre:
            line_start_index = src_pre.rfind("\n") + 1
            column = self.current_token_start - line_start_index  # can be zero
        else:
            column = self.start_column + self.current_token_start
        return lineno, column
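
    # Worked example (hypothetical values, for illustration only): if the
    # retained buffer starts at line 3, column 4 (start_lineno=3,
    # start_column=4) and src_pre is "xy\nz" (current_token_start=4), then
    # lineno = 3 + 1 = 4 and column = 4 - 3 = 1. start_lineno/start_column
    # anchor the retained buffer within the original input, so the result is
    # an absolute position; column is 0-based.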

    def current_line(self):
        # OK, this is gruesome, but we return the current line if we have the
        # whole thing and otherwise we ... try loading it from disk.
        if '\n' in self.src[:self.current_token_start]:
            line_start = self.src.rindex('\n', 0, self.current_token_start) + 1
        elif self.start_column == 0:
            line_start = 0
        else:
            line_start = -1

        if line_start != -1:
            line_end = self.src.find('\n', line_start)
            if line_end == -1:
                if self.closed:
                    return self.src[line_start:] + '\n'
            else:
                return self.src[line_start:line_end] + '\n'

        # Fallback case. Python's linecache.getline() deliberately silences all
        # errors.
        lineno = self.current_token_position()[0]
        return linecache.getline(self.filename, lineno)

    def throw(self, msg_or_exception):
        lineno, column = self.current_token_position()
        if isinstance(msg_or_exception, Exception):
            e = msg_or_exception
            e.filename = self.filename
            e.lineno = lineno
            e.offset = column + 1
        else:
            # Apparently this is the secret handshake to create a Python
            # SyntaxError and get a good error message when Python prints it.
            line = self.current_line()
            args = (self.filename, lineno, column + 1, line)
            e = SyntaxError(msg_or_exception, args)
        raise e
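
    # current_token_position() returns a 0-based column, while SyntaxError's
    # `offset` field is 1-based, hence the `column + 1` above. The
    # (filename, lineno, offset, text) tuple is the standard detail format
    # that lets Python's traceback printer show the offending source line.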

    def throw_unexpected_end(self):
        self.throw(UnexpectedEndError("unexpected end of input"))


class Tokenizer(FlatStringLexer):
    def __init__(self, lexical_grammar, parser, filename=None):
        super().__init__(parser, filename)
        self.ignore_re = lexical_grammar.ignore_re
        self.token_re = lexical_grammar.token_re
        self.parser_pairs = lexical_grammar.parser_pairs
        self.src = ''
        self.filename = filename
        self.last_point = 0
        self.point = 0
        self._current_match = None

    def take(self):
        return self._current_match.group()

    def saw_line_terminator(self):
        """True if there's a LineTerminator before the current token."""
        i = self.previous_token_end
        j = self.current_token_start
        ws_between = self.src[i:j]
        return any(c in ws_between for c in '\r\n\u2028\u2029')

    def _match(self, closing):
        # Advance over text matching ignore_re.
        ignore_match = self.ignore_re.match(self.src, self.point)
        if ignore_match is None:
            raise ValueError("ignore_re should always match")
        point = ignore_match.end()
        if point == len(self.src):
            if closing:
                self.point = point
            self._current_match = None
            return None

        # Try the token_re.
        token_match = self.token_re.match(self.src, point)

        # Try all the parser_pairs.
        for name, pattern in self.parser_pairs:
            match = pattern.match(self.src, point)
            if match is not None:
                break
        else:
            name = match = None

        if match is not None and token_match is not None and match.end() > token_match.end():
            pass
        elif token_match is not None:
            name, match = token_match.group(0), token_match
        elif match is not None:
            pass
        else:
            self.throw("unexpected characters {!r}"
                       .format(self.src[point:point + 12]))

        # But how do we know subsequent .write() calls won't provide more text,
        # extending this token? Here we take advantage of the odd requirement
        # LexicalGrammar imposes on its users. Every prefix of a match is a
        # match. So if this hypothetical "extended" token would match, then the
        # entire remainder of self.src is a match.
        if not closing and match.end() == len(self.src):
            # This token might be extensible. Refuse to match.
            self._current_match = None
            return None

        # This token definitely is not extensible.
        self.previous_token_end = self.point
        self.current_token_start = match.start()
        self.point = match.end()
        self._current_match = match
        return name
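

if __name__ == '__main__':
    # Minimal usage sketch. The grammar and _EchoParser below are illustrative
    # only, not part of the project's real parsers; the parser protocol
    # (write_terminal / close) is inferred from the calls FlatStringLexer
    # makes above.
    class _EchoParser:
        """Collects (terminal_id, text) pairs instead of parsing them."""

        def __init__(self):
            self.terminals = []

        def write_terminal(self, lexer, terminal_id):
            self.terminals.append((terminal_id, lexer.take()))

        def close(self, lexer):
            return self.terminals

    arith = LexicalGrammar("+ - * / ( )", NUM=r'[0-9]+')
    lexer = arith(_EchoParser())
    lexer.write("12 + 3")        # "3" is held back: a later chunk might extend it
    lexer.write("4 * (5 - 6)")   # ...and indeed it becomes part of NUM "34"
    print(lexer.close())
    # Prints (wrapped here for readability):
    #   [('NUM', '12'), ('+', '+'), ('NUM', '34'), ('*', '*'),
    #    ('(', '('), ('NUM', '5'), ('-', '-'), ('NUM', '6'), (')', ')')]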