""" Lexical analysis is the breaking of a string into tokens. """
import re
import linecache
from builtins import SyntaxError as BaseSyntaxError
class SyntaxError(BaseSyntaxError):
pass
class UnexpectedEndError(SyntaxError):
pass


class LexicalGrammar:
    """Quick and dirty lexer implementation.

    In order to support multi-part lexing (multiple calls to .write()),
    both 1. the `ignore` regular expression; and 2. the union of the family of
    regular expressions given by `tokens` and `regexps`; must have the
    following property: if they match a string s, they also match every prefix
    of that string.

    This requirement is not enforced by assertions; if it's not met, the
    tokenizer will just have bugs when sent multiple chunks of data.
    """

    def __init__(self, tokens, ignore=r'[ \t]*', **regexps):
        def token_to_re(token):
            s = re.escape(token)
            if s.isalpha():
                s += r'\b'
            return s

        token_list = sorted(tokens.split(), key=len, reverse=True)
        self.ignore_re = re.compile(ignore)
        self.token_re = re.compile("|".join(token_to_re(token) for token in token_list))
        self.parser_pairs = [(k, re.compile(v)) for k, v in regexps.items()]

    def __call__(self, parser, filename=None):
        return Tokenizer(self, parser, filename)
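

# A note on the prefix requirement above, with hypothetical patterns (not part
# of the original module): a regexp such as r'[0-9]+' qualifies, because every
# nonempty prefix of a match like "123" ("1", "12") is itself a match. A
# quoted-string pattern such as r'"[^"]*"' does not qualify: the prefix '"ab'
# of a match lacks the closing quote and no longer matches, so a string split
# across two .write() calls would be mis-tokenized. See the runnable sketch at
# the bottom of this file for how chunked input is handled in practice.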


class FlatStringLexer:
    def __init__(self, parser, filename=None):
        self.parser = parser
        self.src = ''
        self.previous_token_end = 0
        self.current_token_start = 0
        self.start_lineno = 1
        self.start_column = 0
        self.point = 0
        self.filename = filename
        self.closed = False

    def write(self, text):
        assert not self.closed
        self.src += text
        self._drain()

    def close(self):
        assert not self.closed
        self.closed = True
        self._drain()
        assert self.src == ''
        return self.parser.close(self)
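
    # The `parser` object's interface, as inferred from the calls in this
    # class and in _drain() below: it must provide write_terminal(lexer,
    # terminal_id), called once per token, and close(lexer), whose return
    # value becomes the return value of close() above. A minimal example
    # parser is sketched at the bottom of this file.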

    def _drain(self):
        assert self.previous_token_end == 0
        assert self.current_token_start == 0
        assert self.point == 0
        closing = self.closed

        terminal_id = self._match(closing)
        while terminal_id is not None:
            self.parser.write_terminal(self, terminal_id)
            terminal_id = self._match(closing)

        # Update position info.
        discarded_text = self.src[:self.point]
        newline_count = discarded_text.count('\n')
        self.start_lineno += newline_count
        if newline_count > 0:
            # Column of self.point within its line: characters past the last
            # newline in the discarded text (columns are 0-based, matching
            # current_token_position below).
            self.start_column = self.point - (discarded_text.rindex('\n') + 1)
        else:
            self.start_column += self.point

        # Drop the parsed text and reset counters. Note that setting
        # self.previous_token_end to 0 really is correct. Setting
        # self.current_token_start to 0 is as good as anything else, because
        # there is no current token.
        self.src = self.src[self.point:]
        self.point = 0
        self.previous_token_end = 0
        self.current_token_start = 0

    def current_token_position(self):
        src_pre = self.src[:self.current_token_start]
        lineno = self.start_lineno + src_pre.count("\n")
        if '\n' in src_pre:
            line_start_index = src_pre.rfind("\n") + 1
            column = self.current_token_start - line_start_index  # can be zero
        else:
            column = self.start_column + self.current_token_start
        return lineno, column
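
    # Worked example (hypothetical values, for illustration only): if the
    # retained buffer starts at line 3, column 4 (start_lineno=3,
    # start_column=4) and src_pre is "xy\nz" (current_token_start=4), then
    # lineno = 3 + 1 = 4 and column = 4 - 3 = 1. start_lineno/start_column
    # anchor the retained buffer within the original input, so the result is
    # an absolute position; column is 0-based.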

    def current_line(self):
        # OK, this is gruesome, but we return the current line if we have the
        # whole thing and otherwise we ... try loading it from disk.
        if '\n' in self.src[:self.current_token_start]:
            line_start = self.src.rindex('\n', 0, self.current_token_start) + 1
        elif self.start_column == 0:
            line_start = 0
        else:
            line_start = -1

        if line_start != -1:
            line_end = self.src.find('\n', line_start)
            if line_end == -1:
                if self.closed:
                    return self.src[line_start:] + '\n'
            else:
                return self.src[line_start:line_end] + '\n'

        # Fallback case. Python's linecache.getline() deliberately silences all
        # errors.
        lineno = self.current_token_position()[0]
        return linecache.getline(self.filename, lineno)

    def throw(self, msg_or_exception):
        lineno, column = self.current_token_position()
        if isinstance(msg_or_exception, Exception):
            e = msg_or_exception
            e.filename = self.filename
            e.lineno = lineno
            e.offset = column + 1
        else:
            # Apparently this is the secret handshake to create a Python
            # SyntaxError and get a good error message when Python prints it.
            line = self.current_line()
            args = (self.filename, lineno, column + 1, line)
            e = SyntaxError(msg_or_exception, args)
        raise e
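
    # current_token_position() returns a 0-based column, while SyntaxError's
    # `offset` field is 1-based, hence the `column + 1` above. The
    # (filename, lineno, offset, text) tuple is the standard detail format
    # that lets Python's traceback printer show the offending source line.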

    def throw_unexpected_end(self):
        self.throw(UnexpectedEndError("unexpected end of input"))


class Tokenizer(FlatStringLexer):
    def __init__(self, lexical_grammar, parser, filename=None):
        super().__init__(parser, filename)
        self.ignore_re = lexical_grammar.ignore_re
        self.token_re = lexical_grammar.token_re
        self.parser_pairs = lexical_grammar.parser_pairs
        self.src = ''
        self.filename = filename
        self.last_point = 0
        self.point = 0
        self._current_match = None

    def take(self):
        return self._current_match.group()

    def saw_line_terminator(self):
        """True if there's a LineTerminator before the current token."""
        i = self.previous_token_end
        j = self.current_token_start
        ws_between = self.src[i:j]
        return any(c in ws_between for c in '\r\n\u2028\u2029')

    def _match(self, closing):
        # Advance over text matching ignore_re.
        ignore_match = self.ignore_re.match(self.src, self.point)
        if ignore_match is None:
            raise ValueError("ignore_re should always match")
        point = ignore_match.end()
        if point == len(self.src):
            if closing:
                self.point = point
            self._current_match = None
            return None

        # Try the token_re.
        token_match = self.token_re.match(self.src, point)

        # Try all the parser_pairs.
        for name, pattern in self.parser_pairs:
            match = pattern.match(self.src, point)
            if match is not None:
                break
        else:
            name = match = None

        if match is not None and token_match is not None and match.end() > token_match.end():
            pass
        elif token_match is not None:
            name, match = token_match.group(0), token_match
        elif match is not None:
            pass
        else:
            self.throw("unexpected characters {!r}"
                       .format(self.src[point:point + 12]))

        # But how do we know subsequent .write() calls won't provide more text,
        # extending this token? Here we take advantage of the odd requirement
        # LexicalGrammar imposes on its users. Every prefix of a match is a
        # match. So if this hypothetical "extended" token would match, then the
        # entire remainder of self.src is a match.
        if not closing and match.end() == len(self.src):
            # This token might be extensible. Refuse to match.
            self._current_match = None
            return None

        # This token definitely is not extensible.
        self.previous_token_end = self.point
        self.current_token_start = match.start()
        self.point = match.end()
        self._current_match = match
        return name
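

if __name__ == '__main__':
    # Minimal usage sketch. The grammar and _EchoParser below are illustrative
    # only, not part of the project's real parsers; the parser protocol
    # (write_terminal / close) is inferred from the calls FlatStringLexer
    # makes above.
    class _EchoParser:
        """Collects (terminal_id, text) pairs instead of parsing them."""

        def __init__(self):
            self.terminals = []

        def write_terminal(self, lexer, terminal_id):
            self.terminals.append((terminal_id, lexer.take()))

        def close(self, lexer):
            return self.terminals

    arith = LexicalGrammar("+ - * / ( )", NUM=r'[0-9]+')
    lexer = arith(_EchoParser())
    lexer.write("12 + 3")        # "3" is held back: a later chunk might extend it
    lexer.write("4 * (5 - 6)")   # ...and indeed it becomes part of NUM "34"
    print(lexer.close())
    # Prints (wrapped here for readability):
    #   [('NUM', '12'), ('+', '+'), ('NUM', '34'), ('*', '*'),
    #    ('(', '('), ('NUM', '5'), ('-', '-'), ('NUM', '6'), (')', ')')]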