#!/usr/bin/env python
"""parse_pgen.py - Parse grammars written in the pgen parser specification language.
I'm not sure I want to keep this pgen mini-language around; ignore this for now.
"""
import sys
from collections import namedtuple

from .lexer import LexicalGrammar
from .grammar import (Grammar, Production, CallMethod, is_concrete_element,
                      Optional, Some)
from . import gen
from . import parse_pgen_generated


# Lexical grammar for pgen source: fixed keyword/punctuation tokens, an
# ignore pattern for whitespace and `#` comments, and variable-width tokens.
pgen_lexer = LexicalGrammar(
"goal nt var token { } ; ? = => ( ) ,",
r'([ \t\r\n]|#.*)*',
IDENT=r'[A-Za-z_](?:\w|[_-])*',
STR=r'"[^\\\n"]*"',
MATCH=r'\$(?:0|[1-9][0-9]*)',
COMMENT=r'//.*',
)
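
# For example (an illustrative sketch, not executed here), the pgen line
#     nt stmt { expr ";" => stmt_expr($0); }
# tokenizes as
#     nt IDENT { IDENT STR => IDENT ( MATCH ) ; }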


def list_of(e, allow_comments=False):
    """Return productions for the list nonterminal `e + 's'`: one `e`, or
    the list followed by another `e`; with allow_comments, a COMMENT may
    also start an (empty) list."""
    nt = e + 's'
prods = [
Production([e], CallMethod('single', (0,))),
Production([nt, e], CallMethod('append', (0, 1))),
]
if allow_comments:
prods.append(Production(['COMMENT'], CallMethod('empty', (0,))))
return prods
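
# For instance, list_of('term') yields the productions for a 'terms'
# nonterminal:
#     terms ::= term           => single($0)
#     terms ::= terms term     => append($0, $1)
# which AstBuilder below assembles into a plain Python list.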


def call_method(name, body):
    """Build a CallMethod reducer that passes each concrete element of
    `body` not in `discards` (defined below), referenced by position."""
    arg_indexes = []
    current = 0
for e in body:
if is_concrete_element(e):
if e not in discards:
arg_indexes.append(current)
current += 1
return CallMethod(name, tuple(arg_indexes))


def prod(body, reducer):
    """Shorthand for a Production; a string `reducer` names a builder method."""
    if isinstance(reducer, str):
        reducer = call_method(reducer, body)
    return Production(body, reducer)


# Symbols that appear in production bodies but carry no information of
# their own; call_method() does not pass them to builder methods.
discards = set('token var nt goal Some None = => ; ( ) { } , ?'.split())
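
# Worked example: for the body ['token', 'IDENT', '=', 'STR', ';'], the
# discarded 'token', '=' and ';' still advance the position counter, so
# call_method('const_token', body) yields CallMethod('const_token', (1, 3)):
# the positions of IDENT and STR.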


# The grammar of the pgen language itself, written with the same Grammar
# API that loaded grammars are converted into.
pgen_grammar = Grammar(
{
'grammar': [
[Optional('token_defs'), 'nt_defs']
],
'token_defs': list_of('token_def'),
'token_def': [
prod(['token', 'IDENT', '=', 'STR', ';'], 'const_token'),
prod(['var', 'token', 'IDENT', ';'], 'var_token'),
],
'nt_defs': [
prod(['nt_def'], 'nt_defs_single'),
prod(['nt_defs', 'nt_def'], 'nt_defs_append'),
],
'nt_def': [
prod([Optional('COMMENT'), Optional('goal'), 'nt', 'IDENT', '{',
Optional('prods'), '}'], 'nt_def'),
],
'prods': list_of('prod', allow_comments=True),
'prod': [
prod(['terms', Optional('reducer'), ';'], 'prod'),
],
'terms': list_of('term'),
'term': [
['symbol'],
prod(['symbol', '?'], 'optional'),
],
'symbol': [
prod(['IDENT'], 'ident'),
prod(['STR'], 'str'),
],
'reducer': [
prod(['=>', 'expr'], 1)
],
'expr': [
prod(['MATCH'], 'expr_match'),
prod(['IDENT', '(', Optional('expr_args'), ')'], 'expr_call'),
prod(['Some', '(', 'expr', ')'], 'expr_some'),
prod(['None'], 'expr_none'),
],
'expr_args': [
prod(['expr'], 'args_single'),
prod(['expr_args', ',', 'expr'], 'args_append'),
],
},
goal_nts=['grammar'],
variable_terminals=['IDENT', 'STR', 'MATCH', 'COMMENT']
)
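
# A small grammar in the notation defined above (a sketch; the token and
# method names are hypothetical):
#
#     var token IDENT;
#     token Semicolon = ";";
#
#     goal nt stmts {
#         stmt => single($0);
#         stmts stmt => append($0, $1);
#     }
#
#     nt stmt {
#         IDENT ";" => expr_stmt($0);
#     }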


Literal = namedtuple("Literal", "chars")

# Token set assumed when a grammar file declares no tokens of its own.
default_token_list = [
("Var", "var"),
("Token", "token"),
("Goal", "goal"),
("Nt", "nt"),
("IDENT", None),
("STR", None),
("OpenBrace", "{"),
("CloseBrace", "}"),
("OpenParenthesis", "("),
("CloseParenthesis", ")"),
("Colon", ":"),
("EqualSign", "="),
("Asterisk", "*"),
("PlusSign", "+"),
("MinusSign", "-"),
("Slash", "/"),
("Semicolon", ";"),
("QuestionMark", "?"),
("RightArrow", "->"),
("Comma", ","),
]


class AstBuilder:
    """Builder the generated parser drives: each method matches a reducer
    name in pgen_grammar and assembles the corresponding AST fragment."""

    def grammar(self, token_defs, nt_defs):
        nonterminals, goal_nts = nt_defs
        return (token_defs or default_token_list, nonterminals, goal_nts)

    def empty(self, value):
        return []

    def single(self, value):
        return [value]

    def append(self, values, value):
        values.append(value)
        return values

    def const_token(self, name, picture):
        assert picture[0] == '"'
        assert picture[-1] == '"'
        return (name, picture[1:-1])

    def var_token(self, name):
        return (name, None)

    def comment(self, comment):
        pass

    def nt_defs_single(self, nt_def):
        return self.nt_defs_append(({}, []), nt_def)

    def nt_defs_append(self, grammar_in, nt_def):
        is_goal, nt, prods = nt_def
        grammar, goal_nts = grammar_in
        if nt in grammar:
            raise ValueError("multiple definitions for nt {}".format(nt))
        grammar[nt] = prods
        if is_goal:
            goal_nts.append(nt)
        return grammar, goal_nts

    def nt_def(self, _comment, goal_kw, ident, prods):
        is_goal = goal_kw == "goal"
        # `prods` is None when the braces are empty, since the grammar
        # marks it Optional('prods').
        prods = [Production(body, reducer) for body, reducer in prods or []]
        return (is_goal, ident, prods)

    def prod(self, symbols, reducer):
        if reducer is None:
            if sum(1 for e in symbols if is_concrete_element(e)) == 1:
                reducer = 0
            else:
                raise ValueError("reducer required for {!r}".format(symbols))
        return (symbols, reducer)

    def optional(self, sym):
        return Optional(sym)

    def ident(self, sym):
        return sym

    def str(self, sym):
        assert len(sym) > 1
        assert sym[0] == '"'
        assert sym[-1] == '"'
        chars = sym[1:-1]  # This is a bit sloppy.
        return Literal(chars)

    def expr_match(self, match):
        assert match.startswith('$')
        return int(match[1:])

    def expr_call(self, ident, args):
        return CallMethod(ident, tuple(args or ()))

    # pgen_grammar names the reducers expr_some and expr_none, so methods
    # for them must exist; these minimal definitions assume grammar.Some
    # is the wrapper used for optional reduce-expression results.
    def expr_some(self, expr):
        return Some(expr)

    def expr_none(self):
        return None

    def args_single(self, expr):
        return [expr]

    def args_append(self, args, arg):
        args.append(arg)
        return args
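
# During parsing, each reduction invokes the builder method named by its
# production's reducer; e.g. reducing `token Semicolon = ";";` calls
# const_token('Semicolon', '";"') and returns ('Semicolon', ';').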


def check_grammar(result):
    """Check the parsed grammar for consistency and normalize production
    bodies: Literal elements and const-token names become token images."""
    tokens, nonterminals, goal_nts = result
tokens_by_name = {}
tokens_by_image = {}
for name, image in tokens:
if name in tokens_by_name:
raise ValueError("token `{}` redeclared".format(name))
tokens_by_name[name] = image
if image is not None and image in tokens_by_image:
raise ValueError("multiple tokens look like \"{}\"".format(image))
tokens_by_image[image] = name
if name in nonterminals:
raise ValueError("`{}` is declared as both a token and a nonterminal (pick one)".format(name))

    def check_element(nt, i, e):
if isinstance(e, Optional):
return Optional(check_element(nt, i, e.inner))
elif isinstance(e, Literal):
if e.chars not in tokens_by_image:
raise ValueError("in {} production {}: undeclared token \"{}\"".format(nt, i, e.chars))
return e.chars
else:
assert isinstance(e, str), e.__class__.__name__
if e in nonterminals:
return e
elif e in tokens_by_name:
image = tokens_by_name[e]
if image is not None:
return image
return e
else:
raise ValueError("in {} production {}: undeclared symbol {}".format(nt, i, e))

    out = {nt: [] for nt in nonterminals}
for nt, rhs_list in nonterminals.items():
for i, p in enumerate(rhs_list):
out_rhs = [check_element(nt, i, e) for e in p.body]
out[nt].append(p.copy_with(body=out_rhs))
return (tokens, out, goal_nts)
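
# E.g., with `token Semicolon = ";"` declared, both the literal ";" and the
# name Semicolon in a production body normalize to the image string ';',
# while variable tokens like IDENT (image None) keep their names.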


def load_grammar(filename):
    """Parse a pgen grammar file and return the corresponding Grammar."""
with open(filename) as f:
text = f.read()
parser = parse_pgen_generated.Parser(builder=AstBuilder())
lexer = pgen_lexer(parser, filename=filename)
lexer.write(text)
result = lexer.close()
tokens, nonterminals, goals = check_grammar(result)
variable_terminals = [name for name, image in tokens if image is None]
return Grammar(nonterminals,
goal_nts=goals,
variable_terminals=variable_terminals)
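
# Sketch of intended use (the .pgen file name here is hypothetical):
#
#     from jsparagus.parse_pgen import load_grammar
#     grammar = load_grammar('mylang.pgen')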


def regenerate():
    gen.generate_parser(sys.stdout, pgen_grammar)


if __name__ == '__main__':
if sys.argv[1:] == ['--regenerate']:
regenerate()
else:
print("usage: python -m jsparagus.parse_pgen --regenerate")
sys.exit(1)