GardenSnake.py - mozsearch

firefox-main/third_party/python/ply/example/GardenSnake/GardenSnake.py

Enable keyboard shortcuts

Source code

File a bug in Firefox Build System :: General

Revision control

Copy as Markdown

Other Tools

# GardenSnake - a parser generator demonstration program

# This implements a modified version of a subset of Python:

#  - only 'def', 'return' and 'if' statements

#  - 'if' only has 'then' clause (no elif nor else)

#  - single-quoted strings only, content in raw format

#  - numbers are decimal.Decimal instances (not integers or floats)

#  - no print statment; use the built-in 'print' function

#  - only < > == + - / * implemented (and unary + -)

#  - assignment and tuple assignment work

#  - no generators of any sort

#  - no ... well, no quite a lot

# Why?  I'm thinking about a new indentation-based configuration

# language for a project and wanted to figure out how to do it.  Once

# I got that working I needed a way to test it out.  My original AST

# was dumb so I decided to target Python's AST and compile it into

# Python code.  Plus, it's pretty cool that it only took a day or so

# from sitting down with Ply to having working code.

# This uses David Beazley's Ply from http://www.dabeaz.com/ply/

# This work is hereby released into the Public Domain. To view a copy of

# the public domain dedication, visit

# http://creativecommons.org/licenses/publicdomain/ or send a letter to

# Creative Commons, 543 Howard Street, 5th Floor, San Francisco,

# California, 94105, USA.

# Portions of this work are derived from Python's Grammar definition

# and may be covered under the Python copyright and license

#          Andrew Dalke / Dalke Scientific Software, LLC

#             30 August 2006 / Cape Town, South Africa

# Changelog:

#  30 August - added link to CC license; removed the "swapcase" encoding

# Modifications for inclusion in PLY distribution

import sys

sys.path.insert(0, "../..")

from ply import *

##### Lexer ######

#import lex

import decimal

tokens = (

    'DEF',

    'IF',

    'NAME',

    'NUMBER',  # Python decimals

    'STRING',  # single quoted strings only; syntax of raw strings

    'LPAR',

    'RPAR',

    'COLON',

    'EQ',

    'ASSIGN',

    'LT',

    'GT',

    'PLUS',

    'MINUS',

    'MULT',

    'DIV',

    'RETURN',

    'WS',

    'NEWLINE',

    'COMMA',

    'SEMICOLON',

    'INDENT',

    'DEDENT',

    'ENDMARKER',

#t_NUMBER = r'\d+'

# taken from decmial.py but without the leading sign

def t_NUMBER(t):

    r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""

    t.value = decimal.Decimal(t.value)

    return t

def t_STRING(t):

    r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...

    t.value = t.value[1:-1].decode("string-escape")  # .swapcase() # for fun

    return t

t_COLON = r':'

t_EQ = r'=='

t_ASSIGN = r'='

t_LT = r'<'

t_GT = r'>'

t_PLUS = r'\+'

t_MINUS = r'-'

t_MULT = r'\*'

t_DIV = r'/'

t_COMMA = r','

t_SEMICOLON = r';'

# Ply nicely documented how to do this.

RESERVED = {

    "def": "DEF",

    "if": "IF",

    "return": "RETURN",

def t_NAME(t):

    r'[a-zA-Z_][a-zA-Z0-9_]*'

    t.type = RESERVED.get(t.value, "NAME")

    return t

# Putting this before t_WS let it consume lines with only comments in

# them so the latter code never sees the WS part.  Not consuming the

# newline.  Needed for "if 1: #comment"

def t_comment(t):

    r"[ ]*\043[^\n]*"  # \043 is '#'

    pass

# Whitespace

def t_WS(t):

    r' [ ]+ '

    if t.lexer.at_line_start and t.lexer.paren_count == 0:

        return t

# Don't generate newline tokens when inside of parenthesis, eg

#   a = (1,

#        2, 3)

def t_newline(t):

    r'\n+'

    t.lexer.lineno += len(t.value)

    t.type = "NEWLINE"

    if t.lexer.paren_count == 0:

        return t

def t_LPAR(t):

    r'\('

    t.lexer.paren_count += 1

    return t

def t_RPAR(t):

    r'\)'

    # check for underflow?  should be the job of the parser

    t.lexer.paren_count -= 1

    return t

def t_error(t):

    raise SyntaxError("Unknown symbol %r" % (t.value[0],))

    print "Skipping", repr(t.value[0])

    t.lexer.skip(1)

# I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.

# WS will only occur before any other tokens on a line.

# I have three filters.  One tags tokens by adding two attributes.

# "must_indent" is True if the token must be indented from the

# previous code.  The other is "at_line_start" which is True for WS

# and the first non-WS/non-NEWLINE on a line.  It flags the check so

# see if the new line has changed indication level.

# Python's syntax has three INDENT states

#  0) no colon hence no need to indent

#  1) "if 1: go()" - simple statements have a COLON but no need for an indent

#  2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent

NO_INDENT = 0

MAY_INDENT = 1

MUST_INDENT = 2

# only care about whitespace at the start of a line

def track_tokens_filter(lexer, tokens):

    lexer.at_line_start = at_line_start = True

    indent = NO_INDENT

    saw_colon = False

    for token in tokens:

        token.at_line_start = at_line_start

        if token.type == "COLON":

            at_line_start = False

            indent = MAY_INDENT

            token.must_indent = False

        elif token.type == "NEWLINE":

            at_line_start = True

            if indent == MAY_INDENT:

                indent = MUST_INDENT

            token.must_indent = False

        elif token.type == "WS":

            assert token.at_line_start == True

            at_line_start = True

            token.must_indent = False

        else:

            # A real token; only indent after COLON NEWLINE

            if indent == MUST_INDENT:

                token.must_indent = True

            else:

                token.must_indent = False

            at_line_start = False

            indent = NO_INDENT

        yield token

        lexer.at_line_start = at_line_start

def _new_token(type, lineno):

    tok = lex.LexToken()

    tok.type = type

    tok.value = None

    tok.lineno = lineno

    return tok

# Synthesize a DEDENT tag

def DEDENT(lineno):

    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag

def INDENT(lineno):

    return _new_token("INDENT", lineno)

# Track the indentation level and emit the right INDENT / DEDENT events.

def indentation_filter(tokens):

    # A stack of indentation levels; will never pop item 0

    levels = [0]

    token = None

    depth = 0

    prev_was_ws = False

    for token in tokens:

        # if 1:

        # print "Process", token,

        # if token.at_line_start:

        # print "at_line_start",

        # if token.must_indent:

        # print "must_indent",

        # print

        # WS only occurs at the start of the line

        # There may be WS followed by NEWLINE so

        # only track the depth here.  Don't indent/dedent

        # until there's something real.

        if token.type == "WS":

            assert depth == 0

            depth = len(token.value)

            prev_was_ws = True

            # WS tokens are never passed to the parser

            continue

        if token.type == "NEWLINE":

            depth = 0

            if prev_was_ws or token.at_line_start:

                # ignore blank lines

                continue

            # pass the other cases on through

            yield token

            continue

        # then it must be a real token (not WS, not NEWLINE)

        # which can affect the indentation level

        prev_was_ws = False

        if token.must_indent:

            # The current depth must be larger than the previous level

            if not (depth > levels[-1]):

                raise IndentationError("expected an indented block")

            levels.append(depth)

            yield INDENT(token.lineno)

        elif token.at_line_start:

            # Must be on the same level or one of the previous levels

            if depth == levels[-1]:

                # At the same level

                pass

            elif depth > levels[-1]:

                raise IndentationError(

                    "indentation increase but not in new block")

            else:

                # Back up; but only if it matches a previous level

                try:

                    i = levels.index(depth)

                except ValueError:

                    raise IndentationError("inconsistent indentation")

                for _ in range(i + 1, len(levels)):

                    yield DEDENT(token.lineno)

                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels

    if len(levels) > 1:

        assert token is not None

        for _ in range(1, len(levels)):

            yield DEDENT(token.lineno)

# The top-level filter adds an ENDMARKER, if requested.

# Python's grammar uses it.

def filter(lexer, add_endmarker=True):

    token = None

    tokens = iter(lexer.token, None)

    tokens = track_tokens_filter(lexer, tokens)

    for token in indentation_filter(tokens):

        yield token

    if add_endmarker:

        lineno = 1

        if token is not None:

            lineno = token.lineno

        yield _new_token("ENDMARKER", lineno)

# Combine Ply and my filters into a new lexer

class IndentLexer(object):

    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):

        self.lexer = lex.lex(debug=debug, optimize=optimize,

                             lextab=lextab, reflags=reflags)

        self.token_stream = None

    def input(self, s, add_endmarker=True):

        self.lexer.paren_count = 0

        self.lexer.input(s)

        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):

        try:

            return self.token_stream.next()

        except StopIteration:

            return None

##########   Parser (tokens -> AST) ######

# also part of Ply

#import yacc

# I use the Python AST

from compiler import ast

# Helper function

def Assign(left, right):

    names = []

    if isinstance(left, ast.Name):

        # Single assignment on left

        return ast.Assign([ast.AssName(left.name, 'OP_ASSIGN')], right)

    elif isinstance(left, ast.Tuple):

        # List of things - make sure they are Name nodes

        names = []

        for child in left.getChildren():

            if not isinstance(child, ast.Name):

                raise SyntaxError("that assignment not supported")

            names.append(child.name)

        ass_list = [ast.AssName(name, 'OP_ASSIGN') for name in names]

        return ast.Assign([ast.AssTuple(ass_list)], right)

    else:

        raise SyntaxError("Can't do that yet")

# The grammar comments come from Python's Grammar/Grammar file

# NB: compound_stmt in single_input is followed by extra NEWLINE!

# file_input: (NEWLINE | stmt)* ENDMARKER

def p_file_input_end(p):

    """file_input_end : file_input ENDMARKER"""

    p[0] = ast.Stmt(p[1])

def p_file_input(p):

    """file_input : file_input NEWLINE

                  | file_input stmt

                  | NEWLINE

                  | stmt"""

    if isinstance(p[len(p) - 1], basestring):

        if len(p) == 3:

            p[0] = p[1]

        else:

            p[0] = []  # p == 2 --> only a blank line

    else:

        if len(p) == 3:

            p[0] = p[1] + p[2]

        else:

            p[0] = p[1]

# funcdef: [decorators] 'def' NAME parameters ':' suite

# ignoring decorators

def p_funcdef(p):

    "funcdef : DEF NAME parameters COLON suite"

    p[0] = ast.Function(None, p[2], tuple(p[3]), (), 0, None, p[5])

# parameters: '(' [varargslist] ')'

def p_parameters(p):

    """parameters : LPAR RPAR

                  | LPAR varargslist RPAR"""

    if len(p) == 3:

        p[0] = []

    else:

        p[0] = p[2]

# varargslist: (fpdef ['=' test] ',')* ('*' NAME [',' '**' NAME] | '**' NAME) |

# highly simplified

def p_varargslist(p):

    """varargslist : varargslist COMMA NAME

                   | NAME"""

    if len(p) == 4:

        p[0] = p[1] + p[3]

    else:

        p[0] = [p[1]]

# stmt: simple_stmt | compound_stmt

def p_stmt_simple(p):

    """stmt : simple_stmt"""

    # simple_stmt is a list

    p[0] = p[1]

def p_stmt_compound(p):

    """stmt : compound_stmt"""

    p[0] = [p[1]]

# simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE

def p_simple_stmt(p):

    """simple_stmt : small_stmts NEWLINE

                   | small_stmts SEMICOLON NEWLINE"""

    p[0] = p[1]

def p_small_stmts(p):

    """small_stmts : small_stmts SEMICOLON small_stmt

                   | small_stmt"""

    if len(p) == 4:

        p[0] = p[1] + [p[3]]

    else:

        p[0] = [p[1]]

# small_stmt: expr_stmt | print_stmt  | del_stmt | pass_stmt | flow_stmt |

#    import_stmt | global_stmt | exec_stmt | assert_stmt

def p_small_stmt(p):

    """small_stmt : flow_stmt

                  | expr_stmt"""

    p[0] = p[1]

# expr_stmt: testlist (augassign (yield_expr|testlist) |

#                      ('=' (yield_expr|testlist))*)

# augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' |

#             '<<=' | '>>=' | '**=' | '//=')

def p_expr_stmt(p):

    """expr_stmt : testlist ASSIGN testlist

                 | testlist """

    if len(p) == 2:

        # a list of expressions

        p[0] = ast.Discard(p[1])

    else:

        p[0] = Assign(p[1], p[3])

def p_flow_stmt(p):

    "flow_stmt : return_stmt"

    p[0] = p[1]

# return_stmt: 'return' [testlist]

def p_return_stmt(p):

    "return_stmt : RETURN testlist"

    p[0] = ast.Return(p[2])

def p_compound_stmt(p):

    """compound_stmt : if_stmt

                     | funcdef"""

    p[0] = p[1]

def p_if_stmt(p):

    'if_stmt : IF test COLON suite'

    p[0] = ast.If([(p[2], p[4])], None)

def p_suite(p):

    """suite : simple_stmt

             | NEWLINE INDENT stmts DEDENT"""

    if len(p) == 2:

        p[0] = ast.Stmt(p[1])

    else:

        p[0] = ast.Stmt(p[3])

def p_stmts(p):

    """stmts : stmts stmt

             | stmt"""

    if len(p) == 3:

        p[0] = p[1] + p[2]

    else:

        p[0] = p[1]

# No using Python's approach because Ply supports precedence

# comparison: expr (comp_op expr)*

# arith_expr: term (('+'|'-') term)*

# term: factor (('*'|'/'|'%'|'//') factor)*

# factor: ('+'|'-'|'~') factor | power

# comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'

def make_lt_compare((left, right)):

    return ast.Compare(left, [('<', right), ])

def make_gt_compare((left, right)):

    return ast.Compare(left, [('>', right), ])

def make_eq_compare((left, right)):

    return ast.Compare(left, [('==', right), ])

binary_ops = {

    "+": ast.Add,

    "-": ast.Sub,

    "*": ast.Mul,

    "/": ast.Div,

    "<": make_lt_compare,

    ">": make_gt_compare,

    "==": make_eq_compare,

unary_ops = {

    "+": ast.UnaryAdd,

    "-": ast.UnarySub,

precedence = (

    ("left", "EQ", "GT", "LT"),

    ("left", "PLUS", "MINUS"),

    ("left", "MULT", "DIV"),

def p_comparison(p):

    """comparison : comparison PLUS comparison

                  | comparison MINUS comparison

                  | comparison MULT comparison

                  | comparison DIV comparison

                  | comparison LT comparison

                  | comparison EQ comparison

                  | comparison GT comparison

                  | PLUS comparison

                  | MINUS comparison

                  | power"""

    if len(p) == 4:

        p[0] = binary_ops[p[2]]((p[1], p[3]))

    elif len(p) == 3:

        p[0] = unary_ops[p[1]](p[2])

    else:

        p[0] = p[1]

# power: atom trailer* ['**' factor]

# trailers enables function calls.  I only allow one level of calls

# so this is 'trailer'

def p_power(p):

    """power : atom

             | atom trailer"""

    if len(p) == 2:

        p[0] = p[1]

    else:

        if p[2][0] == "CALL":

            p[0] = ast.CallFunc(p[1], p[2][1], None, None)

        else:

            raise AssertionError("not implemented")

def p_atom_name(p):

    """atom : NAME"""

    p[0] = ast.Name(p[1])

def p_atom_number(p):

    """atom : NUMBER

            | STRING"""

    p[0] = ast.Const(p[1])

def p_atom_tuple(p):

    """atom : LPAR testlist RPAR"""

    p[0] = p[2]

# trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME

def p_trailer(p):

    "trailer : LPAR arglist RPAR"

    p[0] = ("CALL", p[2])

# testlist: test (',' test)* [',']

# Contains shift/reduce error

def p_testlist(p):

    """testlist : testlist_multi COMMA

                | testlist_multi """

    if len(p) == 2:

        p[0] = p[1]

    else:

        # May need to promote singleton to tuple

        if isinstance(p[1], list):

            p[0] = p[1]

        else:

            p[0] = [p[1]]

    # Convert into a tuple?

    if isinstance(p[0], list):

        p[0] = ast.Tuple(p[0])

def p_testlist_multi(p):

    """testlist_multi : testlist_multi COMMA test

                      | test"""

    if len(p) == 2:

        # singleton

        p[0] = p[1]

    else:

        if isinstance(p[1], list):

            p[0] = p[1] + [p[3]]

        else:

            # singleton -> tuple

            p[0] = [p[1], p[3]]

# test: or_test ['if' or_test 'else' test] | lambdef

#  as I don't support 'and', 'or', and 'not' this works down to 'comparison'

def p_test(p):

    "test : comparison"

    p[0] = p[1]

# arglist: (argument ',')* (argument [',']| '*' test [',' '**' test] | '**' test)

# XXX INCOMPLETE: this doesn't allow the trailing comma

def p_arglist(p):

    """arglist : arglist COMMA argument

               | argument"""

    if len(p) == 4:

        p[0] = p[1] + [p[3]]

    else:

        p[0] = [p[1]]

# argument: test [gen_for] | test '=' test  # Really [keyword '='] test

def p_argument(p):

    "argument : test"

    p[0] = p[1]

def p_error(p):

    # print "Error!", repr(p)

    raise SyntaxError(p)

class GardenSnakeParser(object):

    def __init__(self, lexer=None):

        if lexer is None:

            lexer = IndentLexer()

        self.lexer = lexer

        self.parser = yacc.yacc(start="file_input_end")

    def parse(self, code):

        self.lexer.input(code)

        result = self.parser.parse(lexer=self.lexer)

        return ast.Module(None, result)

###### Code generation ######

from compiler import misc, syntax, pycodegen

class GardenSnakeCompiler(object):

    def __init__(self):

        self.parser = GardenSnakeParser()

    def compile(self, code, filename="<string>"):

        tree = self.parser.parse(code)

        # print  tree

        misc.set_filename(filename, tree)

        syntax.check(tree)

        gen = pycodegen.ModuleCodeGenerator(tree)

        code = gen.getCode()

        return code

####### Test code #######

compile = GardenSnakeCompiler().compile

code = r"""

print('LET\'S TRY THIS \\OUT')

#Comment here

def x(a):

    print('called with',a)

    if a == 1:

        return 2

    if a*2 > 10: return 999 / 4

        # Another comment here

    return a+2*3

ints = (1, 2,

   3, 4,

5)

print('mutiline-expression', ints)

t = 4+1/3*2+6*(9-5+1)

print('predence test; should be 34+2/3:', t, t==(34+2/3))

print('numbers', 1,2,3,4,5)

if 1:

a=9

 print(x(a))

print(x(1))

print(x(2))

print(x(8),'3')

print('this is decimal', 1/5)

print('BIG DECIMAL', 1.234567891234567e12345)

"""

# Set up the GardenSnake run-time environment

def print_(*args):

    print "-->", " ".join(map(str, args))

globals()["print"] = print_

compiled_code = compile(code)

exec compiled_code in globals()

print "Done"