<?php
// Sample input for the lexer: an assignment, a function call whose second
// argument is an arithmetic expression, and an echo of an interpolated string.
$val = 5;
$result = substr( "foobar", 2*(7-$val) );
echo " : $result";
# coding=utf8
import re

import ply.lex as lex

# Names of every token the lexer can produce; PLY looks this tuple up by
# the name `tokens`.
tokens = (
    'PHPSTART', 'PHPVAR', 'PHPEQUAL', 'PHPFUNC', 'PHPSTRING', 'PHPECHO',
    'PHPCOLON', 'PHPCOMA', 'PHPOPEN', 'PHPCLOSE', 'PHPNUM',
    'PLUSMINUS', 'DIVMUL'
)

# Shared regex fragment for an identifier: a lowercase letter followed by
# any number of word characters.
ident = r'[a-z]\w*'

# Each token is described by a rule named t_<TOKENNAME>.
t_PHPSTART = r'\<\?php'
t_PHPVAR = r'\$' + ident  # a variable is '$' followed by an identifier
t_PHPEQUAL = r'\='
t_PHPFUNC = ident
t_PHPSTRING = r'"(\\.|[^"])*"'
# NOTE(review): `ident` also matches the word "echo", so this rule competes
# with t_PHPFUNC - confirm which one PLY picks.
t_PHPECHO = r'echo'
t_PHPCOLON = r';'
t_PHPCOMA = r','
t_PHPOPEN = r'\('
t_PHPCLOSE = r'\)'
t_PHPNUM = r'\d+'
t_PLUSMINUS = r'\+|\-'
t_DIVMUL = r'/|\*'

# Insignificant characters skipped between tokens, so that `$var=$value`
# and `$var   =   $value` produce the same token stream.
t_ignore = ' \r\n\t\f'


# Called for input that matches no rule: report the character, skip it.
def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)


lexer = lex.lex(reflags=re.UNICODE | re.DOTALL)

data = '''
<?php
$result = substr("foobar", "bar");
echo $result;
'''

lexer.input(data)

while True:
    tok = lexer.token()  # fetch the next token
    if not tok:
        break  # input exhausted
    print(tok)
def t_PHPVAR(t):
    r'\$[a-zA-Z]\w*'
    # The docstring above is the token's regular expression, so it must stay
    # the first statement of the function.
    # t.value holds the matched text; print it with the ', ' prefix.
    print(', ' + t.value)
    # Returning the token keeps it in the output stream.
    return t
LexToken(PHPSTART,'<?php',1,1)
LexToken(PHPVAR,'$val',1,7)
LexToken(PHPEQUAL,'=',1,12)
LexToken(PHPNUM,'5',1,14)
LexToken(PHPCOLON,';',1,15)
LexToken(PHPVAR,'$result',1,17)
LexToken(PHPEQUAL,'=',1,25)
LexToken(PHPFUNC,'substr',1,27)
LexToken(PHPOPEN,'(',1,33)
LexToken(PHPSTRING,'"foobar"',1,35)
LexToken(PHPCOMA,',',1,43)
LexToken(PHPNUM,'2',1,45)
LexToken(DIVMUL,'*',1,46)
LexToken(PHPOPEN,'(',1,47)
LexToken(PHPNUM,'7',1,48)
LexToken(PLUSMINUS,'-',1,49)
LexToken(PHPVAR,'$val',1,50)
LexToken(PHPCLOSE,')',1,54)
LexToken(PHPCLOSE,')',1,56)
LexToken(PHPCOLON,';',1,57)
LexToken(PHPFUNC,'echo',1,59)
LexToken(PHPSTRING,'"\xd1\x8d\xd1\x82\xd0\xbe \xd0\xbd\xd0\xb0\xd1\x88 \xd1\x80\xd0\xb5\xd0\xb7\xd1\x83\xd0\xbb\xd1\x8c\xd1\x82\xd0\xb0\xd1\x82: $result"',1,64)
LexToken(PHPCOLON,';',1,107)
# Newline is no longer in the ignore set; the rule below consumes it.
t_ignore = ' \r\t\f'


def t_newline(t):
    r'\n+'
    # Advance the line counter by the number of newlines matched. Nothing is
    # returned, so no token is produced.
    newline_count = len(t.value)
    t.lexer.lineno += newline_count
LexToken(PHPSTART,'<?php',2,1)
LexToken(PHPVAR,'$val',3,7)
LexToken(PHPEQUAL,'=',3,12)
LexToken(PHPNUM,'5',3,14)
LexToken(PHPCOLON,';',3,15)
LexToken(PHPVAR,'$result',4,17)
LexToken(PHPEQUAL,'=',4,25)
LexToken(PHPFUNC,'substr',4,27)
LexToken(PHPOPEN,'(',4,33)
LexToken(PHPSTRING,'"foobar"',4,35)
LexToken(PHPCOMA,',',4,43)
LexToken(PHPNUM,'2',4,45)
LexToken(DIVMUL,'*',4,46)
LexToken(PHPOPEN,'(',4,47)
LexToken(PHPNUM,'7',4,48)
LexToken(PLUSMINUS,'-',4,49)
LexToken(PHPVAR,'$val',4,50)
LexToken(PHPCLOSE,')',4,54)
LexToken(PHPCLOSE,')',4,56)
LexToken(PHPCOLON,';',4,57)
LexToken(PHPFUNC,'echo',5,59)
LexToken(PHPSTRING,'"\xd1\x8d\xd1\x82\xd0\xbe \xd0\xbd\xd0\xb0\xd1\x88 \xd1\x80\xd0\xb5\xd0\xb7\xd1\x83\xd0\xbb\xd1\x8c\xd1\x82\xd0\xb0\xd1\x82: $result"',5,64)
LexToken(PHPCOLON,';',5,107)
# Extra lexer states as (name, kind) pairs: one exclusive state, 'string'.
states = (('string', 'exclusive'),)
# Each token is described by a rule named t_<TOKENNAME>.
t_PHPSTART = r'\<\?php'
t_ANY_PHPVAR = r'\$' + ident  # ANY prefix: the rule is active in every state
t_PHPEQUAL = r'\='
t_PHPFUNC = ident
t_PHPECHO = r'echo'
t_PHPCOLON = r';'
t_PHPCOMA = r','
t_PHPOPEN = r'\('
t_PHPCLOSE = r'\)'
t_PHPNUM = r'\d+'
t_PLUSMINUS = r'\+|\-'
t_DIVMUL = r'/|\*'


# PHPSTRING is now just the quote character, recognised in every state.
def t_ANY_PHPSTRING(t):
    r'"'
    # A quote seen inside the 'string' state closes the string; a quote seen
    # anywhere else opens one.
    if t.lexer.current_state() == 'string':
        t.lexer.begin('INITIAL')  # back to the default state
    else:
        t.lexer.begin('string')  # start scanning string contents
    return t


# Raw string contents: escaped characters or anything except '$' and '"'.
t_string_STR = r'(\\.|[^$"])+'

# Nothing is skipped inside a string.
t_string_ignore = ''


# The 'string' state needs its own error handler.
def t_string_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)
@TOKEN(ident)
def t_PHPFUNC(t):
    # The pattern comes from the TOKEN decorator, so no regex docstring here.
    # An identifier spelled "echo" (in any letter case) is retagged PHPECHO.
    is_echo = t.value.lower() == 'echo'
    if is_echo:
        t.type = 'PHPECHO'
    return t
def t_comment(t):
    r'(/\*(.|\n)*?\*/)|(//[^\n]*)'
    # Matches /* ... */ block comments and // line comments. The line-comment
    # branch uses [^\n]* rather than .* so that it still stops at the end of
    # the line when the lexer is compiled with re.DOTALL.
    # Keep the line counter correct across multi-line block comments.
    t.lexer.lineno += t.value.count('\n')
    # No return value: the comment is discarded.
# coding=utf8
import re

import ply.lex as lex
from ply.lex import TOKEN

# Extra lexer state used while scanning the inside of a double-quoted string.
states = (
    ('string', 'exclusive'),
)

# Names of every token the lexer can produce; PLY looks this tuple up by
# the name `tokens`.
tokens = (
    'PHPSTART', 'PHPVAR', 'PHPEQUAL', 'PHPFUNC', 'PHPSTRING', 'PHPECHO',
    'PHPCOLON', 'PHPCOMA', 'PHPOPEN', 'PHPCLOSE', 'PHPNUM',
    'PLUSMINUS', 'DIVMUL', 'STR'
)

# Shared regex fragment for an identifier: a letter followed by word
# characters (the lexer is built with re.IGNORECASE below).
ident = r'[a-z]\w*'

# Each token is described by a rule named t_<TOKENNAME>.
t_PHPSTART = r'\<\?php'
t_ANY_PHPVAR = r'\$' + ident  # ANY prefix: the rule is active in every state
t_PHPEQUAL = r'\='
t_PHPCOLON = r';'
t_PHPCOMA = r','
t_PHPOPEN = r'\('
t_PHPCLOSE = r'\)'
t_PHPNUM = r'\d+'
t_PLUSMINUS = r'\+|\-'
t_DIVMUL = r'/|\*'


@TOKEN(ident)
def t_PHPFUNC(t):
    # An identifier spelled "echo" (in any letter case) is retagged PHPECHO.
    if t.value.lower() == 'echo':
        t.type = 'PHPECHO'
    return t


# Comments are matched and dropped.
def t_comment(t):
    r'(/\*(.|\n)*?\*/)|(//[^\n]*)'
    # [^\n]* instead of .*: with re.DOTALL a bare '.' would run past the end
    # of the line and swallow the rest of the input.
    # Keep the line counter correct across multi-line block comments.
    t.lexer.lineno += t.value.count('\n')


# PHPSTRING is just the quote character, recognised in every state.
def t_ANY_PHPSTRING(t):
    r'"'
    # A quote seen inside the 'string' state closes the string; a quote seen
    # anywhere else opens one.
    if t.lexer.current_state() == 'string':
        t.lexer.begin('INITIAL')  # back to the default state
    else:
        t.lexer.begin('string')  # start scanning string contents
    return t


# Raw string contents: escaped characters or anything except '$' and '"'.
t_string_STR = r'(\\.|[^$"])+'

# Nothing is skipped inside a string.
t_string_ignore = ''


# The 'string' state needs its own error handler.
def t_string_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)


# Insignificant characters skipped between tokens, so that `$var=$value`
# and `$var   =   $value` produce the same token stream.
t_ignore = ' \r\t\f'


def t_newline(t):
    r'\n+'
    t.lexer.lineno += len(t.value)


# Called for input that matches no rule: report the character, skip it.
def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)


lexer = lex.lex(reflags=re.UNICODE | re.DOTALL | re.IGNORECASE)

if __name__ == "__main__":
    data = '''
<?php
$val = 5;
$result = substr( "foobar", 2*(7-$val) );
echo " : $result";
'''

    lexer.input(data)

    while True:
        tok = lexer.token()  # fetch the next token
        if not tok:
            break  # input exhausted
        print(tok)
php -> [PHPSTART phpbody]?
phpbody -> [phpline phpcolons]*
phpcolons -> [PHPCOLON]+
phpline -> assign | func | [PHPECHO args]
assign -> PHPVAR PHPEQUAL expr
expr -> [fact | expr PLUSMINUS fact]
fact -> [term | fact DIVMUL term]
term -> [arg | PHPOPEN expr PHPCLOSE]
func -> PHPFUNC PHPOPEN args PHPCLOSE
args -> [expr [PHPCOMA expr]*]?
arg -> string | phpvar | PHPNUM | func
string -> PHPSTRING str PHPSTRING
str -> [STR | str phpvar]?
phpvar -> PHPVAR
def p_str(p):
    '''str :
           | STR
           | str phpvar'''
    # The docstring is the grammar rule; each alternative sits on its own
    # line, starting with '|'.
    # len(p) tells the alternatives apart: 1 = empty production,
    # 2 = a single STR token, otherwise an existing str followed by a phpvar.
    if len(p) == 1:
        p[0] = Node('str', [''])
    elif len(p) == 2:
        p[0] = Node('str', [p[1]])
    else:
        # Append the variable to the node built so far and pass it up.
        p[0] = p[1].add_parts([p[2]])
'''str :
       | STR
       | str phpvar'''
# The three alternatives of `str`, one function per alternative.

def p_str_empty(p):
    '''str :'''
    # Empty production: a str node holding one empty part.
    empty_node = Node('str', [''])
    p[0] = empty_node


def p_str_raw(p):
    '''str : STR'''
    # A single raw chunk of text.
    text = p[1]
    p[0] = Node('str', [text])


def p_str_var(p):
    '''str : str phpvar'''
    # Append the variable to the node built so far and pass it up.
    so_far, variable = p[1], p[2]
    p[0] = so_far.add_parts([variable])
class Node:
    # Tree node: `type` is the node label, `parts` the ordered children
    # (plain values or other Node objects).

    def __init__(self, type, parts):
        self.type = type
        # Store a copy: add_parts() extends this list in place and must not
        # modify the list object the caller passed in.
        self.parts = list(parts)

    def parts_str(self):
        # One child per line, each rendered with str().
        return "\n".join(str(part) for part in self.parts)

    def __repr__(self):
        # The label, then the children indented one tab deeper; nested nodes
        # indent further because their newlines are re-indented here.
        return self.type + ":\n\t" + self.parts_str().replace("\n", "\n\t")

    def add_parts(self, parts):
        # Append `parts` and return self so the call can be used as a value.
        self.parts += parts
        return self
# coding=utf8
from lexer import tokens
import ply.yacc as yacc


class Node:
    # Tree node: `type` is the node label, `parts` the ordered children
    # (plain values or other Node objects).

    def __init__(self, type, parts):
        self.type = type
        # Store a copy: add_parts() extends this list in place and must not
        # modify the list object the caller passed in.
        self.parts = list(parts)

    def parts_str(self):
        # One child per line, each rendered with str().
        return "\n".join(str(part) for part in self.parts)

    def __repr__(self):
        # The label, then the children indented one tab deeper.
        return self.type + ":\n\t" + self.parts_str().replace("\n", "\n\t")

    def add_parts(self, parts):
        # Append `parts` and return self so the call can be used as a value.
        self.parts += parts
        return self


# In every p_* function the docstring is the grammar rule, so explanations go
# in comments. Each alternative of a rule sits on its own line, starting
# with '|'. len(p) is 1 + the number of symbols in the alternative matched.

def p_php(p):
    '''php :
           | PHPSTART phpbody'''
    if len(p) == 1:
        p[0] = None  # empty input: no tree
    else:
        p[0] = p[2]


def p_phpbody(p):
    '''phpbody :
               | phpbody phpline phpcolons'''
    if len(p) > 1:
        if p[1] is None:
            p[1] = Node('body', [])
        # Append the new statement to the body collected so far.
        p[0] = p[1].add_parts([p[2]])
    else:
        p[0] = Node('body', [])


def p_phpcolons(p):
    '''phpcolons : PHPCOLON
                 | phpcolons PHPCOLON'''
    # One or more ';' - nothing is added to the tree.


def p_phpline(p):
    '''phpline : assign
               | func
               | PHPECHO args'''
    if len(p) == 2:
        p[0] = p[1]
    else:
        p[0] = Node('echo', [p[2]])


def p_assign(p):
    '''assign : PHPVAR PHPEQUAL expr'''
    p[0] = Node('assign', [p[1], p[3]])


def p_expr(p):
    '''expr : fact
            | expr PLUSMINUS fact'''
    if len(p) == 2:
        p[0] = p[1]
    else:
        # The operator text itself becomes the node label.
        p[0] = Node(p[2], [p[1], p[3]])


def p_fact(p):
    '''fact : term
            | fact DIVMUL term'''
    if len(p) == 2:
        p[0] = p[1]
    else:
        p[0] = Node(p[2], [p[1], p[3]])


def p_term(p):
    '''term : arg
            | PHPOPEN expr PHPCLOSE'''
    if len(p) == 2:
        p[0] = p[1]
    else:
        p[0] = p[2]  # drop the parentheses


def p_func(p):
    '''func : PHPFUNC PHPOPEN args PHPCLOSE'''
    p[0] = Node('func', [p[1], p[3]])


def p_args(p):
    '''args :
            | expr
            | args PHPCOMA expr'''
    if len(p) == 1:
        p[0] = Node('args', [])
    elif len(p) == 2:
        p[0] = Node('args', [p[1]])
    else:
        p[0] = p[1].add_parts([p[3]])


def p_arg(p):
    '''arg : string
           | phpvar
           | PHPNUM
           | func'''
    p[0] = Node('arg', [p[1]])


def p_phpvar(p):
    '''phpvar : PHPVAR'''
    p[0] = Node('var', [p[1]])


def p_string(p):
    '''string : PHPSTRING str PHPSTRING'''
    p[0] = p[2]  # keep the contents, drop the two quote tokens


def p_str(p):
    '''str :
           | STR
           | str phpvar'''
    if len(p) == 1:
        p[0] = Node('str', [''])
    elif len(p) == 2:
        p[0] = Node('str', [p[1]])
    else:
        p[0] = p[1].add_parts([p[2]])


def p_error(p):
    print('Unexpected token: %s' % (p,))


parser = yacc.yacc()


def build_tree(code):
    # Parse `code` and return whatever the start rule produced.
    return parser.parse(code)
# coding=utf8
from parser import build_tree

# Sample PHP input: two assignments, a comment, and an echo with two
# arguments.
data = '''
<?php
$val = 5;
$result = substr( "foobar", 2*(7-$val) ); /* comment */
echo " : ", $result;
'''

result = build_tree(data)
print(result)
line:
	assign:
		$val
		arg:
			5
	assign:
		$result
		arg:
			func:
				substr
				args:
					arg:
						str:
							foobar
					*:
						arg:
							2
						-:
							arg:
								7
							arg:
								var:
									$val
	echo:
		args:
			arg:
				str:
					this is our result:
			arg:
				var:
					$result
Source: https://habr.com/ru/post/191252/
All Articles