if
appears, allowing you to declare variables right in the block header. This is quite convenient because the constructions of the form Foo foo = make_foo(); if(foo.is_nice()) { // do work with foo } // never use foo again // foo gets deleted
if(Foo foo = make_foo(); foo.is_nice()) { // do work with foo } // foo gets deleted // never use foo again (well, you can't anyway)
if
in Python code as much as I do and want to learn how to quickly write simple parsers, then read on. In this article we will try to write a short and elegant parser for JSON in Python 2 (without any additional modules, of course). root ::= value value ::= string | number | object | array | 'true' | 'false' | 'null' array ::= '[' ']' | '[' comma-separated-values ']' comma-separated-values ::= value | value ',' comma-separated-values object ::= '{' '}' | '{' comma-separated-keyvalues '}' comma-separated-keyvalues ::= keyvalue | keyvalue ',' comma-separated-keyvalues keyvalue ::= string ':' value
string
and number
- they, along with all the strings in quotes, will be our tokens. import re # re.DOTALL number_regex = re.compile(r"(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)\s*(.*)", re.DOTALL) def parse_number(src): match = number_regex.match(src) if match is not None: number, src = match.groups() return eval(number), src # eval - , string_regex = re.compile(r"('(?:[^\\']|\\['\\/bfnrt]|\\u[0-9a-fA-F]{4})*?')\s*(.*)", re.DOTALL) def parse_string(src): match = string_regex.match(src) if match is not None: string, src = match.groups() return eval(string), src # JSON' # ,
def parse_word(word, value=None):
    """Build a parser that recognises the literal token ``word``.

    The returned function takes the remaining source text.  When the text
    starts with ``word`` it returns ``(value, rest)``, where ``rest`` is the
    text after the token with leading whitespace removed; otherwise it
    returns ``None``.
    """
    token_length = len(word)

    def result(src):
        # startswith() is case-sensitive, so only the exact spelling matches.
        if not src.startswith(word):
            return None
        remainder = src[token_length:]
        # Drop the whitespace after the token so the next parser starts on
        # meaningful input.
        return value, remainder.lstrip()

    # Give each generated parser a descriptive name.
    result.__name__ = "parse_%s" % word
    return result


parse_true = parse_word("true", True)
parse_false = parse_word("false", False)
parse_null = parse_word("null", None)
None
for failure. What would the parse_value
function look like for the grammar above? Usually something like this: def parse_value(src): # match = parse_string(src) if match is not None: # ! return match # ; match = parse_number(src) if match is not None: return match # . ... # ...
if
got me!return
with yield
! Now they return generators: empty if parsing failed, and with exactly one element if it succeeded. Yes, we are turning our principle number 2 by 90 degrees: we will now write all our functions in this style: number_regex = re.compile(r"(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)\s*(.*)", re.DOTALL) def parse_number(src): match = number_regex.match(src) if match is not None: number, src = match.groups() yield eval(number), src # yield, . # , . string_regex = re.compile(r"('(?:[^\\']|\\['\\/bfnrt]|\\u[0-9a-fA-F]{4})*?')\s*(.*)", re.DOTALL) def parse_string(src): match = string_regex.match(src) if match is not None: string, src = match.groups() yield eval(string), src def parse_word(word, value=None): l = len(word) def result(src): if src.startswith(word): yield value, src[l:].lstrip() result.__name__ = "parse_%s" % word return result # -, yield' parse_true = parse_word("true", True) parse_false = parse_word("false", False) parse_null = parse_word("null", None)
parse_value
turn parse_value
? At first glance, in something like this: def parse_value(src): for match in parse_string(src): yield match return for match in parse_number(src): yield match return # ...
# itertools.chain walks the alternatives lazily, one after another.
from itertools import chain


def parse_value(src):
    """Yield the first successful parse of ``src`` as a value, if any.

    The alternatives are tried in the order string, number, array, object,
    ``true``, ``false``, ``null``.  At most one result is yielded; nothing
    is yielded when none of the alternatives produces a result.
    """
    alternatives = chain(
        parse_string(src),
        parse_number(src),
        parse_array(src),
        parse_object(src),
        parse_true(src),
        parse_false(src),
        parse_null(src),
    )
    for outcome in alternatives:
        yield outcome
        # Stop after the first hit so the remaining alternatives never run.
        break
return
ensures that no extra work is performed once parsing succeeds somewhere in the middle of the list. Now for the parse_array
function. It should look something like this: parse_left_square_bracket = parse_word("[") parse_right_square_bracket = parse_word("]") def parse_array(src): # tsrc, # "" for _, tsrc in parse_left_square_bracket(src): for _, tsrc in parse_right_square_bracket(tsrc): # , '[' ']' yield [], tsrc return # src -- for _, src in parse_left_square_bracket(src): for items, src in parse_comma_separated_values(src): for _, src in parse_right_square_bracket(src): yield items, src # yield,
if
, as promised, but something is still wrong ... Let's write a small helper function that will help us connect the parser functions in sequence, just like the chain
helped connect them in the "or" mode. This function will have to carefully take all the results and return all the first elements of the results (analysis results) and the last second element (the remaining unanalyzed part of the line). My version looks like this: def sequence(*funcs): if len(funcs) == 0: # , if' def result(src): yield (), src return result def result(src): for arg1, src in funcs[0](src): for others, src in sequence(*funcs[1:])(src): yield (arg1,) + others, src # return result
parse_left_square_bracket = parse_word("[")
parse_right_square_bracket = parse_word("]")
parse_empty_array = sequence(parse_left_square_bracket, parse_right_square_bracket)


def parse_array(src):
    """Yield ``(items, rest)`` when ``src`` starts with an array.

    The empty array ``[]`` is tried first; otherwise the input must be an
    opening bracket, comma-separated values and a closing bracket.
    """
    for _brackets, rest in parse_empty_array(src):
        yield [], rest
        # The empty form matched, so the non-empty form must not be tried.
        return

    parse_filled_array = sequence(
        parse_left_square_bracket,
        parse_comma_separated_values,
        parse_right_square_bracket,
    )
    for parts, rest in parse_filled_array(src):
        # parts is (left bracket, items, right bracket); only the items matter.
        _left, items, _right = parts
        yield items, rest
parse_comma_separated_values
is a piece of cake: parse_comma = parse_word(",") def parse_comma_separated_values(src): for (value, _, values), src in sequence( parse_value, parse_comma, parse_comma_separated_values # , if? )(src): yield [value] + values, src return for value, src in parse_value(src): yield [value], src
parse_comma
does not find another comma, and the subsequent parse_comma_separated_values
will not parse_comma_separated_values
executed anymore. parse_left_curly_bracket = parse_word("{") parse_right_curly_bracket = parse_word("}") parse_empty_object = sequence(parse_left_curly_bracket, parse_right_curly_bracket) def parse_object(src): for _, src in parse_empty_object(src): yield {}, src return for (_, items, _), src in sequence( parse_left_curly_bracket, parse_comma_separated_keyvalues, parse_right_curly_bracket, )(src): yield items, src parse_colon = parse_word(":") def parse_keyvalue(src): for (key, _, value), src in sequence( parse_string, parse_colon, parse_value )(src): yield {key: value}, src def parse_comma_separated_keyvalues(src): for (keyvalue, _, keyvalues), src in sequence( parse_keyvalue, parse_comma, parse_comma_separated_keyvalues, # , )(src): keyvalue.update(keyvalues) yield keyvalue, src return for keyvalue, src in parse_keyvalue(src): # , yield keyvalue, src
def parse(s):
    """Parse the whole string ``s`` and return the resulting value.

    Leading and trailing whitespace is ignored.

    Raises:
        ValueError: if ``parse_value`` does not produce exactly one result,
            or if non-whitespace text remains after the parsed value.
    """
    outcomes = list(parse_value(s.strip()))
    if len(outcomes) != 1:
        raise ValueError("not a valid JSON string")

    result, leftover = outcomes[0]
    # Anything left after the value means the input had trailing garbage.
    if leftover.strip():
        raise ValueError("not a valid JSON string")
    return result
"""A tiny JSON-like parser built from generator-based parser combinators.

Every ``parse_*`` function takes the remaining source text and returns an
iterator that yields one ``(value, rest)`` pair on success and nothing on
failure, where ``rest`` is the text that is still unparsed.

NOTE: string literals are delimited by single quotes (``'``) here, as encoded
in ``string_regex``.
"""
from itertools import chain
import re


def sequence(*funcs):
    """Combine parsers so that all of them must match, one after another.

    Returns a parser that yields ``(values, rest)``, where ``values`` is a
    tuple holding the result of each parser in ``funcs`` in order and
    ``rest`` is the text left after the last one.  With no parsers the
    result always succeeds with an empty tuple and consumes nothing.
    """
    if not funcs:
        def result(src):
            yield (), src
        return result

    head = funcs[0]
    # Build the combinator for the remaining parsers once, here, instead of
    # rebuilding it for every match produced by the first parser.
    tail = sequence(*funcs[1:])

    def result(src):
        for first, rest in head(src):
            for others, remainder in tail(rest):
                yield (first,) + others, remainder
    return result


number_regex = re.compile(r"(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)\s*(.*)", re.DOTALL)


def parse_number(src):
    """Yield ``(number, rest)`` if ``src`` starts with a numeric literal."""
    match = number_regex.match(src)
    if match is not None:
        number, rest = match.groups()
        # SECURITY: eval() is acceptable here only because number_regex lets
        # nothing but a plain numeric literal through.
        yield eval(number), rest


# Raw control characters (U+0000-U+001F) are excluded from string bodies: JSON
# forbids them unescaped, and a raw newline inside the literal would make the
# eval() below raise SyntaxError instead of the parse simply failing.
string_regex = re.compile(
    r"('(?:[^\\'\x00-\x1f]|\\['\\/bfnrt]|\\u[0-9a-fA-F]{4})*?')\s*(.*)",
    re.DOTALL,
)


def parse_string(src):
    """Yield ``(string, rest)`` if ``src`` starts with a quoted string."""
    match = string_regex.match(src)
    if match is not None:
        string, rest = match.groups()
        # SECURITY: eval() is acceptable here only because string_regex
        # whitelists the literal's contents.
        # NOTE(review): escapes are decoded by Python's rules, not JSON's
        # (e.g. ``\/`` keeps its backslash) -- confirm this is acceptable.
        yield eval(string), rest


def parse_word(word, value=None):
    """Build a parser for the literal token ``word`` that yields ``value``.

    The generated parser yields ``(value, rest)`` when the text starts with
    ``word``; ``rest`` has its leading whitespace stripped.
    """
    token_length = len(word)

    def result(src):
        if src.startswith(word):
            yield value, src[token_length:].lstrip()

    result.__name__ = "parse_%s" % word
    return result


parse_true = parse_word("true", True)
parse_false = parse_word("false", False)
parse_null = parse_word("null", None)


def parse_value(src):
    """Yield the first successful parse of ``src`` as any kind of value."""
    for match in chain(
        parse_string(src),
        parse_number(src),
        parse_array(src),
        parse_object(src),
        parse_true(src),
        parse_false(src),
        parse_null(src),
    ):
        yield match
        # Stop at the first alternative that succeeds.
        return


parse_left_square_bracket = parse_word("[")
parse_right_square_bracket = parse_word("]")
parse_empty_array = sequence(parse_left_square_bracket, parse_right_square_bracket)


def parse_array(src):
    """Yield ``(list, rest)`` if ``src`` starts with an array."""
    for _, rest in parse_empty_array(src):
        yield [], rest
        return

    for (_, items, _), rest in sequence(
        parse_left_square_bracket,
        parse_comma_separated_values,
        parse_right_square_bracket,
    )(src):
        yield items, rest


parse_comma = parse_word(",")


def parse_comma_separated_values(src):
    """Yield ``(values, rest)`` for one or more comma-separated values.

    The first value is parsed exactly once and then optionally extended by
    ``',' more-values``.  Re-parsing it for the "single value" fallback would
    make the running time exponential in the nesting depth.
    """
    for value, rest in parse_value(src):
        for (_, values), remainder in sequence(
            parse_comma, parse_comma_separated_values
        )(rest):
            yield [value] + values, remainder
            return
        # No further ", value" follows: this was the last element.
        yield [value], rest


parse_left_curly_bracket = parse_word("{")
parse_right_curly_bracket = parse_word("}")
parse_empty_object = sequence(parse_left_curly_bracket, parse_right_curly_bracket)


def parse_object(src):
    """Yield ``(dict, rest)`` if ``src`` starts with an object."""
    for _, rest in parse_empty_object(src):
        yield {}, rest
        return

    for (_, items, _), rest in sequence(
        parse_left_curly_bracket,
        parse_comma_separated_keyvalues,
        parse_right_curly_bracket,
    )(src):
        yield items, rest


parse_colon = parse_word(":")


def parse_keyvalue(src):
    """Yield ``({key: value}, rest)`` for a single ``string ':' value`` pair."""
    for (key, _, value), rest in sequence(
        parse_string, parse_colon, parse_value
    )(src):
        yield {key: value}, rest


def parse_comma_separated_keyvalues(src):
    """Yield ``(dict, rest)`` for one or more comma-separated key/value pairs.

    As with values, the first pair is parsed only once.  When a key occurs
    more than once, the later occurrence wins.
    """
    for keyvalue, rest in parse_keyvalue(src):
        for (_, keyvalues), remainder in sequence(
            parse_comma, parse_comma_separated_keyvalues
        )(rest):
            keyvalue.update(keyvalues)
            yield keyvalue, remainder
            return
        yield keyvalue, rest


def parse(s):
    """Parse the complete string ``s`` and return the resulting value.

    Raises:
        ValueError: if ``s`` cannot be parsed as a single value, or if
            non-whitespace text remains after that value.
    """
    s = s.strip()
    match = list(parse_value(s))
    if len(match) != 1:
        raise ValueError("not a valid JSON string")
    result, rest = match[0]
    if rest.strip():
        raise ValueError("not a valid JSON string")
    return result
>>> import my_json >>> my_json.parse("null") >>> my_json.parse("true") True >>> my_json.parse("false") False >>> my_json.parse("0.31415926E1") 3.1415926 >>> my_json.parse("[1, true, '1']") [1, True, '1'] >>> my_json.parse("{}") {} >>> my_json.parse("{'a': 1, 'b': null}") {'a': 1, 'b': None}
chain
s and sequence
s. Fortunately, with this approach that is less inconvenient than it may seem. For example, if you need to try parsing an optional construct and act depending on whether it is present, you can write: for stuff, src in parse_optional_stuff(src): # -- break # else else: # -- pass
else
block of loops, which is executed if the loop ran to completion without a break
. It doesn’t look as attractive as our code in the article, but it’s definitely not worse than those if
we’ve got rid of so gracefully. Source: https://habr.com/ru/post/309242/
All Articles