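<expression> = <expression_1> <operation_sign> <expression_2>
<expression_1> = <expression> (<multiplication_sign> | <division_sign>) <expression>
<expression> = <expression_1> "+" <expression_2>
<expression_1> = <expression> ("*" | "/") <expression>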
(* document *)
<document> = {<document_part>} <END>
<document_part> = <block> | <empty_tag> | <comment> | <macro_tag> | <text>
<block> = <opening_tag> {<document_part>} <closing_tag>

(* tags *)
<opening_tag> = "<" {<ws>} <block_tag_name> [<attributes_list>] {<ws>} ">"
<closing_tag> = "<" "/" {<ws>} <block_tag_name> {<ws>} ">"
<empty_tag> = "<" ["!"] {<ws>} <empty_tag_name> [<attributes_list>] {<ws>} ["/"] ">"
<comment> = "<" "!" "--" <comment_text> "--" ">"
<macro_tag> = "<" "?" <macro_text> "?" ">"
<block_tag_name> = "a" | "abbr" | "address" | "article" | "aside" | "audio" | "b" | "bdo" | "blockquote" | "body" | "button" | "canvas" | "caption" | "cite" | "code" | "colgroup" | "data" | "datalist" | "dd" | "del" | "details" | "dfn" | "dialog" | "div" | "dl" | "dt" | "em" | "fieldset" | "figcaption" | "figure" | "footer" | "form" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "head" | "header" | "html" | "i" | "iframe" | "ins" | "kbd" | "label" | "legend" | "li" | "main" | "map" | "mark" | "meter" | "nav" | "noscript" | "object" | "ol" | "optgroup" | "option" | "output" | "p" | "picture" | "pre" | "progress" | "q" | "ruby" | "rb" | "rt" | "rtc" | "rp" | "s" | "samp" | "script" | "section" | "select" | "small" | "span" | "strong" | "style" | "sub" | "summary" | "sup" | "table" | "tbody" | "td" | "template" | "textarea" | "tfoot" | "th" | "thead" | "time" | "title" | "tr" | "track" | "u" | "ul" | "var" | "video"
<empty_tag_name> = "area" | "base" | "br" | "col" | "embed" | "hr" | "img" | "input" | "link" | "menuitem" | "meta" | "param" | "source" | "track" | "wbr"

(* attributes *)
<attributes_list> = <ws> {<ws>} <attribute> {<ws> {<ws>} <attribute>}
<attribute> = <empty_attribute> | <unquoted_attribute> | <single_quoted_attribute> | <double_quoted_attribute>
<empty_attribute> = <attribute_name>
<unquoted_attribute> = <attribute_name> {<ws>} "=" {<ws>} <unquoted_attribute_value>
<single_quoted_attribute> = <attribute_name> {<ws>} "=" {<ws>} "'" <single_quoted_attribute_value> "'"
<double_quoted_attribute> = <attribute_name> {<ws>} "=" {<ws>} "\"" <double_quoted_attribute_value> "\""
<attribute_name> = (<letter> | <digit>) {<letter> | <digit>}

(* attribute values *)
<unquoted_attribute_value> = /[^\s"'=<>/]/ {/[^\s"'=<>/]/}
<single_quoted_attribute_value> = /[^']/ {/[^']/}
<double_quoted_attribute_value> = /[^"]/ {/[^"]/}

(* nonterminals *)
<text> = {/[^<>]/}
<comment_text> = ...
<macro_text> = ...
<letter> = /[a-zA-Z]/
<digit> = /[0-9]/
<ws> = " " | "\t" | "\n"

(* terminals *)
"<", ">", "/", "!", "?", " ", "\t", "\n"
// Kinds of tokens the lexer can produce. Each kind owns a distinct bit, so
// several kinds can be OR-ed together into a single integer value (the lexer
// uses such combinations as "one of these is expected next" states).
enum Token_type {
    END                           = 1 << 0,   // end of input
    TEXT                          = 1 << 1,   // character data between tags
    OPENING_BLOCK_TAG_NAME        = 1 << 2,   // name inside <name ...>
    CLOSING_BLOCK_TAG_NAME        = 1 << 3,   // name inside </name>
    EMPTY_TAG_NAME                = 1 << 4,   // name of a tag without a body
    COMMENT                       = 1 << 5,   // <!-- ... -->
    MACRO_TAG                     = 1 << 6,   // <? ... ?>
    ATTRIBUTE_NAME                = 1 << 7,
    UNQUOTED_ATTRIBUTE_VALUE      = 1 << 8,
    SINGLE_QUOTED_ATTRIBUTE_VALUE = 1 << 9,
    DOUBLE_QUOTED_ATTRIBUTE_VALUE = 1 << 10
};
void Lexer::process (const char &c) { switch (curr_token_type) { case END: { throw string("unexpected ending!"); break; } case TEXT: { if (c == '>') throw string("unexpected symbol: \">\"!"); else if (c == '<') { if (!buffer.empty()) { add(buffer, TEXT); buffer.clear(); } curr_token_type = OPENING_BLOCK_TAG_NAME | CLOSING_BLOCK_TAG_NAME | EMPTY_TAG_NAME | COMMENT | MACRO_TAG; } else buffer.push_back(c); break; } case OPENING_BLOCK_TAG_NAME: { throw string("error!"); break; } case CLOSING_BLOCK_TAG_NAME: { if (c == '<') throw string("unexpected symbol: \"<\"!"); else if (c == '/') throw string("unexpected symbol: \"<\"!"); else if (c == '!') throw string("unexpected symbol: \"!\"!"); else if (c == '?') throw string("unexpected symbol: \"?\"!"); else if (c == ' ') throw string("unexpected symbol: \" \"!"); else if (c == '\t') throw string("unexpected symbol: \"\\t\"!"); else if (c == '\n') throw string("unexpected symbol: \"\\n\"!"); else if (c == '>') { for (unsigned int i(0); i < BLOCK_TAGS_COUNT; i++) if (buffer == block_tags[i]) { add(buffer, CLOSING_BLOCK_TAG_NAME); buffer.clear(); curr_token_type = TEXT; break; } } else buffer.push_back(c); break; } case EMPTY_TAG_NAME: { throw string("error!"); break; } case COMMENT: { ... break; } case MACRO_TAG: { ... break; } case OPENING_BLOCK_TAG_NAME | CLOSING_BLOCK_TAG_NAME | EMPTY_TAG_NAME | COMMENT | MACRO_TAG: { ... break; } case EMPTY_TAG_NAME | COMMENT: { ... break; } case ATTRIBUTE_NAME: { ... break; } case ATTRIBUTE_NAME | UNQUOTED_ATTRIBUTE_VALUE | SINGLE_QUOTED_ATTRIBUTE_VALUE | DOUBLE_QUOTED_ATTRIBUTE_VALUE: { ... break; } case UNQUOTED_ATTRIBUTE_VALUE | SINGLE_QUOTED_ATTRIBUTE_VALUE | DOUBLE_QUOTED_ATTRIBUTE_VALUE: { ... break; } case UNQUOTED_ATTRIBUTE_VALUE: { ... break; } case SINGLE_QUOTED_ATTRIBUTE_VALUE: { ... break; } case DOUBLE_QUOTED_ATTRIBUTE_VALUE: { ... break; } } }
/**
 * Splits the contents of `file` into tokens.
 *
 * Resets the token counter, starts in the TEXT state and passes every
 * character of the stream to process(). After the last character any
 * leftover buffer content is emitted as a TEXT token, followed by the END
 * token.
 *
 * @param file  input stream to read from
 * @throws string  "lexer: <line>,<column>: <message>" when process() rejects a
 *                 character or when the input ends with buffered characters
 *                 while the lexer is not in a state that includes TEXT
 */
void Lexer::disassemble (ifstream &file) {
    tokens_count = 0;
    curr_token_type = TEXT;

    // 1-based position of the character currently being processed.
    unsigned long line(1), pos(1);

    try {
        char c;
        // Extract with get(char&) and test the stream state: storing the int
        // result of get() in a char and comparing it with EOF either stops
        // early on byte 0xFF or never stops where char is unsigned.
        while (file.get(c)) {
            process(c);

            // Advance only after processing, so that an error thrown by
            // process() is reported at the offending character.
            if (c == '\n') {
                pos = 1;
                line++;
            }
            else
                pos++;
        }

        if (!buffer.empty()) {
            // Leftover characters are only valid as trailing text. The test
            // must use '&': with '|' the expression could never be zero and
            // the check was dead.
            if (!(curr_token_type & TEXT))
                throw string("text was expected!");
            add(buffer, TEXT);
            buffer.clear();
        }

        add("", END);
    }
    catch (const string &error) {
        throw string("lexer: " + to_string(line) + "," + to_string(pos) + ": " + error);
    }
}
<!DOCTYPE html> <html lang="ru"> <head> <meta http-equiv="content-type" content="text/html" charset="utf-8" /> <meta name="author" content="Interquadro" /> <meta name="description" content="" /> <meta name="keywords" content=""> <meta name="viewport" content="width=device-width, initial-scale=1" /> <meta name="format-detection" content="telephone=no" /> <meta http-equiv="x-rim-auto-match" content="telephone=none" /> <meta name="referrer" content="no-referrer" /> <meta name="_suburl" content="" /> <title></title> <link rel="shortcut icon" href=".ico" /> <link rel="stylesheet" type="text/css" href=".css" title="" /> <!--[if lt IE 9]> <script src="http://html5shiv.googlecode.com/svn/trunk/html5-els.js"></script> <![endif]--> </head> <body> <header> <div id="intro"> </div> </header> <nav> <ul id="nav"> <li class="nav"><a href="#">Home</a></li> <li class="nav"><a href="#">Review</a></li> <li class="nav"><a href="">Help</a></li> </ul> </nav> <main id="content"> <?php ?> </main> <footer> <hr /> <small id="copyright">Copyright © 2019. All Rights Reserved.</small> </footer> </body> </html>
["! DOCTYPE": EMPTY_TAG_NAME] ["html": ATTRIBUTE_NAME] [" ": TEXT] ["html": OPENING_BLOCK_TAG_NAME] ["lang": ATTRIBUTE_NAME] ["ru": DOUBLE_QUOTED_ATTRIBUTE_VALUE] [" ": TEXT] ["head": OPENING_BLOCK_TAG_NAME] [" ": TEXT] ["meta": EMPTY_TAG_NAME] ["http-equiv": ATTRIBUTE_NAME] ["content-type": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["content": ATTRIBUTE_NAME] ["text / html": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["charset": ATTRIBUTE_NAME] ["utf-8": DOUBLE_QUOTED_ATTRIBUTE_VALUE] [" ": TEXT] ["meta": EMPTY_TAG_NAME] ["name": ATTRIBUTE_NAME] ["author": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["content": ATTRIBUTE_NAME] ["Interquadro": DOUBLE_QUOTED_ATTRIBUTE_VALUE] [" ": TEXT] ["meta": EMPTY_TAG_NAME] ["name": ATTRIBUTE_NAME] ["description": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["content": ATTRIBUTE_NAME] ["": DOUBLE_QUOTED_ATTRIBUTE_VALUE] [" ": TEXT] ["meta": EMPTY_TAG_NAME] ["name": ATTRIBUTE_NAME] ["keywords": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["content": ATTRIBUTE_NAME] ["": DOUBLE_QUOTED_ATTRIBUTE_VALUE] [" ": TEXT] ["meta": EMPTY_TAG_NAME] ["name": ATTRIBUTE_NAME] ["viewport": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["content": ATTRIBUTE_NAME] ["width = device-width, initial-scale = 1": DOUBLE_QUOTED_ATTRIBUTE_VALUE] [" ": TEXT] ["meta": EMPTY_TAG_NAME] ["name": ATTRIBUTE_NAME] ["format-detection": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["content": ATTRIBUTE_NAME] ["telephone = no": DOUBLE_QUOTED_ATTRIBUTE_VALUE] [" ": TEXT] ["meta": EMPTY_TAG_NAME] ["http-equiv": ATTRIBUTE_NAME] ["x-rim-auto-match": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["content": ATTRIBUTE_NAME] ["telephone = none": DOUBLE_QUOTED_ATTRIBUTE_VALUE] [" ": TEXT] ["meta": EMPTY_TAG_NAME] ["name": ATTRIBUTE_NAME] ["referrer": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["content": ATTRIBUTE_NAME] ["no-referrer": DOUBLE_QUOTED_ATTRIBUTE_VALUE] [" ": TEXT] ["meta": EMPTY_TAG_NAME] ["name": ATTRIBUTE_NAME] ["_suburl": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["content": ATTRIBUTE_NAME] ["": DOUBLE_QUOTED_ATTRIBUTE_VALUE] [" ": TEXT] ["title": OPENING_BLOCK_TAG_NAME] ["title": 
CLOSING_BLOCK_TAG_NAME] [" ": TEXT] ["link": EMPTY_TAG_NAME] ["rel": ATTRIBUTE_NAME] ["shortcut icon": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["href": ATTRIBUTE_NAME] [".ico": DOUBLE_QUOTED_ATTRIBUTE_VALUE] [" ": TEXT] ["link": EMPTY_TAG_NAME] ["rel": ATTRIBUTE_NAME] ["stylesheet": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["type": ATTRIBUTE_NAME] ["text / css": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["href": ATTRIBUTE_NAME] [".css": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["title": ATTRIBUTE_NAME] ["": DOUBLE_QUOTED_ATTRIBUTE_VALUE] [" ": TEXT] ["[if lt IE 9]> <script src = "http://html5shiv.googlecode.com/svn/trunk/html5-els.js"> </ script> <! [endif] ": COMMENT] [" ": TEXT] ["head": CLOSING_BLOCK_TAG_NAME] [" ": TEXT] ["body": OPENING_BLOCK_TAG_NAME] [" ": TEXT] ["header": OPENING_BLOCK_TAG_NAME] [" ": TEXT] ["div": OPENING_BLOCK_TAG_NAME] ["id": ATTRIBUTE_NAME] ["intro": DOUBLE_QUOTED_ATTRIBUTE_VALUE] [" ": TEXT] ["div": CLOSING_BLOCK_TAG_NAME] [" ": TEXT] ["header": CLOSING_BLOCK_TAG_NAME] [" ": TEXT] ["nav": OPENING_BLOCK_TAG_NAME] [" ": TEXT] ["ul": OPENING_BLOCK_TAG_NAME] ["id": ATTRIBUTE_NAME] ["nav": DOUBLE_QUOTED_ATTRIBUTE_VALUE] [" ": TEXT] ["li": OPENING_BLOCK_TAG_NAME] ["class": ATTRIBUTE_NAME] ["nav": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["a": OPENING_BLOCK_TAG_NAME] ["href": ATTRIBUTE_NAME] ["#": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["Home": TEXT] ["a": CLOSING_BLOCK_TAG_NAME] ["li": CLOSING_BLOCK_TAG_NAME] [" ": TEXT] ["li": OPENING_BLOCK_TAG_NAME] ["class": ATTRIBUTE_NAME] ["nav": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["a": OPENING_BLOCK_TAG_NAME] ["href": ATTRIBUTE_NAME] ["#": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["Review": TEXT] ["a": CLOSING_BLOCK_TAG_NAME] ["li": CLOSING_BLOCK_TAG_NAME] [" ": TEXT] ["li": OPENING_BLOCK_TAG_NAME] ["class": ATTRIBUTE_NAME] ["nav": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["a": OPENING_BLOCK_TAG_NAME] ["href": ATTRIBUTE_NAME] ["": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["Help": TEXT] ["a": CLOSING_BLOCK_TAG_NAME] ["li": CLOSING_BLOCK_TAG_NAME] [" ": TEXT] ["ul": CLOSING_BLOCK_TAG_NAME] [" 
": TEXT] ["nav": CLOSING_BLOCK_TAG_NAME] [" ": TEXT] ["main": OPENING_BLOCK_TAG_NAME] ["id": ATTRIBUTE_NAME] ["content": DOUBLE_QUOTED_ATTRIBUTE_VALUE] [" ": TEXT] ["php": MACRO_TAG] [" ": TEXT] ["main": CLOSING_BLOCK_TAG_NAME] [" ": TEXT] ["footer": OPENING_BLOCK_TAG_NAME] [" ": TEXT] ["hr": EMPTY_TAG_NAME] [" ": TEXT] ["small": OPENING_BLOCK_TAG_NAME] ["id": ATTRIBUTE_NAME] ["copyright": DOUBLE_QUOTED_ATTRIBUTE_VALUE] ["Copyright © 2019. All Rights Reserved." : TEXT] ["small": CLOSING_BLOCK_TAG_NAME] [" ": TEXT] ["footer": CLOSING_BLOCK_TAG_NAME] [" ": TEXT] ["body": CLOSING_BLOCK_TAG_NAME] [" ": TEXT] ["html": CLOSING_BLOCK_TAG_NAME] [" ": TEXT] ["": END]
void Parser::parse (const Lexer &lexer) { Block * open_block = (Block*) tree; Node * last_node = (Node*) tree; try { unsigned long long size = lexer.count(); for (unsigned long long i(0); i < size-2; i++) { switch (lexer[i].type) { case Lexer::TEXT: { for (unsigned int j(0); j < TEXT_TAGS_COUNT; j++) if (open_block->get_name() == text_tags[j]) last_node = open_block->add("TEXT", lexer[i].lexeme); break; } case Lexer::OPENING_BLOCK_TAG_NAME: { last_node = open_block = open_block->open(lexer[i].lexeme); break; } case Lexer::CLOSING_BLOCK_TAG_NAME: { if (lexer[i].lexeme != open_block->get_name()) throw string("unexpected closing tag: </" + lexer[i].lexeme + ">"); open_block = open_block->close(); break; } case Lexer::EMPTY_TAG_NAME: { last_node = open_block->add(lexer[i].lexeme); break; } case Lexer::COMMENT: { last_node = open_block->add("COMMENT", lexer[i].lexeme); break; } case Lexer::MACRO_TAG: { last_node = open_block->add("MACRO", lexer[i].lexeme); break; } case Lexer::ATTRIBUTE_NAME: { last_node->add_attr(lexer[i].lexeme, lexer[i].lexeme); break; } case Lexer::UNQUOTED_ATTRIBUTE_VALUE: { last_node->set_last_attr(lexer[i].lexeme); break; } case Lexer::SINGLE_QUOTED_ATTRIBUTE_VALUE: { last_node->set_last_attr(lexer[i].lexeme); break; } case Lexer::DOUBLE_QUOTED_ATTRIBUTE_VALUE: { last_node->set_last_attr(lexer[i].lexeme); break; } case Lexer::END: { if (open_block->get_type() != Node::ROOT) throw string("unexpected ending!"); open_block->close(); } } } } catch (const string &error) { throw string("parser: " + error); } }
|
+--<ROOT>
   |
   +--<! DOCTYPE>
   |
   +--<html>
      |
      +--<head>
      |  |
      |  +--<meta>
      |  |
      |  +--<meta>
      |  |
      |  +--<meta>
      |  |
      |  +--<meta>
      |  |
      |  +--<meta>
      |  |
      |  +--<meta>
      |  |
      |  +--<meta>
      |  |
      |  +--<meta>
      |  |
      |  +--<meta>
      |  |
      |  +--<title>
      |  |
      |  +--<link>
      |  |
      |  +--<link>
      |  |
      |  +--<COMMENT>
      |
      +--<body>
         |
         +--<header>
         |  |
         |  +--<div>
         |
         +--<nav>
         |  |
         |  +--<ul>
         |     |
         |     +--<li>
         |     |  |
         |     |  +--<a>
         |     |
         |     +--<li>
         |     |  |
         |     |  +--<a>
         |     |
         |     +--<li>
         |        |
         |        +--<a>
         |
         +--<main>
         |  |
         |  +--<MACRO>
         |
         +--<footer>
            |
            +--<hr>
            |
            +--<small>
Source: https://habr.com/ru/post/442964/
All Articles