diff options
author | Daniel Wilhelm <daniel@wili.li> | 2014-04-18 17:23:19 +0200 |
---|---|---|
committer | Daniel Wilhelm <daniel@wili.li> | 2014-04-18 17:23:19 +0200 |
commit | 0887aee8c54d0ed51bb2031431e2bcdafebb4c6e (patch) | |
tree | 69537ceb9787bb25ac363cc4e6cdaf0804d78363 /zenxml/parser.h | |
parent | 5.12 (diff) | |
download | FreeFileSync-0887aee8c54d0ed51bb2031431e2bcdafebb4c6e.tar.gz FreeFileSync-0887aee8c54d0ed51bb2031431e2bcdafebb4c6e.tar.bz2 FreeFileSync-0887aee8c54d0ed51bb2031431e2bcdafebb4c6e.zip |
5.13
Diffstat (limited to 'zenxml/parser.h')
-rw-r--r-- | zenxml/parser.h | 609 |
1 files changed, 609 insertions, 0 deletions
diff --git a/zenxml/parser.h b/zenxml/parser.h new file mode 100644 index 00000000..823cd793 --- /dev/null +++ b/zenxml/parser.h @@ -0,0 +1,609 @@ +// ************************************************************************** +// * This file is part of the zenXML project. It is distributed under the * +// * Boost Software License: http://www.boost.org/LICENSE_1_0.txt * +// * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved * +// ************************************************************************** + +#ifndef ZEN_XML_PARSER_HEADER_81248670213764583021432 +#define ZEN_XML_PARSER_HEADER_81248670213764583021432 + +#include <cstdio> +#include <cstddef> //ptrdiff_t; req. on Linux +#include <zen/string_traits.h> +#include "dom.h" +#include "error.h" + +namespace zen +{ +/** +\file +\brief Convert an XML document object model (class XmlDoc) to and from a byte stream representation. +*/ + +///Save XML document as a byte stream +/** +\param doc Input XML document +\param lineBreak Line break, default: carriage return + new line +\param indent Indentation, default: four space characters +\return Output byte stream +*/ +std::string serialize(const XmlDoc& doc, + const std::string& lineBreak = "\r\n", + const std::string& indent = " "); //throw () + +///Exception thrown due to an XML parsing error +struct XmlParsingError : public XmlError +{ + XmlParsingError(size_t rowNo, size_t colNo) : row(rowNo), col(colNo) {} + ///Input file row where the parsing error occured + size_t row; //beginning with 0 + ///Input file column where the parsing error occured + size_t col; // +}; + + +///Load XML document from a byte stream +/** +\param stream Input byte stream +\param doc Output XML document +\throw XmlParsingError +*/ +void parse(const std::string& stream, XmlDoc& doc); //throw XmlParsingError + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +//---------------------------- implementation ---------------------------- +//see: http://www.w3.org/TR/xml/ + +namespace implementation +{ +inline +std::pair<char, char> hexify(unsigned char c) +{ + auto hexifyDigit = [](int num) -> char //input [0, 15], output 0-9, A-F + { + assert(0 <= num && num <= 15); //guaranteed by design below! + return static_cast<char>(num <= 9 ? //no signed/unsigned char problem here! + '0' + num : + 'A' + (num - 10)); + }; + return std::make_pair(hexifyDigit(c / 16), hexifyDigit(c % 16)); +} + + +inline +char unhexify(char high, char low) +{ + auto unhexifyDigit = [](char hex) -> int //input 0-9, a-f, A-F; output range: [0, 15] + { + if ('0' <= hex && hex <= '9') //no signed/unsigned char problem here! + return hex - '0'; + else if ('A' <= hex && hex <= 'F') + return (hex - 'A') + 10; + else if ('a' <= hex && hex <= 'f') + return (hex - 'a') + 10; + assert(false); + return 0; + }; + return static_cast<unsigned char>(16 * unhexifyDigit(high) + unhexifyDigit(low)); //[!] convert to unsigned char first, then to char (which may be signed) +}; + + +template <class Predicate> inline +std::string normalize(const std::string& str, Predicate pred) //pred: unary function taking a char, return true if value shall be encoded as hex +{ + std::string output; + std::for_each(str.begin(), str.end(), + [&](char c) + { + if (c == '&') // + output += "&"; + else if (c == '<') //normalization mandatory: http://www.w3.org/TR/xml/#syntax + output += "<"; + else if (c == '>') // + output += ">"; + else if (pred(c)) + { + if (c == '\'') + output += "'"; + else if (c == '\"') + output += """; + else + { + output += "&#x"; + const auto hexDigits = hexify(c); //hexify beats "printNumber<std::string>("&#x%02X;", c)" by a nice factor of 3! + output += hexDigits.first; + output += hexDigits.second; + output += ';'; + } + } + else + output += c; + }); + return output; +} + +inline +std::string normalizeName(const std::string& str) +{ + return normalize(str, [](char c) { return isWhiteSpace(c) || c == '=' || c == '/' || c == '\'' || c == '\"'; }); +} + +inline +std::string normalizeElementValue(const std::string& str) +{ + return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32; }); +} + +inline +std::string normalizeAttribValue(const std::string& str) +{ + return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32 || c == '\'' || c == '\"'; }); +} + + +namespace +{ +std::string denormalize(const std::string& str) +{ + std::string output; + for (auto it = str.begin(); it != str.end(); ++it) + { + const char c = *it; + + if (c == '&') + { + auto checkEntity = [&](const char* placeholder, char realVal) -> bool + { + size_t strLen = strLength(placeholder); + + if (str.end() - it >= static_cast<int>(strLen) && std::equal(it, it + strLen, placeholder)) + { + output += realVal; + it += strLen - 1; + return true; + } + return false; + }; + + if (checkEntity("&", '&')) + continue; + if (checkEntity("<", '<')) + continue; + if (checkEntity(">", '>')) + continue; + if (checkEntity("'", '\'')) + continue; + if (checkEntity(""", '\"')) + continue; + + if (str.end() - it >= 6 && + it[1] == '#' && + it[2] == 'x' && + it[5] == ';') + { + output += unhexify(it[3], it[4]); + it += 5; + continue; + //unhexify beats "::sscanf(&it[3], "%02X", &tmp)" by a factor of 3000 for ~250000 calls!!! + } + + output += c; //unexpected char! + } + else if (c == '\r') //map all end-of-line characters to \n http://www.w3.org/TR/xml/#sec-line-ends + { + auto itNext = it + 1; + if (itNext != str.end() && *itNext == '\n') + ++it; + output += '\n'; + } + else + output += c; + }; + return output; +} + + +void serialize(const XmlElement& element, std::string& stream, + const std::string& lineBreak, + const std::string& indent, + size_t indentLevel) +{ + const std::string& nameFmt = normalizeName(element.getNameAs<std::string>()); + + for (size_t i = 0; i < indentLevel; ++i) + stream += indent; + + stream += '<' + nameFmt; + + auto attr = element.getAttributes(); + for (auto it = attr.first; it != attr.second; ++it) + stream += ' ' + normalizeName(it->first) + "=\"" + normalizeAttribValue(it->second) + "\""; + + //no support for mixed-mode content + auto iterPair = element.getChildren(); + if (iterPair.first != iterPair.second) //structured element + { + stream += '>' + lineBreak; + + std::for_each(iterPair.first, iterPair.second, + [&](const XmlElement & el) { serialize(el, stream, lineBreak, indent, indentLevel + 1); }); + + for (size_t i = 0; i < indentLevel; ++i) + stream += indent; + stream += "</" + nameFmt + '>' + lineBreak; + } + else + { + std::string value; + element.getValue(value); + + if (!value.empty()) //value element + stream += '>' + normalizeElementValue(value) + "</" + nameFmt + '>' + lineBreak; + else //empty element + stream += "/>" + lineBreak; + } +} + +std::string serialize(const XmlDoc& doc, + const std::string& lineBreak, + const std::string& indent) +{ + std::string version = doc.getVersionAs<std::string>(); + if (!version.empty()) + version = " version=\"" + normalizeAttribValue(version) + "\""; + + std::string encoding = doc.getEncodingAs<std::string>(); + if (!encoding.empty()) + encoding = " encoding=\"" + normalizeAttribValue(encoding) + "\""; + + std::string standalone = doc.getStandaloneAs<std::string>(); + if (!standalone.empty()) + standalone = " standalone=\"" + normalizeAttribValue(standalone) + "\""; + + std::string output = "<?xml" + version + encoding + standalone + "?>" + lineBreak; + serialize(doc.root(), output, lineBreak, indent, 0); + return output; +} +} +} + +inline +std::string serialize(const XmlDoc& doc, + const std::string& lineBreak, + const std::string& indent) { return implementation::serialize(doc, lineBreak, indent); } + +/* +Grammar for XML parser +------------------------------- +document-expression: + <?xml version="1.0" encoding="UTF-8" standalone="yes"?> + element-expression: + +element-expression: + <string attributes-expression/> + <string attributes-expression> pm-expression </string> + +element-list-expression: + <empty> + element-expression element-list-expression + +attributes-expression: + <empty> + string="string" attributes-expression + +pm-expression: + string + element-list-expression +*/ + +namespace implementation +{ +struct Token +{ + enum Type + { + TK_LESS, + TK_GREATER, + TK_LESS_SLASH, + TK_SLASH_GREATER, + TK_EQUAL, + TK_QUOTE, + TK_DECL_BEGIN, + TK_DECL_END, + TK_NAME, + TK_END + }; + + Token(Type t) : type(t) {} + Token(const std::string& txt) : type(TK_NAME), name(txt) {} + + Type type; + std::string name; //filled if type == TK_NAME +}; + +class Scanner +{ +public: + Scanner(const std::string& stream) : stream_(stream), pos(stream_.begin()) + { + if (zen::startsWith(stream_, BYTE_ORDER_MARK_UTF8)) + pos += strLength(BYTE_ORDER_MARK_UTF8); + + tokens.push_back(std::make_pair("<?xml", Token::TK_DECL_BEGIN)); + tokens.push_back(std::make_pair("?>", Token::TK_DECL_END)); + tokens.push_back(std::make_pair("</", Token::TK_LESS_SLASH)); + tokens.push_back(std::make_pair("/>", Token::TK_SLASH_GREATER)); + tokens.push_back(std::make_pair("<" , Token::TK_LESS)); //evaluate after TK_DECL_BEGIN! + tokens.push_back(std::make_pair(">" , Token::TK_GREATER)); + tokens.push_back(std::make_pair("=" , Token::TK_EQUAL)); + tokens.push_back(std::make_pair("\"", Token::TK_QUOTE)); + tokens.push_back(std::make_pair("\'", Token::TK_QUOTE)); + } + + Token nextToken() //throw XmlParsingError + { + //skip whitespace + pos = std::find_if(pos, stream_.end(), [](char c) { return !zen::isWhiteSpace(c); }); + + if (pos == stream_.end()) + return Token::TK_END; + + for (auto it = tokens.begin(); it != tokens.end(); ++it) + if (startsWith(pos, it->first)) + { + pos += it->first.size(); + return it->second; + } + + auto nameEnd = std::find_if(pos, stream_.end(), [](char c) + { + return c == '<' || + c == '>' || + c == '=' || + c == '/' || + c == '\'' || + c == '\"' || + zen::isWhiteSpace(c); + }); + + if (nameEnd != pos) + { + std::string name(&*pos, nameEnd - pos); + pos = nameEnd; + return implementation::denormalize(name); + } + + //unknown token + throw XmlParsingError(posRow(), posCol()); + } + + std::string extractElementValue() + { + auto it = std::find_if(pos, stream_.end(), [](char c) + { + return c == '<' || + c == '>'; + }); + std::string output(pos, it); + pos = it; + return implementation::denormalize(output); + } + + std::string extractAttributeValue() + { + auto it = std::find_if(pos, stream_.end(), [](char c) + { + return c == '<' || + c == '>' || + c == '\'' || + c == '\"'; + }); + std::string output(pos, it); + pos = it; + return implementation::denormalize(output); + } + + size_t posRow() const //current row beginning with 0 + { + const size_t crSum = std::count(stream_.begin(), pos, '\r'); //carriage returns + const size_t nlSum = std::count(stream_.begin(), pos, '\n'); //new lines + assert(crSum == 0 || nlSum == 0 || crSum == nlSum); + return std::max(crSum, nlSum); //be compatible with Linux/Mac/Win + } + + size_t posCol() const //current col beginning with 0 + { + //seek beginning of line + for (auto it = pos; it != stream_.begin(); ) + { + --it; + if (*it == '\r' || *it == '\n') + return pos - it - 1; + } + return pos - stream_.begin(); + } + +private: + Scanner(const Scanner&); + Scanner& operator=(const Scanner&); + + bool startsWith(std::string::const_iterator it, const std::string& prefix) const + { + if (stream_.end() - it < static_cast<ptrdiff_t>(prefix.size())) + return false; + return std::equal(prefix.begin(), prefix.end(), it); + } + + typedef std::vector<std::pair<std::string, Token::Type> > TokenList; + TokenList tokens; + + const std::string stream_; + std::string::const_iterator pos; +}; + + +class XmlParser +{ +public: + XmlParser(const std::string& stream) : + scn(stream), + tk(scn.nextToken()) {} + + void parse(XmlDoc& doc) //throw XmlParsingError + { + //declaration (optional) + if (token().type == Token::TK_DECL_BEGIN) + { + nextToken(); + + while (token().type == Token::TK_NAME) + { + std::string attribName = token().name; + nextToken(); + + consumeToken(Token::TK_EQUAL); + expectToken(Token::TK_QUOTE); + std::string attribValue = scn.extractAttributeValue(); + nextToken(); + + consumeToken(Token::TK_QUOTE); + + if (attribName == "version") + doc.setVersion(attribValue); + else if (attribName == "encoding") + doc.setEncoding(attribValue); + else if (attribName == "standalone") + doc.setStandalone(attribValue); + } + consumeToken(Token::TK_DECL_END); + } + + XmlDoc dummy; + XmlElement& elemTmp = dummy.root(); + parseChildElements(elemTmp); + + auto iterPair = elemTmp.getChildren(); + if (iterPair.first != iterPair.second) + doc.root().swap(*iterPair.first); + + expectToken(Token::TK_END); + }; + +private: + XmlParser(const XmlParser&); + XmlParser& operator=(const XmlParser&); + + void parseChildElements(XmlElement& parent) + { + while (token().type == Token::TK_LESS) + { + nextToken(); + + expectToken(Token::TK_NAME); + std::string elementName = token().name; + nextToken(); + + XmlElement& newElement = parent.addChild(elementName); + + parseAttributes(newElement); + + if (token().type == Token::TK_SLASH_GREATER) //empty element + { + nextToken(); + continue; + } + + expectToken(Token::TK_GREATER); + std::string elementValue = scn.extractElementValue(); + nextToken(); + + //no support for mixed-mode content + if (token().type == Token::TK_LESS) //structured element + parseChildElements(newElement); + else //value element + newElement.setValue(elementValue); + + consumeToken(Token::TK_LESS_SLASH); + + if (token().type != Token::TK_NAME || + elementName != token().name) + throw XmlParsingError(scn.posRow(), scn.posCol()); + nextToken(); + + consumeToken(Token::TK_GREATER); + } + }; + + void parseAttributes(XmlElement& element) + { + while (token().type == Token::TK_NAME) + { + std::string attribName = token().name; + nextToken(); + + consumeToken(Token::TK_EQUAL); + expectToken(Token::TK_QUOTE); + std::string attribValue = scn.extractAttributeValue(); + nextToken(); + + consumeToken(Token::TK_QUOTE); + element.setAttribute(attribName, attribValue); + } + } + + const Token& token() const { return tk; } + void nextToken() { tk = scn.nextToken(); } + + void consumeToken(Token::Type t) //throw XmlParsingError + { + expectToken(t); //throw XmlParsingError + nextToken(); + } + + void expectToken(Token::Type t) //throw XmlParsingError + { + if (token().type != t) + throw XmlParsingError(scn.posRow(), scn.posCol()); + } + + Scanner scn; + Token tk; +}; +} + +inline +void parse(const std::string& stream, XmlDoc& doc) //throw XmlParsingError +{ + implementation::XmlParser(stream).parse(doc); //throw XmlParsingError +} +} + +#endif //ZEN_XML_PARSER_HEADER_81248670213764583021432 |