// ************************************************************************** // * This file is part of the zenXML project. It is distributed under the * // * Boost Software License: http://www.boost.org/LICENSE_1_0.txt * // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved * // ************************************************************************** #ifndef ZEN_XML_PARSER_HEADER_81248670213764583021432 #define ZEN_XML_PARSER_HEADER_81248670213764583021432 #include #include //ptrdiff_t; req. on Linux #include #include "dom.h" #include "error.h" namespace zen { /** \file \brief Convert an XML document object model (class XmlDoc) to and from a byte stream representation. */ ///Save XML document as a byte stream /** \param doc Input XML document \param lineBreak Line break, default: carriage return + new line \param indent Indentation, default: four space characters \return Output byte stream */ std::string serialize(const XmlDoc& doc, const std::string& lineBreak = "\r\n", const std::string& indent = " "); //throw () ///Exception thrown due to an XML parsing error struct XmlParsingError : public XmlError { XmlParsingError(size_t rowNo, size_t colNo) : row(rowNo), col(colNo) {} ///Input file row where the parsing error occured size_t row; //beginning with 0 ///Input file column where the parsing error occured size_t col; // }; ///Load XML document from a byte stream /** \param stream Input byte stream \param doc Output XML document \throw XmlParsingError */ void parse(const std::string& stream, XmlDoc& doc); //throw XmlParsingError //---------------------------- implementation ---------------------------- //see: http://www.w3.org/TR/xml/ namespace implementation { inline std::pair hexify(unsigned char c) { auto hexifyDigit = [](int num) -> char //input [0, 15], output 0-9, A-F { assert(0 <= num && num <= 15); //guaranteed by design below! return static_cast(num <= 9 ? //no signed/unsigned char problem here! '0' + num : 'A' + (num - 10)); }; return std::make_pair(hexifyDigit(c / 16), hexifyDigit(c % 16)); } inline char unhexify(char high, char low) { auto unhexifyDigit = [](char hex) -> int //input 0-9, a-f, A-F; output range: [0, 15] { if ('0' <= hex && hex <= '9') //no signed/unsigned char problem here! return hex - '0'; else if ('A' <= hex && hex <= 'F') return (hex - 'A') + 10; else if ('a' <= hex && hex <= 'f') return (hex - 'a') + 10; assert(false); return 0; }; return static_cast(16 * unhexifyDigit(high) + unhexifyDigit(low)); //[!] convert to unsigned char first, then to char (which may be signed) }; template inline std::string normalize(const std::string& str, Predicate pred) //pred: unary function taking a char, return true if value shall be encoded as hex { std::string output; std::for_each(str.begin(), str.end(), [&](char c) { if (c == '&') // output += "&"; else if (c == '<') //normalization mandatory: http://www.w3.org/TR/xml/#syntax output += "<"; else if (c == '>') // output += ">"; else if (pred(c)) { if (c == '\'') output += "'"; else if (c == '\"') output += """; else { output += "&#x"; const auto hexDigits = hexify(c); //hexify beats "printNumber("&#x%02X;", c)" by a nice factor of 3! output += hexDigits.first; output += hexDigits.second; output += ';'; } } else output += c; }); return output; } inline std::string normalizeName(const std::string& str) { return normalize(str, [](char c) { return isWhiteSpace(c) || c == '=' || c == '/' || c == '\'' || c == '\"'; }); } inline std::string normalizeElementValue(const std::string& str) { return normalize(str, [](char c) { return static_cast(c) < 32; }); } inline std::string normalizeAttribValue(const std::string& str) { return normalize(str, [](char c) { return static_cast(c) < 32 || c == '\'' || c == '\"'; }); } namespace { std::string denormalize(const std::string& str) { std::string output; for (auto it = str.begin(); it != str.end(); ++it) { const char c = *it; if (c == '&') { auto checkEntity = [&](const char* placeholder, char realVal) -> bool { size_t strLen = strLength(placeholder); if (str.end() - it >= static_cast(strLen) && std::equal(it, it + strLen, placeholder)) { output += realVal; it += strLen - 1; return true; } return false; }; if (checkEntity("&", '&')) continue; if (checkEntity("<", '<')) continue; if (checkEntity(">", '>')) continue; if (checkEntity("'", '\'')) continue; if (checkEntity(""", '\"')) continue; if (str.end() - it >= 6 && it[1] == '#' && it[2] == 'x' && it[5] == ';') { output += unhexify(it[3], it[4]); it += 5; continue; //unhexify beats "::sscanf(&it[3], "%02X", &tmp)" by a factor of 3000 for ~250000 calls!!! } output += c; //unexpected char! } else if (c == '\r') //map all end-of-line characters to \n http://www.w3.org/TR/xml/#sec-line-ends { auto itNext = it + 1; if (itNext != str.end() && *itNext == '\n') ++it; output += '\n'; } else output += c; }; return output; } void serialize(const XmlElement& element, std::string& stream, const std::string& lineBreak, const std::string& indent, size_t indentLevel) { const std::string& nameFmt = normalizeName(element.getNameAs()); for (size_t i = 0; i < indentLevel; ++i) stream += indent; stream += '<' + nameFmt; auto attr = element.getAttributes(); for (auto it = attr.first; it != attr.second; ++it) stream += ' ' + normalizeName(it->first) + "=\"" + normalizeAttribValue(it->second) + "\""; //no support for mixed-mode content auto iterPair = element.getChildren(); if (iterPair.first != iterPair.second) //structured element { stream += '>' + lineBreak; std::for_each(iterPair.first, iterPair.second, [&](const XmlElement & el) { serialize(el, stream, lineBreak, indent, indentLevel + 1); }); for (size_t i = 0; i < indentLevel; ++i) stream += indent; stream += "' + lineBreak; } else { std::string value; element.getValue(value); if (!value.empty()) //value element stream += '>' + normalizeElementValue(value) + "' + lineBreak; else //empty element stream += "/>" + lineBreak; } } std::string serialize(const XmlDoc& doc, const std::string& lineBreak, const std::string& indent) { std::string version = doc.getVersionAs(); if (!version.empty()) version = " version=\"" + normalizeAttribValue(version) + "\""; std::string encoding = doc.getEncodingAs(); if (!encoding.empty()) encoding = " encoding=\"" + normalizeAttribValue(encoding) + "\""; std::string standalone = doc.getStandaloneAs(); if (!standalone.empty()) standalone = " standalone=\"" + normalizeAttribValue(standalone) + "\""; std::string output = "" + lineBreak; serialize(doc.root(), output, lineBreak, indent, 0); return output; } } } inline std::string serialize(const XmlDoc& doc, const std::string& lineBreak, const std::string& indent) { return implementation::serialize(doc, lineBreak, indent); } /* Grammar for XML parser ------------------------------- document-expression: element-expression: element-expression: pm-expression element-list-expression: element-expression element-list-expression attributes-expression: string="string" attributes-expression pm-expression: string element-list-expression */ namespace implementation { struct Token { enum Type { TK_LESS, TK_GREATER, TK_LESS_SLASH, TK_SLASH_GREATER, TK_EQUAL, TK_QUOTE, TK_DECL_BEGIN, TK_DECL_END, TK_NAME, TK_END }; Token(Type t) : type(t) {} Token(const std::string& txt) : type(TK_NAME), name(txt) {} Type type; std::string name; //filled if type == TK_NAME }; class Scanner { public: Scanner(const std::string& stream) : stream_(stream), pos(stream_.begin()) { if (zen::startsWith(stream_, BYTE_ORDER_MARK_UTF8)) pos += strLength(BYTE_ORDER_MARK_UTF8); tokens.push_back(std::make_pair("", Token::TK_DECL_END)); tokens.push_back(std::make_pair("", Token::TK_SLASH_GREATER)); tokens.push_back(std::make_pair("<" , Token::TK_LESS)); //evaluate after TK_DECL_BEGIN! tokens.push_back(std::make_pair(">" , Token::TK_GREATER)); tokens.push_back(std::make_pair("=" , Token::TK_EQUAL)); tokens.push_back(std::make_pair("\"", Token::TK_QUOTE)); tokens.push_back(std::make_pair("\'", Token::TK_QUOTE)); } Token nextToken() //throw XmlParsingError { //skip whitespace pos = std::find_if(pos, stream_.end(), [](char c) { return !zen::isWhiteSpace(c); }); if (pos == stream_.end()) return Token::TK_END; for (auto it = tokens.begin(); it != tokens.end(); ++it) if (startsWith(pos, it->first)) { pos += it->first.size(); return it->second; } auto nameEnd = std::find_if(pos, stream_.end(), [](char c) { return c == '<' || c == '>' || c == '=' || c == '/' || c == '\'' || c == '\"' || zen::isWhiteSpace(c); }); if (nameEnd != pos) { std::string name(&*pos, nameEnd - pos); pos = nameEnd; return implementation::denormalize(name); } //unknown token throw XmlParsingError(posRow(), posCol()); } std::string extractElementValue() { auto it = std::find_if(pos, stream_.end(), [](char c) { return c == '<' || c == '>'; }); std::string output(pos, it); pos = it; return implementation::denormalize(output); } std::string extractAttributeValue() { auto it = std::find_if(pos, stream_.end(), [](char c) { return c == '<' || c == '>' || c == '\'' || c == '\"'; }); std::string output(pos, it); pos = it; return implementation::denormalize(output); } size_t posRow() const //current row beginning with 0 { const size_t crSum = std::count(stream_.begin(), pos, '\r'); //carriage returns const size_t nlSum = std::count(stream_.begin(), pos, '\n'); //new lines assert(crSum == 0 || nlSum == 0 || crSum == nlSum); return std::max(crSum, nlSum); //be compatible with Linux/Mac/Win } size_t posCol() const //current col beginning with 0 { //seek beginning of line for (auto it = pos; it != stream_.begin(); ) { --it; if (*it == '\r' || *it == '\n') return pos - it - 1; } return pos - stream_.begin(); } private: Scanner(const Scanner&); Scanner& operator=(const Scanner&); bool startsWith(std::string::const_iterator it, const std::string& prefix) const { if (stream_.end() - it < static_cast(prefix.size())) return false; return std::equal(prefix.begin(), prefix.end(), it); } typedef std::vector > TokenList; TokenList tokens; const std::string stream_; std::string::const_iterator pos; }; class XmlParser { public: XmlParser(const std::string& stream) : scn(stream), tk(scn.nextToken()) {} void parse(XmlDoc& doc) //throw XmlParsingError { //declaration (optional) if (token().type == Token::TK_DECL_BEGIN) { nextToken(); while (token().type == Token::TK_NAME) { std::string attribName = token().name; nextToken(); consumeToken(Token::TK_EQUAL); expectToken(Token::TK_QUOTE); std::string attribValue = scn.extractAttributeValue(); nextToken(); consumeToken(Token::TK_QUOTE); if (attribName == "version") doc.setVersion(attribValue); else if (attribName == "encoding") doc.setEncoding(attribValue); else if (attribName == "standalone") doc.setStandalone(attribValue); } consumeToken(Token::TK_DECL_END); } XmlDoc dummy; XmlElement& elemTmp = dummy.root(); parseChildElements(elemTmp); auto iterPair = elemTmp.getChildren(); if (iterPair.first != iterPair.second) doc.root().swap(*iterPair.first); expectToken(Token::TK_END); }; private: XmlParser(const XmlParser&); XmlParser& operator=(const XmlParser&); void parseChildElements(XmlElement& parent) { while (token().type == Token::TK_LESS) { nextToken(); expectToken(Token::TK_NAME); std::string elementName = token().name; nextToken(); XmlElement& newElement = parent.addChild(elementName); parseAttributes(newElement); if (token().type == Token::TK_SLASH_GREATER) //empty element { nextToken(); continue; } expectToken(Token::TK_GREATER); std::string elementValue = scn.extractElementValue(); nextToken(); //no support for mixed-mode content if (token().type == Token::TK_LESS) //structured element parseChildElements(newElement); else //value element newElement.setValue(elementValue); consumeToken(Token::TK_LESS_SLASH); if (token().type != Token::TK_NAME || elementName != token().name) throw XmlParsingError(scn.posRow(), scn.posCol()); nextToken(); consumeToken(Token::TK_GREATER); } }; void parseAttributes(XmlElement& element) { while (token().type == Token::TK_NAME) { std::string attribName = token().name; nextToken(); consumeToken(Token::TK_EQUAL); expectToken(Token::TK_QUOTE); std::string attribValue = scn.extractAttributeValue(); nextToken(); consumeToken(Token::TK_QUOTE); element.setAttribute(attribName, attribValue); } } const Token& token() const { return tk; } void nextToken() { tk = scn.nextToken(); } void consumeToken(Token::Type t) //throw XmlParsingError { expectToken(t); //throw XmlParsingError nextToken(); } void expectToken(Token::Type t) //throw XmlParsingError { if (token().type != t) throw XmlParsingError(scn.posRow(), scn.posCol()); } Scanner scn; Token tk; }; } inline void parse(const std::string& stream, XmlDoc& doc) //throw XmlParsingError { implementation::XmlParser(stream).parse(doc); //throw XmlParsingError } } #endif //ZEN_XML_PARSER_HEADER_81248670213764583021432