summaryrefslogtreecommitdiff
path: root/zenxml/parser.h
diff options
context:
space:
mode:
Diffstat (limited to 'zenxml/parser.h')
-rw-r--r--zenxml/parser.h609
1 files changed, 609 insertions, 0 deletions
diff --git a/zenxml/parser.h b/zenxml/parser.h
new file mode 100644
index 00000000..823cd793
--- /dev/null
+++ b/zenxml/parser.h
@@ -0,0 +1,609 @@
+// **************************************************************************
+// * This file is part of the zenXML project. It is distributed under the *
+// * Boost Software License: http://www.boost.org/LICENSE_1_0.txt *
+// * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved *
+// **************************************************************************
+
+#ifndef ZEN_XML_PARSER_HEADER_81248670213764583021432
+#define ZEN_XML_PARSER_HEADER_81248670213764583021432
+
+#include <cstdio>
+#include <cstddef> //ptrdiff_t; req. on Linux
+#include <zen/string_traits.h>
+#include "dom.h"
+#include "error.h"
+
+namespace zen
+{
+/**
+\file
+\brief Convert an XML document object model (class XmlDoc) to and from a byte stream representation.
+*/
+
+///Save XML document as a byte stream
+/**
+\param doc Input XML document
+\param lineBreak Line break, default: carriage return + new line
+\param indent Indentation, default: four space characters
+\return Output byte stream
+*/
+std::string serialize(const XmlDoc& doc,
+ const std::string& lineBreak = "\r\n",
+ const std::string& indent = " "); //throw (); defined inline further down, forwards to implementation::serialize()
+
+///Exception thrown due to an XML parsing error
+struct XmlParsingError : public XmlError //carries the position at which parsing failed
+{
+ XmlParsingError(size_t rowNo, size_t colNo) : row(rowNo), col(colNo) {}
+ ///Input file row where the parsing error occurred
+ size_t row; //beginning with 0
+ ///Input file column where the parsing error occurred
+ size_t col; //beginning with 0
+};
+
+
+///Load XML document from a byte stream
+/**
+\param stream Input byte stream
+\param doc Output XML document
+\throw XmlParsingError
+*/
+void parse(const std::string& stream, XmlDoc& doc); //throw XmlParsingError; defined inline at the end of this header
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+//---------------------------- implementation ----------------------------
+//see: http://www.w3.org/TR/xml/
+
+namespace implementation
+{
+//Convert a byte into its two-character upper-case hexadecimal representation
+//\param c Byte to convert
+//\return Pair (digit of the upper four bits, digit of the lower four bits), each out of 0-9, A-F
+inline
+std::pair<char, char> hexify(unsigned char c)
+{
+    auto nibbleToChar = [](int nibble) -> char //input [0, 15], output 0-9, A-F
+    {
+        assert(0 <= nibble && nibble <= 15); //holds for every caller below: value is a quotient or remainder of 16
+        if (nibble <= 9)
+            return static_cast<char>('0' + nibble); //no signed/unsigned char problem here!
+        return static_cast<char>('A' + (nibble - 10));
+    };
+
+    const char highDigit = nibbleToChar(c / 16);
+    const char lowDigit  = nibbleToChar(c % 16);
+    return std::make_pair(highDigit, lowDigit);
+}
+
+
+//Convert two hexadecimal digits into the byte they encode
+//\param high Digit for the upper four bits: 0-9, a-f or A-F
+//\param low Digit for the lower four bits: 0-9, a-f or A-F
+//\return Decoded byte; a character outside the hex range trips the assert and counts as 0 when asserts are disabled
+inline
+char unhexify(char high, char low)
+{
+    auto unhexifyDigit = [](char hex) -> int //input 0-9, a-f, A-F; output range: [0, 15]
+    {
+        if ('0' <= hex && hex <= '9') //no signed/unsigned char problem here!
+            return hex - '0';
+        else if ('A' <= hex && hex <= 'F')
+            return (hex - 'A') + 10;
+        else if ('a' <= hex && hex <= 'f')
+            return (hex - 'a') + 10;
+        assert(false); //caller passed a non-hex character
+        return 0;
+    };
+    return static_cast<unsigned char>(16 * unhexifyDigit(high) + unhexifyDigit(low)); //[!] convert to unsigned char first, then to char (which may be signed)
+}
+
+
+//Escape a string for output as XML text
+//\param str Text to escape
+//\param pred Unary function taking a char, return true if value shall be encoded
+//\return Escaped text: '&', '<' and '>' are always replaced by their entity; characters selected by "pred"
+//        become "&apos;", "&quot;" or a two-digit "&#xHH;" reference; everything else is copied as is
+template <class Predicate> inline
+std::string normalize(const std::string& str, Predicate pred)
+{
+    std::string output;
+    for (auto it = str.begin(); it != str.end(); ++it)
+    {
+        const char c = *it;
+
+        //normalization mandatory: http://www.w3.org/TR/xml/#syntax
+        switch (c)
+        {
+            case '&':
+                output += "&amp;";
+                continue;
+            case '<':
+                output += "&lt;";
+                continue;
+            case '>':
+                output += "&gt;";
+                continue;
+        }
+
+        if (!pred(c)) //nothing to encode: copy verbatim
+        {
+            output += c;
+            continue;
+        }
+
+        if (c == '\'')
+            output += "&apos;";
+        else if (c == '\"')
+            output += "&quot;";
+        else
+        {
+            const auto hexDigits = hexify(c); //hexify beats "printNumber<std::string>("&#x%02X;", c)" by a nice factor of 3!
+            output += "&#x";
+            output += hexDigits.first;
+            output += hexDigits.second;
+            output += ';';
+        }
+    }
+    return output;
+}
+
+//Escape an element or attribute name: white space, '=', '/' and both quote characters are encoded
+inline
+std::string normalizeName(const std::string& str)
+{
+    auto mustEncode = [](char c) -> bool
+    {
+        return isWhiteSpace(c) ||
+               c == '='  ||
+               c == '/'  ||
+               c == '\'' ||
+               c == '\"';
+    };
+    return normalize(str, mustEncode);
+}
+
+//Escape an element value: characters with a byte value below 32 are encoded
+inline
+std::string normalizeElementValue(const std::string& str)
+{
+    auto isControlChar = [](char c) -> bool
+    {
+        const unsigned char byteVal = static_cast<unsigned char>(c); //compare as unsigned: char may be signed
+        return byteVal < 32;
+    };
+    return normalize(str, isControlChar);
+}
+
+//Escape an attribute value: characters with a byte value below 32 and both quote characters are encoded
+inline
+std::string normalizeAttribValue(const std::string& str)
+{
+    auto mustEncode = [](char c) -> bool
+    {
+        if (static_cast<unsigned char>(c) < 32) //compare as unsigned: char may be signed
+            return true;
+        return c == '\'' || c == '\"';
+    };
+    return normalize(str, mustEncode);
+}
+
+
+namespace
+{
+//Decode entity and character references and unify line endings (inverse of normalize())
+//\param str Raw text as found in the byte stream
+//\return Text with "&amp;", "&lt;", "&gt;", "&apos;", "&quot;" and two-digit "&#xHH;" references replaced
+//        by the character they stand for; "\r\n" and a lone "\r" become "\n".
+//        An '&' that does not start one of these references is copied unchanged.
+std::string denormalize(const std::string& str)
+{
+    auto isHexDigit = [](char ch) -> bool
+    {
+        return ('0' <= ch && ch <= '9') ||
+               ('A' <= ch && ch <= 'F') ||
+               ('a' <= ch && ch <= 'f');
+    };
+
+    std::string output;
+    for (auto it = str.begin(); it != str.end(); ++it)
+    {
+        const char c = *it;
+
+        if (c == '&')
+        {
+            //on match: append "realVal" and move "it" to the last character of the placeholder
+            auto checkEntity = [&](const char* placeholder, char realVal) -> bool
+            {
+                size_t strLen = strLength(placeholder);
+
+                if (str.end() - it >= static_cast<int>(strLen) && std::equal(it, it + strLen, placeholder))
+                {
+                    output += realVal;
+                    it += strLen - 1;
+                    return true;
+                }
+                return false;
+            };
+
+            if (checkEntity("&amp;", '&'))
+                continue;
+            if (checkEntity("&lt;", '<'))
+                continue;
+            if (checkEntity("&gt;", '>'))
+                continue;
+            if (checkEntity("&apos;", '\''))
+                continue;
+            if (checkEntity("&quot;", '\"'))
+                continue;
+
+            if (str.end() - it >= 6 &&
+                it[1] == '#' &&
+                it[2] == 'x' &&
+                isHexDigit(it[3]) && //[!] input is untrusted: never hand non-hex characters to unhexify()
+                isHexDigit(it[4]) && //
+                it[5] == ';')
+            {
+                output += unhexify(it[3], it[4]);
+                it += 5;
+                continue;
+                //unhexify beats "::sscanf(&it[3], "%02X", &tmp)" by a factor of 3000 for ~250000 calls!!!
+            }
+
+            output += c; //unexpected char!
+        }
+        else if (c == '\r') //map all end-of-line characters to \n http://www.w3.org/TR/xml/#sec-line-ends
+        {
+            auto itNext = it + 1;
+            if (itNext != str.end() && *itNext == '\n')
+                ++it;
+            output += '\n';
+        }
+        else
+            output += c;
+    }
+    return output;
+}
+
+
+//Append the textual form of "element" and, recursively, of all its children to "stream"
+//\param element Element to write
+//\param stream Output buffer receiving the text
+//\param lineBreak Character sequence terminating every written line
+//\param indent Character sequence written once per nesting level
+//\param indentLevel Nesting depth of "element"
+void serialize(const XmlElement& element, std::string& stream,
+               const std::string& lineBreak,
+               const std::string& indent,
+               size_t indentLevel)
+{
+    const std::string tagName = normalizeName(element.getNameAs<std::string>());
+
+    auto appendIndentation = [&]()
+    {
+        for (size_t level = 0; level < indentLevel; ++level)
+            stream += indent;
+    };
+
+    //opening tag including attributes
+    appendIndentation();
+    stream += '<';
+    stream += tagName;
+
+    auto attributes = element.getAttributes();
+    for (auto it = attributes.first; it != attributes.second; ++it)
+    {
+        stream += ' ';
+        stream += normalizeName(it->first);
+        stream += "=\"";
+        stream += normalizeAttribValue(it->second);
+        stream += "\"";
+    }
+
+    //no support for mixed-mode content
+    auto children = element.getChildren();
+    if (children.first == children.second) //no child elements
+    {
+        std::string value;
+        element.getValue(value);
+
+        if (value.empty()) //empty element
+            stream += "/>";
+        else //value element
+        {
+            stream += '>';
+            stream += normalizeElementValue(value);
+            stream += "</";
+            stream += tagName;
+            stream += '>';
+        }
+        stream += lineBreak;
+        return;
+    }
+
+    //structured element: children on their own lines, one level deeper
+    stream += '>';
+    stream += lineBreak;
+
+    for (auto it = children.first; it != children.second; ++it)
+        serialize(*it, stream, lineBreak, indent, indentLevel + 1);
+
+    appendIndentation();
+    stream += "</";
+    stream += tagName;
+    stream += '>';
+    stream += lineBreak;
+}
+
+//Write the XML declaration followed by the element tree of "doc"
+//\param doc Document to write
+//\param lineBreak Character sequence terminating every written line
+//\param indent Character sequence written once per nesting level
+//\return Complete byte stream
+std::string serialize(const XmlDoc& doc,
+                      const std::string& lineBreak,
+                      const std::string& indent)
+{
+    //render one attribute of the declaration; an empty value omits the attribute altogether
+    auto formatDeclAttrib = [](const char* prefix, const std::string& value) -> std::string
+    {
+        if (value.empty())
+            return std::string();
+        return prefix + normalizeAttribValue(value) + "\"";
+    };
+
+    std::string output = "<?xml";
+    output += formatDeclAttrib(" version=\"", doc.getVersionAs<std::string>());
+    output += formatDeclAttrib(" encoding=\"", doc.getEncodingAs<std::string>());
+    output += formatDeclAttrib(" standalone=\"", doc.getStandaloneAs<std::string>());
+    output += "?>";
+    output += lineBreak;
+
+    serialize(doc.root(), output, lineBreak, indent, 0); //root element starts at indentation level 0
+    return output;
+}
+}
+}
+
+//public interface: forward to the implementation namespace
+inline
+std::string serialize(const XmlDoc& doc,
+                      const std::string& lineBreak,
+                      const std::string& indent)
+{
+    return implementation::serialize(doc, lineBreak, indent);
+}
+
+/*
+Grammar for XML parser
+-------------------------------
+document-expression:
+ <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+ element-expression
+
+element-expression:
+ <string attributes-expression/>
+ <string attributes-expression> pm-expression </string>
+
+element-list-expression:
+ <empty>
+ element-expression element-list-expression
+
+attributes-expression:
+ <empty>
+ string="string" attributes-expression
+
+pm-expression:
+ string
+ element-list-expression
+*/
+
+namespace implementation
+{
+//A lexical unit handed from the Scanner to the XmlParser
+struct Token
+{
+    enum Type
+    {
+        TK_LESS,          // <
+        TK_GREATER,       // >
+        TK_LESS_SLASH,    // </
+        TK_SLASH_GREATER, // />
+        TK_EQUAL,         // =
+        TK_QUOTE,         // " or '
+        TK_DECL_BEGIN,    // <?xml
+        TK_DECL_END,      // ?>
+        TK_NAME,          // any other run of characters
+        TK_END            // end of input
+    };
+
+    Type type;
+    std::string name; //filled if type == TK_NAME
+
+    //both constructors stay implicit: Scanner::nextToken() returns plain Type values and strings
+    Token(Type tokenType) : type(tokenType) {}
+    Token(const std::string& text) : type(TK_NAME), name(text) {}
+};
+
+//Splits the byte stream into tokens and keeps track of the current read position
+class Scanner
+{
+public:
+    //\param stream Complete input; a private copy is stored, a leading UTF-8 byte order mark is skipped
+    Scanner(const std::string& stream) : stream_(stream), pos(stream_.begin())
+    {
+        if (zen::startsWith(stream_, BYTE_ORDER_MARK_UTF8))
+            pos += strLength(BYTE_ORDER_MARK_UTF8);
+
+        //order matters: nextToken() returns the first entry that matches at the current position
+        tokens.push_back(std::make_pair("<?xml", Token::TK_DECL_BEGIN));
+        tokens.push_back(std::make_pair("?>", Token::TK_DECL_END));
+        tokens.push_back(std::make_pair("</", Token::TK_LESS_SLASH));
+        tokens.push_back(std::make_pair("/>", Token::TK_SLASH_GREATER));
+        tokens.push_back(std::make_pair("<" , Token::TK_LESS)); //evaluate after TK_DECL_BEGIN!
+        tokens.push_back(std::make_pair(">" , Token::TK_GREATER));
+        tokens.push_back(std::make_pair("=" , Token::TK_EQUAL));
+        tokens.push_back(std::make_pair("\"", Token::TK_QUOTE));
+        tokens.push_back(std::make_pair("\'", Token::TK_QUOTE));
+    }
+
+    //Skip white space, then read one token and advance behind it
+    //\return Token::TK_END at the end of input, a fixed token from the table, or a denormalized name
+    Token nextToken() //throw XmlParsingError
+    {
+        //skip whitespace
+        pos = std::find_if(pos, stream_.end(), [](char c) { return !zen::isWhiteSpace(c); });
+
+        if (pos == stream_.end())
+            return Token::TK_END;
+
+        for (auto it = tokens.begin(); it != tokens.end(); ++it)
+            if (startsWith(pos, it->first))
+            {
+                pos += it->first.size();
+                return it->second;
+            }
+
+        //a name extends up to the next delimiter or white space character
+        auto nameEnd = std::find_if(pos, stream_.end(), [](char c)
+        {
+            return c == '<'  ||
+                   c == '>'  ||
+                   c == '='  ||
+                   c == '/'  ||
+                   c == '\'' ||
+                   c == '\"' ||
+                   zen::isWhiteSpace(c);
+        });
+
+        if (nameEnd != pos)
+        {
+            std::string name(&*pos, nameEnd - pos);
+            pos = nameEnd;
+            return implementation::denormalize(name);
+        }
+
+        //unknown token
+        throw XmlParsingError(posRow(), posCol());
+    }
+
+    //Read everything up to the next '<' or '>' (or the end of input) without tokenizing it
+    //\return Denormalized text; the read position is left at the terminating character
+    std::string extractElementValue()
+    {
+        auto it = std::find_if(pos, stream_.end(), [](char c)
+        {
+            return c == '<' ||
+                   c == '>';
+        });
+        std::string output(pos, it);
+        pos = it;
+        return implementation::denormalize(output);
+    }
+
+    //Read everything up to the next '<', '>' or quote character (or the end of input) without tokenizing it
+    //\return Denormalized text; the read position is left at the terminating character
+    std::string extractAttributeValue()
+    {
+        auto it = std::find_if(pos, stream_.end(), [](char c)
+        {
+            return c == '<'  ||
+                   c == '>'  ||
+                   c == '\'' ||
+                   c == '\"';
+        });
+        std::string output(pos, it);
+        pos = it;
+        return implementation::denormalize(output);
+    }
+
+    size_t posRow() const //current row beginning with 0
+    {
+        //count the line breaks in front of "pos": "\r\n" is a single break, a lone '\r' or '\n' counts on its own
+        //[!] the input is untrusted and may mix line-end styles: this must neither assert nor miscount
+        size_t row = 0;
+        for (auto it = stream_.begin(); it != pos; ++it)
+        {
+            if (*it == '\n')
+                ++row;
+            else if (*it == '\r')
+            {
+                auto itNext = it + 1;
+                if (itNext == stream_.end() || *itNext != '\n') //otherwise the following '\n' does the counting
+                    ++row;
+            }
+        }
+        return row;
+    }
+
+    size_t posCol() const //current col beginning with 0
+    {
+        //seek beginning of line
+        for (auto it = pos; it != stream_.begin(); )
+        {
+            --it;
+            if (*it == '\r' || *it == '\n')
+                return pos - it - 1;
+        }
+        return pos - stream_.begin(); //no line break in front of "pos": still on the first line
+    }
+
+private:
+    Scanner(const Scanner&);            //not copyable: "pos" points into this object's own "stream_"
+    Scanner& operator=(const Scanner&); //
+
+    //check whether the stream continues with "prefix" at position "it"
+    bool startsWith(std::string::const_iterator it, const std::string& prefix) const
+    {
+        if (stream_.end() - it < static_cast<ptrdiff_t>(prefix.size()))
+            return false;
+        return std::equal(prefix.begin(), prefix.end(), it);
+    }
+
+    typedef std::vector<std::pair<std::string, Token::Type> > TokenList;
+    TokenList tokens; //fixed tokens in matching order
+
+    const std::string stream_;       //declared before "pos": the constructor initializes "pos" from it
+    std::string::const_iterator pos; //current read position within "stream_"
+};
+
+
+class XmlParser //recursive-descent parser: builds an XmlDoc from the tokens delivered by Scanner, one token of look-ahead
+{
+public:
+ XmlParser(const std::string& stream) :
+ scn(stream),
+ tk(scn.nextToken()) {} //read the first token up front; relies on "scn" being declared before "tk"
+
+ void parse(XmlDoc& doc) //throw XmlParsingError
+ {
+ //declaration (optional)
+ if (token().type == Token::TK_DECL_BEGIN)
+ {
+ nextToken();
+
+ while (token().type == Token::TK_NAME) //one iteration per name="value" pair
+ {
+ std::string attribName = token().name;
+ nextToken();
+
+ consumeToken(Token::TK_EQUAL);
+ expectToken(Token::TK_QUOTE); //current token is the opening quote; the scanner already points at the value
+ std::string attribValue = scn.extractAttributeValue();
+ nextToken(); //fetch the token following the value
+
+ consumeToken(Token::TK_QUOTE); //closing quote
+
+ if (attribName == "version")
+ doc.setVersion(attribValue);
+ else if (attribName == "encoding")
+ doc.setEncoding(attribValue);
+ else if (attribName == "standalone")
+ doc.setStandalone(attribValue);
+ } //any other declaration attribute is parsed, but not stored
+ consumeToken(Token::TK_DECL_END);
+ }
+
+ XmlDoc dummy; //scratch document: top-level elements are collected as children of its root
+ XmlElement& elemTmp = dummy.root();
+ parseChildElements(elemTmp);
+
+ auto iterPair = elemTmp.getChildren();
+ if (iterPair.first != iterPair.second)
+ doc.root().swap(*iterPair.first); //only the first top-level element becomes the document root
+
+ expectToken(Token::TK_END); //anything left behind the top-level elements is an error
+ };
+
+private:
+ XmlParser(const XmlParser&); //not copyable: declared, but not defined in this header
+ XmlParser& operator=(const XmlParser&); //
+
+ void parseChildElements(XmlElement& parent) //throw XmlParsingError; adds every element found at the current position as child of "parent"
+ {
+ while (token().type == Token::TK_LESS)
+ {
+ nextToken();
+
+ expectToken(Token::TK_NAME);
+ std::string elementName = token().name;
+ nextToken();
+
+ XmlElement& newElement = parent.addChild(elementName);
+
+ parseAttributes(newElement);
+
+ if (token().type == Token::TK_SLASH_GREATER) //empty element
+ {
+ nextToken();
+ continue;
+ }
+
+ expectToken(Token::TK_GREATER); //current token is '>'; the scanner already points at the content
+ std::string elementValue = scn.extractElementValue(); //text up to the next '<' or '>'; only used for value elements
+ nextToken();
+
+ //no support for mixed-mode content
+ if (token().type == Token::TK_LESS) //structured element
+ parseChildElements(newElement);
+ else //value element
+ newElement.setValue(elementValue);
+
+ consumeToken(Token::TK_LESS_SLASH);
+
+ if (token().type != Token::TK_NAME ||
+ elementName != token().name) //closing tag must repeat the name of the opening tag
+ throw XmlParsingError(scn.posRow(), scn.posCol());
+ nextToken();
+
+ consumeToken(Token::TK_GREATER);
+ }
+ };
+
+ void parseAttributes(XmlElement& element) //throw XmlParsingError; reads name="value" pairs until a token other than a name shows up
+ {
+ while (token().type == Token::TK_NAME)
+ {
+ std::string attribName = token().name;
+ nextToken();
+
+ consumeToken(Token::TK_EQUAL);
+ expectToken(Token::TK_QUOTE); //current token is the opening quote; the scanner already points at the value
+ std::string attribValue = scn.extractAttributeValue();
+ nextToken(); //fetch the token following the value
+
+ consumeToken(Token::TK_QUOTE); //closing quote
+ element.setAttribute(attribName, attribValue);
+ }
+ }
+
+ const Token& token() const { return tk; } //current look-ahead token
+ void nextToken() { tk = scn.nextToken(); } //throw XmlParsingError
+
+ void consumeToken(Token::Type t) //throw XmlParsingError
+ {
+ expectToken(t); //throw XmlParsingError
+ nextToken();
+ }
+
+ void expectToken(Token::Type t) //throw XmlParsingError
+ {
+ if (token().type != t)
+ throw XmlParsingError(scn.posRow(), scn.posCol()); //position reported is the scanner's, i.e. behind the offending token
+ }
+
+ Scanner scn;
+ Token tk; //look-ahead token; must stay declared after "scn"
+};
+}
+
+//public interface: run a parser over "stream" and store the result in "doc"
+inline
+void parse(const std::string& stream, XmlDoc& doc) //throw XmlParsingError
+{
+    implementation::XmlParser parser(stream); //throw XmlParsingError
+    parser.parse(doc);                        //
+}
+}
+
+#endif //ZEN_XML_PARSER_HEADER_81248670213764583021432
bgstack15