From 767bb3951c65e38627cb0bbad9a3756e1cda2520 Mon Sep 17 00:00:00 2001 From: Daniel Wilhelm Date: Fri, 18 Apr 2014 17:30:42 +0200 Subject: 6.1 --- zenxml/doc/parser_8h_source.html | 1231 +++++++++++++++++++------------------- 1 file changed, 622 insertions(+), 609 deletions(-) (limited to 'zenxml/doc/parser_8h_source.html') diff --git a/zenxml/doc/parser_8h_source.html b/zenxml/doc/parser_8h_source.html index 97bdc7e3..d049c188 100644 --- a/zenxml/doc/parser_8h_source.html +++ b/zenxml/doc/parser_8h_source.html @@ -3,46 +3,36 @@ + zen::Xml: parser.h Source File - - - - + + - + -
- - +
- - - - - -
zen::Xml -
Simple C++ XML Processing
- - + + @@ -51,7 +41,6 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
  • Main Page
  • Namespaces
  • Classes
  • -
  • Files
  • @@ -70,18 +59,12 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
  • - -
    - All Classes Namespaces Functions Variables
    + All Classes Namespaces Functions Variables Pages
    @@ -90,598 +73,628 @@ var searchBox = new SearchBox("searchBox", "search",false,'Search');
    + +
    parser.h
    -
    00001 // **************************************************************************
    -00002 // * This file is part of the zen::Xml project. It is distributed under the *
    -00003 // * Boost Software License: http://www.boost.org/LICENSE_1_0.txt           *
    -00004 // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved        *
    -00005 // **************************************************************************
    -00006 
    -00007 #ifndef ZEN_XML_PARSER_HEADER_81248670213764583021432
    -00008 #define ZEN_XML_PARSER_HEADER_81248670213764583021432
    -00009 
    -00010 #include <cstdio>
    -00011 #include <cstddef> //ptrdiff_t; req. on Linux
    -00012 #include <zen/string_traits.h>
    -00013 #include "dom.h"
    -00014 #include "error.h"
    -00015 
    -00016 namespace zen
    -00017 {
    -00023 
    -00024 
    -00030 std::string serialize(const XmlDoc& doc,
    -00031                       const std::string& lineBreak = "\r\n",
    -00032                       const std::string& indent = "    "); //throw ()
    -00033 
    -00035 struct XmlParsingError : public XmlError
    -00036 {
    -00037     XmlParsingError(size_t rowNo, size_t colNo) : row(rowNo), col(colNo) {}
    -00039     size_t row; //beginning with 0
    -00041     size_t col; //
    -00042 };
    -00043 
    -00044 
    -00046 
    -00051 void parse(const std::string& stream, XmlDoc& doc); //throw XmlParsingError
    -00052 
    -00053 
    -00054 
    -00055 
    -00056 
    -00057 
    -00058 
    -00059 
    -00060 
    -00061 
    -00062 
    -00063 
    -00064 
    -00065 
    -00066 
    -00067 
    -00068 
    -00069 
    -00070 
    -00071 
    -00072 //---------------------------- implementation ----------------------------
    -00073 //see: http://www.w3.org/TR/xml/
    -00074 
    -00075 namespace implementation
    -00076 {
    -00077 inline
    -00078 std::pair<char, char> hexify(unsigned char c)
    -00079 {
    -00080     auto hexifyDigit = [](int num) -> char //input [0, 15], output 0-9, A-F
    -00081     {
    -00082         assert(0 <= num&&  num <= 15); //guaranteed by design below!
    -00083         return static_cast<char>(num <= 9 ? //no signed/unsigned char problem here!
    -00084         '0' + num :
    -00085         'A' + (num - 10));
    -00086     };
    -00087     return std::make_pair(hexifyDigit(c / 16), hexifyDigit(c % 16));
    -00088 }
    -00089 
    -00090 
    -00091 inline
    -00092 char unhexify(char high, char low)
    -00093 {
    -00094     auto unhexifyDigit = [](char hex) -> int //input 0-9, a-f, A-F; output range: [0, 15]
    -00095     {
    -00096         if ('0' <= hex && hex <= '9') //no signed/unsigned char problem here!
    -00097             return hex - '0';
    -00098         else if ('A' <= hex && hex <= 'F')
    -00099             return (hex - 'A') + 10;
    -00100         else if ('a' <= hex && hex <= 'f')
    -00101             return (hex - 'a') + 10;
    -00102         assert(false);
    -00103         return 0;
    -00104     };
    -00105     return static_cast<unsigned char>(16 * unhexifyDigit(high) + unhexifyDigit(low)); //[!] convert to unsigned char first, then to char (which may be signed)
    -00106 };
    -00107 
    -00108 
    -00109 template <class Predicate> inline
    -00110 std::string normalize(const std::string& str, Predicate pred) //pred: unary function taking a char, return true if value shall be encoded as hex
    -00111 {
    -00112     std::string output;
    -00113     std::for_each(str.begin(), str.end(),
    -00114                   [&](char c)
    -00115     {
    -00116         if (c == '&')      //
    -00117             output += "&amp;";
    -00118         else if (c == '<') //normalization mandatory: http://www.w3.org/TR/xml/#syntax
    -00119             output += "&lt;";
    -00120         else if (c == '>') //
    -00121             output += "&gt;";
    -00122         else if (pred(c))
    -00123         {
    -00124             if (c == '\'')
    -00125                 output += "&apos;";
    -00126             else if (c == '\"')
    -00127                 output += "&quot;";
    -00128             else
    -00129             {
    -00130                 output += "&#x";
    -00131                 const auto hexDigits = hexify(c); //hexify beats "printNumber<std::string>("&#x%02X;", c)" by a nice factor of 3!
    -00132                 output += hexDigits.first;
    -00133                 output += hexDigits.second;
    -00134                 output += ';';
    -00135             }
    -00136         }
    -00137         else
    -00138             output += c;
    -00139     });
    -00140     return output;
    -00141 }
    -00142 
    -00143 inline
    -00144 std::string normalizeName(const std::string& str)
    -00145 {
    -00146     return normalize(str, [](char c) { return isWhiteSpace(c) || c == '=' || c == '/' || c == '\'' || c == '\"'; });
    -00147 }
    -00148 
    -00149 inline
    -00150 std::string normalizeElementValue(const std::string& str)
    -00151 {
    -00152     return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32; });
    -00153 }
    -00154 
    -00155 inline
    -00156 std::string normalizeAttribValue(const std::string& str)
    -00157 {
    -00158     return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32 || c == '\'' || c == '\"'; });
    -00159 }
    -00160 
    -00161 
    -00162 template <class CharIterator, size_t N> inline
    -00163 bool checkEntity(CharIterator& first, CharIterator last, const char (&placeholder)[N])
    -00164 {
    -00165     assert(placeholder[N - 1] == 0);
    -00166     const ptrdiff_t strLen = N - 1; //don't count null-terminator
    -00167     if (last - first >= strLen && std::equal(first, first + strLen, placeholder))
    -00168     {
    -00169         first += strLen - 1;
    -00170         return true;
    -00171     }
    -00172     return false;
    -00173 }
    -00174 
    -00175 
    -00176 namespace
    -00177 {
    -00178 std::string denormalize(const std::string& str)
    -00179 {
    -00180     std::string output;
    -00181     for (auto it = str.begin(); it != str.end(); ++it)
    -00182     {
    -00183         const char c = *it;
    -00184 
    -00185         if (c == '&')
    -00186         {
    -00187             if (checkEntity(it, str.end(), "&amp;"))
    -00188                 output += '&';
    -00189             else if (checkEntity(it, str.end(), "&lt;"))
    -00190                 output += '<';
    -00191             else if (checkEntity(it, str.end(), "&gt;"))
    -00192                 output += '>';
    -00193             else if (checkEntity(it, str.end(), "&apos;"))
    -00194                 output += '\'';
    -00195             else if (checkEntity(it, str.end(), "&quot;"))
    -00196                 output += '\"';
    -00197             else if (str.end() - it >= 6 &&
    -00198                      it[1] == '#' &&
    -00199                      it[2] == 'x' &&
    -00200                      it[5] == ';')
    -00201             {
    -00202                 output += unhexify(it[3], it[4]); //unhexify beats "::sscanf(&it[3], "%02X", &tmp)" by a factor of 3000 for ~250000 calls!!!
    -00203                 it += 5;
    -00204             }
    -00205             else
    -00206                 output += c; //unexpected char!
    -00207         }
    -00208         else if (c == '\r') //map all end-of-line characters to \n http://www.w3.org/TR/xml/#sec-line-ends
    -00209         {
    -00210             auto itNext = it + 1;
    -00211             if (itNext != str.end() && *itNext == '\n')
    -00212                 ++it;
    -00213             output += '\n';
    -00214         }
    -00215         else
    -00216             output += c;
    -00217     };
    -00218     return output;
    -00219 }
    -00220 
    -00221 
    -00222 void serialize(const XmlElement& element, std::string& stream,
    -00223                const std::string& lineBreak,
    -00224                const std::string& indent,
    -00225                size_t indentLevel)
    -00226 {
    -00227     const std::string& nameFmt = normalizeName(element.getNameAs<std::string>());
    -00228 
    -00229     for (size_t i = 0; i < indentLevel; ++i)
    -00230         stream += indent;
    -00231 
    -00232     stream += '<' + nameFmt;
    -00233 
    -00234     auto attr = element.getAttributes();
    -00235     for (auto it = attr.first; it != attr.second; ++it)
    -00236         stream += ' ' + normalizeName(it->first) + "=\"" + normalizeAttribValue(it->second) + '\"';
    -00237 
    -00238     //no support for mixed-mode content
    -00239     auto iterPair = element.getChildren();
    -00240     if (iterPair.first != iterPair.second) //structured element
    -00241     {
    -00242         stream += '>' + lineBreak;
    -00243 
    -00244         std::for_each(iterPair.first, iterPair.second,
    -00245         [&](const XmlElement & el) { serialize(el, stream, lineBreak, indent, indentLevel + 1); });
    -00246 
    -00247         for (size_t i = 0; i < indentLevel; ++i)
    -00248             stream += indent;
    -00249         stream += "</" + nameFmt + '>' + lineBreak;
    -00250     }
    -00251     else
    -00252     {
    -00253         std::string value;
    -00254         element.getValue(value);
    -00255 
    -00256         if (!value.empty()) //value element
    -00257             stream += '>' + normalizeElementValue(value) + "</" + nameFmt + '>' + lineBreak;
    -00258         else //empty element
    -00259             stream += "/>" + lineBreak;
    -00260     }
    -00261 }
    -00262 
    -00263 std::string serialize(const XmlDoc& doc,
    -00264                       const std::string& lineBreak,
    -00265                       const std::string& indent)
    -00266 {
    -00267     std::string version = doc.getVersionAs<std::string>();
    -00268     if (!version.empty())
    -00269         version = " version=\"" + normalizeAttribValue(version) + '\"';
    -00270 
    -00271     std::string encoding = doc.getEncodingAs<std::string>();
    -00272     if (!encoding.empty())
    -00273         encoding = " encoding=\"" + normalizeAttribValue(encoding) + '\"';
    -00274 
    -00275     std::string standalone = doc.getStandaloneAs<std::string>();
    -00276     if (!standalone.empty())
    -00277         standalone = " standalone=\"" + normalizeAttribValue(standalone) + '\"';
    -00278 
    -00279     std::string output = "<?xml" + version + encoding + standalone + "?>" + lineBreak;
    -00280     serialize(doc.root(), output, lineBreak, indent, 0);
    -00281     return output;
    -00282 }
    -00283 }
    -00284 }
    -00285 
    -00286 inline
    -00287 std::string serialize(const XmlDoc& doc,
    -00288                       const std::string& lineBreak,
    -00289                       const std::string& indent) { return implementation::serialize(doc, lineBreak, indent); }
    -00290 
    -00291 /*
    -00292 Grammar for XML parser
    -00293 -------------------------------
    -00294 document-expression:
    -00295     <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
    -00296     element-expression:
    -00297 
    -00298 element-expression:
    -00299     <string attributes-expression/>
    -00300     <string attributes-expression> pm-expression </string>
    -00301 
    -00302 element-list-expression:
    -00303     <empty>
    -00304     element-expression element-list-expression
    -00305 
    -00306 attributes-expression:
    -00307     <empty>
    -00308     string="string" attributes-expression
    -00309 
    -00310 pm-expression:
    -00311     string
    -00312     element-list-expression
    -00313 */
    -00314 
    -00315 namespace implementation
    -00316 {
    -00317 struct Token
    -00318 {
    -00319     enum Type
    -00320     {
    -00321         TK_LESS,
    -00322         TK_GREATER,
    -00323         TK_LESS_SLASH,
    -00324         TK_SLASH_GREATER,
    -00325         TK_EQUAL,
    -00326         TK_QUOTE,
    -00327         TK_DECL_BEGIN,
    -00328         TK_DECL_END,
    -00329         TK_NAME,
    -00330         TK_END
    -00331     };
    -00332 
    -00333     Token(Type t) : type(t) {}
    -00334     Token(const std::string& txt) : type(TK_NAME), name(txt) {}
    -00335 
    -00336     Type type;
    -00337     std::string name; //filled if type == TK_NAME
    -00338 };
    -00339 
    -00340 class Scanner
    -00341 {
    -00342 public:
    -00343     Scanner(const std::string& stream) : stream_(stream), pos(stream_.begin())
    -00344     {
    -00345         if (zen::startsWith(stream_, BYTE_ORDER_MARK_UTF8))
    -00346             pos += strLength(BYTE_ORDER_MARK_UTF8);
    -00347 
    -00348         tokens.push_back(std::make_pair("<?xml", Token::TK_DECL_BEGIN));
    -00349         tokens.push_back(std::make_pair("?>",    Token::TK_DECL_END));
    -00350         tokens.push_back(std::make_pair("</", Token::TK_LESS_SLASH));
    -00351         tokens.push_back(std::make_pair("/>", Token::TK_SLASH_GREATER));
    -00352         tokens.push_back(std::make_pair("<" , Token::TK_LESS)); //evaluate after TK_DECL_BEGIN!
    -00353         tokens.push_back(std::make_pair(">" , Token::TK_GREATER));
    -00354         tokens.push_back(std::make_pair("=" , Token::TK_EQUAL));
    -00355         tokens.push_back(std::make_pair("\"", Token::TK_QUOTE));
    -00356         tokens.push_back(std::make_pair("\'", Token::TK_QUOTE));
    -00357     }
    -00358 
    -00359     Token nextToken() //throw XmlParsingError
    -00360     {
    -00361         //skip whitespace
    -00362         pos = std::find_if(pos, stream_.end(), [](char c) { return !zen::isWhiteSpace(c); });
    -00363 
    -00364         if (pos == stream_.end())
    -00365             return Token::TK_END;
    -00366 
    -00367         for (auto it = tokens.begin(); it != tokens.end(); ++it)
    -00368             if (startsWith(pos, it->first))
    -00369             {
    -00370                 pos += it->first.size();
    -00371                 return it->second;
    -00372             }
    -00373 
    -00374         auto nameEnd = std::find_if(pos, stream_.end(), [](char c)
    -00375         {
    -00376             return c == '<'  ||
    -00377                    c == '>'  ||
    -00378                    c == '='  ||
    -00379                    c == '/'  ||
    -00380                    c == '\'' ||
    -00381                    c == '\"' ||
    -00382                    zen::isWhiteSpace(c);
    -00383         });
    -00384 
    -00385         if (nameEnd != pos)
    -00386         {
    -00387             std::string name(&*pos, nameEnd - pos);
    -00388             pos = nameEnd;
    -00389             return implementation::denormalize(name);
    -00390         }
    -00391 
    -00392         //unknown token
    -00393         throw XmlParsingError(posRow(), posCol());
    -00394     }
    -00395 
    -00396     std::string extractElementValue()
    -00397     {
    -00398         auto it = std::find_if(pos, stream_.end(), [](char c)
    -00399         {
    -00400             return c == '<'  ||
    -00401                    c == '>';
    -00402         });
    -00403         std::string output(pos, it);
    -00404         pos = it;
    -00405         return implementation::denormalize(output);
    -00406     }
    -00407 
    -00408     std::string extractAttributeValue()
    -00409     {
    -00410         auto it = std::find_if(pos, stream_.end(), [](char c)
    -00411         {
    -00412             return c == '<'  ||
    -00413                    c == '>'  ||
    -00414                    c == '\'' ||
    -00415                    c == '\"';
    -00416         });
    -00417         std::string output(pos, it);
    -00418         pos = it;
    -00419         return implementation::denormalize(output);
    -00420     }
    -00421 
    -00422     size_t posRow() const //current row beginning with 0
    -00423     {
    -00424         const size_t crSum = std::count(stream_.begin(), pos, '\r'); //carriage returns
    -00425         const size_t nlSum = std::count(stream_.begin(), pos, '\n'); //new lines
    -00426         assert(crSum == 0 || nlSum == 0 || crSum == nlSum);
    -00427         return std::max(crSum, nlSum); //be compatible with Linux/Mac/Win
    -00428     }
    -00429 
    -00430     size_t posCol() const //current col beginning with 0
    -00431     {
    -00432         //seek beginning of line
    -00433         for (auto it = pos; it != stream_.begin(); )
    -00434         {
    -00435             --it;
    -00436             if (*it == '\r' || *it == '\n')
    -00437                 return pos - it - 1;
    -00438         }
    -00439         return pos - stream_.begin();
    -00440     }
    -00441 
    -00442 private:
    -00443     Scanner(const Scanner&);
    -00444     Scanner& operator=(const Scanner&);
    -00445 
    -00446     bool startsWith(std::string::const_iterator it, const std::string& prefix) const
    -00447     {
    -00448         if (stream_.end() - it < static_cast<ptrdiff_t>(prefix.size()))
    -00449             return false;
    -00450         return std::equal(prefix.begin(), prefix.end(), it);
    -00451     }
    -00452 
    -00453     typedef std::vector<std::pair<std::string, Token::Type> > TokenList;
    -00454     TokenList tokens;
    -00455 
    -00456     const std::string stream_;
    -00457     std::string::const_iterator pos;
    -00458 };
    -00459 
    -00460 
    -00461 class XmlParser
    -00462 {
    -00463 public:
    -00464     XmlParser(const std::string& stream) :
    -00465         scn(stream),
    -00466         tk(scn.nextToken()) {}
    -00467 
    -00468     void parse(XmlDoc& doc) //throw XmlParsingError
    -00469     {
    -00470         //declaration (optional)
    -00471         if (token().type == Token::TK_DECL_BEGIN)
    -00472         {
    -00473             nextToken();
    -00474 
    -00475             while (token().type == Token::TK_NAME)
    -00476             {
    -00477                 std::string attribName = token().name;
    -00478                 nextToken();
    -00479 
    -00480                 consumeToken(Token::TK_EQUAL);
    -00481                 expectToken(Token::TK_QUOTE);
    -00482                 std::string attribValue = scn.extractAttributeValue();
    -00483                 nextToken();
    -00484 
    -00485                 consumeToken(Token::TK_QUOTE);
    -00486 
    -00487                 if (attribName == "version")
    -00488                     doc.setVersion(attribValue);
    -00489                 else if (attribName == "encoding")
    -00490                     doc.setEncoding(attribValue);
    -00491                 else if (attribName == "standalone")
    -00492                     doc.setStandalone(attribValue);
    -00493             }
    -00494             consumeToken(Token::TK_DECL_END);
    -00495         }
    -00496 
    -00497         XmlDoc dummy;
    -00498         XmlElement& elemTmp = dummy.root();
    -00499         parseChildElements(elemTmp);
    -00500 
    -00501         auto iterPair = elemTmp.getChildren();
    -00502         if (iterPair.first != iterPair.second)
    -00503             doc.root().swap(*iterPair.first);
    -00504 
    -00505         expectToken(Token::TK_END);
    -00506     };
    -00507 
    -00508 private:
    -00509     XmlParser(const XmlParser&);
    -00510     XmlParser& operator=(const XmlParser&);
    -00511 
    -00512     void parseChildElements(XmlElement& parent)
    -00513     {
    -00514         while (token().type == Token::TK_LESS)
    -00515         {
    -00516             nextToken();
    -00517 
    -00518             expectToken(Token::TK_NAME);
    -00519             std::string elementName = token().name;
    -00520             nextToken();
    -00521 
    -00522             XmlElement& newElement = parent.addChild(elementName);
    -00523 
    -00524             parseAttributes(newElement);
    -00525 
    -00526             if (token().type == Token::TK_SLASH_GREATER) //empty element
    -00527             {
    -00528                 nextToken();
    -00529                 continue;
    -00530             }
    -00531 
    -00532             expectToken(Token::TK_GREATER);
    -00533             std::string elementValue = scn.extractElementValue();
    -00534             nextToken();
    -00535 
    -00536             //no support for mixed-mode content
    -00537             if (token().type == Token::TK_LESS) //structured element
    -00538                 parseChildElements(newElement);
    -00539             else //value element
    -00540                 newElement.setValue(elementValue);
    -00541 
    -00542             consumeToken(Token::TK_LESS_SLASH);
    -00543 
    -00544             if (token().type != Token::TK_NAME ||
    -00545                 elementName != token().name)
    -00546                 throw XmlParsingError(scn.posRow(), scn.posCol());
    -00547             nextToken();
    -00548 
    -00549             consumeToken(Token::TK_GREATER);
    -00550         }
    -00551     };
    -00552 
    -00553     void parseAttributes(XmlElement& element)
    -00554     {
    -00555         while (token().type == Token::TK_NAME)
    -00556         {
    -00557             std::string attribName = token().name;
    -00558             nextToken();
    -00559 
    -00560             consumeToken(Token::TK_EQUAL);
    -00561             expectToken(Token::TK_QUOTE);
    -00562             std::string attribValue = scn.extractAttributeValue();
    -00563             nextToken();
    -00564 
    -00565             consumeToken(Token::TK_QUOTE);
    -00566             element.setAttribute(attribName, attribValue);
    -00567         }
    -00568     }
    -00569 
    -00570     const Token& token() const { return tk; }
    -00571     void nextToken() { tk = scn.nextToken(); }
    -00572 
    -00573     void consumeToken(Token::Type t) //throw XmlParsingError
    -00574     {
    -00575         expectToken(t); //throw XmlParsingError
    -00576         nextToken();
    -00577     }
    -00578 
    -00579     void expectToken(Token::Type t) //throw XmlParsingError
    -00580     {
    -00581         if (token().type != t)
    -00582             throw XmlParsingError(scn.posRow(), scn.posCol());
    -00583     }
    -00584 
    -00585     Scanner scn;
    -00586     Token tk;
    -00587 };
    -00588 }
    -00589 
    -00590 inline
    -00591 void parse(const std::string& stream, XmlDoc& doc) //throw XmlParsingError
    -00592 {
    -00593     implementation::XmlParser(stream).parse(doc);  //throw XmlParsingError
    -00594 }
    -00595 }
    -00596 
    -00597 #endif //ZEN_XML_PARSER_HEADER_81248670213764583021432
    -
    - - +
    1 // **************************************************************************
    +
    2 // * This file is part of the FreeFileSync project. It is distributed under *
    +
    3 // * GNU General Public License: http://www.gnu.org/licenses/gpl.html *
    +
    4 // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved *
    +
    5 // **************************************************************************
    +
    6 
    +
    7 #ifndef ZEN_XML_PARSER_HEADER_81248670213764583021432
    +
    8 #define ZEN_XML_PARSER_HEADER_81248670213764583021432
    +
    9 
    +
    10 #include <cstdio>
    +
    11 #include <cstddef> //ptrdiff_t; req. on Linux
    +
    12 #include <zen/string_traits.h>
    +
    13 #include "dom.h"
    +
    14 #include "error.h"
    +
    15 
    +
    16 namespace zen
    +
    17 {
    +
    23 
    +
    30 std::string serialize(const XmlDoc& doc,
    +
    31  const std::string& lineBreak = "\r\n",
    +
    32  const std::string& indent = " "); //throw ()
    +
    33 
    +
    35 struct XmlParsingError : public XmlError
    +
    36 {
    +
    37  XmlParsingError(size_t rowNo, size_t colNo) : row(rowNo), col(colNo) {}
    +
    39  size_t row; //beginning with 0
    +
    41  size_t col; //
    +
    42 };
    +
    43 
    +
    44 
    +
    46 
    +
    51 XmlDoc parse(const std::string& stream); //throw XmlParsingError
    +
    52 
    +
    53 
    +
    54 
    +
    55 
    +
    56 
    +
    57 
    +
    58 
    +
    59 
    +
    60 
    +
    61 
    +
    62 
    +
    63 
    +
    64 
    +
    65 
    +
    66 
    +
    67 
    +
    68 
    +
    69 
    +
    70 
    +
    71 
    +
    72 //---------------------------- implementation ----------------------------
    +
    73 //see: http://www.w3.org/TR/xml/
    +
    74 
    +
    75 namespace implementation
    +
    76 {
    +
    77 inline
    +
    78 std::pair<char, char> hexify(unsigned char c)
    +
    79 {
    +
    80  auto hexifyDigit = [](int num) -> char //input [0, 15], output 0-9, A-F
    +
    81  {
    +
    82  assert(0 <= num&& num <= 15); //guaranteed by design below!
    +
    83  return static_cast<char>(num <= 9 ? //no signed/unsigned char problem here!
    +
    84  '0' + num :
    +
    85  'A' + (num - 10));
    +
    86  };
    +
    87  return std::make_pair(hexifyDigit(c / 16), hexifyDigit(c % 16));
    +
    88 }
    +
    89 
    +
    90 
    +
    91 inline
    +
    92 char unhexify(char high, char low)
    +
    93 {
    +
    94  auto unhexifyDigit = [](char hex) -> int //input 0-9, a-f, A-F; output range: [0, 15]
    +
    95  {
    +
    96  if ('0' <= hex && hex <= '9') //no signed/unsigned char problem here!
    +
    97  return hex - '0';
    +
    98  else if ('A' <= hex && hex <= 'F')
    +
    99  return (hex - 'A') + 10;
    +
    100  else if ('a' <= hex && hex <= 'f')
    +
    101  return (hex - 'a') + 10;
    +
    102  assert(false);
    +
    103  return 0;
    +
    104  };
    +
    105  return static_cast<unsigned char>(16 * unhexifyDigit(high) + unhexifyDigit(low)); //[!] convert to unsigned char first, then to char (which may be signed)
    +
    106 };
    +
    107 
    +
    108 
    +
    109 template <class Predicate> inline
    +
    110 std::string normalize(const std::string& str, Predicate pred) //pred: unary function taking a char, return true if value shall be encoded as hex
    +
    111 {
    +
    112  std::string output;
    +
    113  std::for_each(str.begin(), str.end(),
    +
    114  [&](char c)
    +
    115  {
    +
    116  if (c == '&') //
    +
    117  output += "&amp;";
    +
    118  else if (c == '<') //normalization mandatory: http://www.w3.org/TR/xml/#syntax
    +
    119  output += "&lt;";
    +
    120  else if (c == '>') //
    +
    121  output += "&gt;";
    +
    122  else if (pred(c))
    +
    123  {
    +
    124  if (c == '\'')
    +
    125  output += "&apos;";
    +
    126  else if (c == '\"')
    +
    127  output += "&quot;";
    +
    128  else
    +
    129  {
    +
    130  output += "&#x";
    +
    131  const auto hexDigits = hexify(c); //hexify beats "printNumber<std::string>("&#x%02X;", c)" by a nice factor of 3!
    +
    132  output += hexDigits.first;
    +
    133  output += hexDigits.second;
    +
    134  output += ';';
    +
    135  }
    +
    136  }
    +
    137  else
    +
    138  output += c;
    +
    139  });
    +
    140  return output;
    +
    141 }
    +
    142 
    +
    143 inline
    +
    144 std::string normalizeName(const std::string& str)
    +
    145 {
    +
    146  return normalize(str, [](char c) { return isWhiteSpace(c) || c == '=' || c == '/' || c == '\'' || c == '\"'; });
    +
    147 }
    +
    148 
    +
    149 inline
    +
    150 std::string normalizeElementValue(const std::string& str)
    +
    151 {
    +
    152  return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32; });
    +
    153 }
    +
    154 
    +
    155 inline
    +
    156 std::string normalizeAttribValue(const std::string& str)
    +
    157 {
    +
    158  return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32 || c == '\'' || c == '\"'; });
    +
    159 }
    +
    160 
    +
    161 
    +
    162 template <class CharIterator, size_t N> inline
    +
    163 bool checkEntity(CharIterator& first, CharIterator last, const char (&placeholder)[N])
    +
    164 {
    +
    165  assert(placeholder[N - 1] == 0);
    +
    166  const ptrdiff_t strLen = N - 1; //don't count null-terminator
    +
    167  if (last - first >= strLen && std::equal(first, first + strLen, placeholder))
    +
    168  {
    +
    169  first += strLen - 1;
    +
    170  return true;
    +
    171  }
    +
    172  return false;
    +
    173 }
    +
    174 
    +
    175 
    +
    176 namespace
    +
    177 {
    +
    178 std::string denormalize(const std::string& str)
    +
    179 {
    +
    180  std::string output;
    +
    181  for (auto it = str.begin(); it != str.end(); ++it)
    +
    182  {
    +
    183  const char c = *it;
    +
    184 
    +
    185  if (c == '&')
    +
    186  {
    +
    187  if (checkEntity(it, str.end(), "&amp;"))
    +
    188  output += '&';
    +
    189  else if (checkEntity(it, str.end(), "&lt;"))
    +
    190  output += '<';
    +
    191  else if (checkEntity(it, str.end(), "&gt;"))
    +
    192  output += '>';
    +
    193  else if (checkEntity(it, str.end(), "&apos;"))
    +
    194  output += '\'';
    +
    195  else if (checkEntity(it, str.end(), "&quot;"))
    +
    196  output += '\"';
    +
    197  else if (str.end() - it >= 6 &&
    +
    198  it[1] == '#' &&
    +
    199  it[2] == 'x' &&
    +
    200  it[5] == ';')
    +
    201  {
    +
    202  output += unhexify(it[3], it[4]); //unhexify beats "::sscanf(&it[3], "%02X", &tmp)" by a factor of 3000 for ~250000 calls!!!
    +
    203  it += 5;
    +
    204  }
    +
    205  else
    +
    206  output += c; //unexpected char!
    +
    207  }
    +
    208  else if (c == '\r') //map all end-of-line characters to \n http://www.w3.org/TR/xml/#sec-line-ends
    +
    209  {
    +
    210  auto itNext = it + 1;
    +
    211  if (itNext != str.end() && *itNext == '\n')
    +
    212  ++it;
    +
    213  output += '\n';
    +
    214  }
    +
    215  else
    +
    216  output += c;
    +
    217  };
    +
    218  return output;
    +
    219 }
    +
    220 
    +
    221 
    +
    222 void serialize(const XmlElement& element, std::string& stream,
    +
    223  const std::string& lineBreak,
    +
    224  const std::string& indent,
    +
    225  size_t indentLevel)
    +
    226 {
    +
    227  const std::string& nameFmt = normalizeName(element.getNameAs<std::string>());
    +
    228 
    +
    229  for (size_t i = 0; i < indentLevel; ++i)
    +
    230  stream += indent;
    +
    231 
    +
    232  stream += '<' + nameFmt;
    +
    233 
    +
    234  auto attr = element.getAttributes();
    +
    235  for (auto it = attr.first; it != attr.second; ++it)
    +
    236  stream += ' ' + normalizeName(it->first) + "=\"" + normalizeAttribValue(it->second) + '\"';
    +
    237 
    +
    238  //no support for mixed-mode content
    +
    239  auto iterPair = element.getChildren();
    +
    240  if (iterPair.first != iterPair.second) //structured element
    +
    241  {
    +
    242  stream += '>' + lineBreak;
    +
    243 
    +
    244  std::for_each(iterPair.first, iterPair.second,
    +
    245  [&](const XmlElement & el) { serialize(el, stream, lineBreak, indent, indentLevel + 1); });
    +
    246 
    +
    247  for (size_t i = 0; i < indentLevel; ++i)
    +
    248  stream += indent;
    +
    249  stream += "</" + nameFmt + '>' + lineBreak;
    +
    250  }
    +
    251  else
    +
    252  {
    +
    253  std::string value;
    +
    254  element.getValue(value);
    +
    255 
    +
    256  if (!value.empty()) //value element
    +
    257  stream += '>' + normalizeElementValue(value) + "</" + nameFmt + '>' + lineBreak;
    +
    258  else //empty element
    +
    259  stream += "/>" + lineBreak;
    +
    260  }
    +
    261 }
    +
    262 
    +
    263 std::string serialize(const XmlDoc& doc,
    +
    264  const std::string& lineBreak,
    +
    265  const std::string& indent)
    +
    266 {
    +
    267  std::string version = doc.getVersionAs<std::string>();
    +
    268  if (!version.empty())
    +
    269  version = " version=\"" + normalizeAttribValue(version) + '\"';
    +
    270 
    +
    271  std::string encoding = doc.getEncodingAs<std::string>();
    +
    272  if (!encoding.empty())
    +
    273  encoding = " encoding=\"" + normalizeAttribValue(encoding) + '\"';
    +
    274 
    +
    275  std::string standalone = doc.getStandaloneAs<std::string>();
    +
    276  if (!standalone.empty())
    +
    277  standalone = " standalone=\"" + normalizeAttribValue(standalone) + '\"';
    +
    278 
    +
    279  std::string output = "<?xml" + version + encoding + standalone + "?>" + lineBreak;
    +
    280  serialize(doc.root(), output, lineBreak, indent, 0);
    +
    281  return output;
    +
    282 }
    +
    283 }
    +
    284 }
    +
    285 
    +
    286 inline
    +
    287 std::string serialize(const XmlDoc& doc,
    +
    288  const std::string& lineBreak,
    +
    289  const std::string& indent) { return implementation::serialize(doc, lineBreak, indent); }
    +
    290 
    +
    291 /*
    +
    292 Grammar for XML parser
    +
    293 -------------------------------
    +
    294 document-expression:
    +
    295  <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
    +
    296  element-expression
    +
    297 
    +
    298 element-expression:
    +
    299  <string attributes-expression/>
    +
    300  <string attributes-expression> pm-expression </string>
    +
    301 
    +
    302 element-list-expression:
    +
    303  <empty>
    +
    304  element-expression element-list-expression
    +
    305 
    +
    306 attributes-expression:
    +
    307  <empty>
    +
    308  string="string" attributes-expression
    +
    309 
    +
    310 pm-expression:
    +
    311  string
    +
    312  element-list-expression
    +
    313 */
    +
    314 
    +
    315 namespace implementation
    +
    316 {
    +
    //One lexical unit of the XML input: either a piece of punctuation or a name.
    struct Token
    {
        enum Type
        {
            TK_LESS,          // <
            TK_GREATER,       // >
            TK_LESS_SLASH,    // </
            TK_SLASH_GREATER, // />
            TK_EQUAL,         // =
            TK_QUOTE,         // " or '
            TK_DECL_BEGIN,    // <?xml
            TK_DECL_END,      // ?>
            TK_NAME,          // anything else, see "name"
            TK_END            // end of input
        };

        //both constructors are intentionally implicit: Scanner::nextToken() returns plain Type and string values
        Token(Type t) :
            type(t) {}

        Token(const std::string& txt) :
            type(TK_NAME),
            name(txt) {}

        Type        type;
        std::string name; //only filled if type == TK_NAME
    };
    +
    339 
    +
    340 class Scanner
    +
    341 {
    +
    342 public:
    +
    343  Scanner(const std::string& stream) :
    +
    344  xmlCommentBegin("<!--"),
    +
    345  xmlCommentEnd ("-->"),
    +
    346  stream_(stream),
    +
    347  pos(stream_.begin())
    +
    348  {
    +
    349  if (zen::startsWith(stream_, BYTE_ORDER_MARK_UTF8))
    +
    350  pos += strLength(BYTE_ORDER_MARK_UTF8);
    +
    351 
    +
    352  tokens.push_back(std::make_pair("<?xml", Token::TK_DECL_BEGIN));
    +
    353  tokens.push_back(std::make_pair("?>", Token::TK_DECL_END));
    +
    354  tokens.push_back(std::make_pair("</", Token::TK_LESS_SLASH));
    +
    355  tokens.push_back(std::make_pair("/>", Token::TK_SLASH_GREATER));
    +
    356  tokens.push_back(std::make_pair("<" , Token::TK_LESS)); //evaluate after TK_DECL_BEGIN!
    +
    357  tokens.push_back(std::make_pair(">" , Token::TK_GREATER));
    +
    358  tokens.push_back(std::make_pair("=" , Token::TK_EQUAL));
    +
    359  tokens.push_back(std::make_pair("\"", Token::TK_QUOTE));
    +
    360  tokens.push_back(std::make_pair("\'", Token::TK_QUOTE));
    +
    361  }
    +
    362 
    +
    363  Token nextToken() //throw XmlParsingError
    +
    364  {
    +
    365  //skip whitespace
    +
    366  pos = std::find_if(pos, stream_.end(), [](char c) { return !zen::isWhiteSpace(c); });
    +
    367 
    +
    368  if (pos == stream_.end())
    +
    369  return Token::TK_END;
    +
    370 
    +
    371  //skip XML comments
    +
    372  if (startsWith(xmlCommentBegin))
    +
    373  {
    +
    374  auto it = std::search(pos + xmlCommentBegin.size(), stream_.end(), xmlCommentEnd.begin(), xmlCommentEnd.end());
    +
    375  if (it != stream_.end())
    +
    376  {
    +
    377  pos = it + xmlCommentEnd.size();
    +
    378  return nextToken();
    +
    379  }
    +
    380  }
    +
    381 
    +
    382  for (auto it = tokens.begin(); it != tokens.end(); ++it)
    +
    383  if (startsWith(it->first))
    +
    384  {
    +
    385  pos += it->first.size();
    +
    386  return it->second;
    +
    387  }
    +
    388 
    +
    389  auto nameEnd = std::find_if(pos, stream_.end(), [](char c)
    +
    390  {
    +
    391  return c == '<' ||
    +
    392  c == '>' ||
    +
    393  c == '=' ||
    +
    394  c == '/' ||
    +
    395  c == '\'' ||
    +
    396  c == '\"' ||
    +
    397  zen::isWhiteSpace(c);
    +
    398  });
    +
    399 
    +
    400  if (nameEnd != pos)
    +
    401  {
    +
    402  std::string name(&*pos, nameEnd - pos);
    +
    403  pos = nameEnd;
    +
    404  return implementation::denormalize(name);
    +
    405  }
    +
    406 
    +
    407  //unknown token
    +
    408  throw XmlParsingError(posRow(), posCol());
    +
    409  }
    +
    410 
    +
    411  std::string extractElementValue()
    +
    412  {
    +
    413  auto it = std::find_if(pos, stream_.end(), [](char c)
    +
    414  {
    +
    415  return c == '<' ||
    +
    416  c == '>';
    +
    417  });
    +
    418  std::string output(pos, it);
    +
    419  pos = it;
    +
    420  return implementation::denormalize(output);
    +
    421  }
    +
    422 
    +
    423  std::string extractAttributeValue()
    +
    424  {
    +
    425  auto it = std::find_if(pos, stream_.end(), [](char c)
    +
    426  {
    +
    427  return c == '<' ||
    +
    428  c == '>' ||
    +
    429  c == '\'' ||
    +
    430  c == '\"';
    +
    431  });
    +
    432  std::string output(pos, it);
    +
    433  pos = it;
    +
    434  return implementation::denormalize(output);
    +
    435  }
    +
    436 
    +
    437  size_t posRow() const //current row beginning with 0
    +
    438  {
    +
    439  const size_t crSum = std::count(stream_.begin(), pos, '\r'); //carriage returns
    +
    440  const size_t nlSum = std::count(stream_.begin(), pos, '\n'); //new lines
    +
    441  assert(crSum == 0 || nlSum == 0 || crSum == nlSum);
    +
    442  return std::max(crSum, nlSum); //be compatible with Linux/Mac/Win
    +
    443  }
    +
    444 
    +
    445  size_t posCol() const //current col beginning with 0
    +
    446  {
    +
    447  //seek beginning of line
    +
    448  for (auto it = pos; it != stream_.begin(); )
    +
    449  {
    +
    450  --it;
    +
    451  if (*it == '\r' || *it == '\n')
    +
    452  return pos - it - 1;
    +
    453  }
    +
    454  return pos - stream_.begin();
    +
    455  }
    +
    456 
    +
    457 private:
    +
    458  Scanner(const Scanner&);
    +
    459  Scanner& operator=(const Scanner&);
    +
    460 
    +
    461  bool startsWith(const std::string& prefix) const
    +
    462  {
    +
    463  if (stream_.end() - pos < static_cast<ptrdiff_t>(prefix.size()))
    +
    464  return false;
    +
    465  return std::equal(prefix.begin(), prefix.end(), pos);
    +
    466  }
    +
    467 
    +
    468  typedef std::vector<std::pair<std::string, Token::Type> > TokenList;
    +
    469  TokenList tokens;
    +
    470 
    +
    471  const std::string xmlCommentBegin;
    +
    472  const std::string xmlCommentEnd;
    +
    473 
    +
    474  const std::string stream_;
    +
    475  std::string::const_iterator pos;
    +
    476 };
    +
    477 
    +
    478 
    +
    479 class XmlParser
    +
    480 {
    +
    481 public:
    +
    482  XmlParser(const std::string& stream) :
    +
    483  scn(stream),
    +
    484  tk(scn.nextToken()) {}
    +
    485 
    +
    486  XmlDoc parse() //throw XmlParsingError
    +
    487  {
    +
    488  XmlDoc doc;
    +
    489 
    +
    490  //declaration (optional)
    +
    491  if (token().type == Token::TK_DECL_BEGIN)
    +
    492  {
    +
    493  nextToken();
    +
    494 
    +
    495  while (token().type == Token::TK_NAME)
    +
    496  {
    +
    497  std::string attribName = token().name;
    +
    498  nextToken();
    +
    499 
    +
    500  consumeToken(Token::TK_EQUAL);
    +
    501  expectToken(Token::TK_QUOTE);
    +
    502  std::string attribValue = scn.extractAttributeValue();
    +
    503  nextToken();
    +
    504 
    +
    505  consumeToken(Token::TK_QUOTE);
    +
    506 
    +
    507  if (attribName == "version")
    +
    508  doc.setVersion(attribValue);
    +
    509  else if (attribName == "encoding")
    +
    510  doc.setEncoding(attribValue);
    +
    511  else if (attribName == "standalone")
    +
    512  doc.setStandalone(attribValue);
    +
    513  }
    +
    514  consumeToken(Token::TK_DECL_END);
    +
    515  }
    +
    516 
    +
    517  XmlDoc dummy;
    +
    518  XmlElement& elemTmp = dummy.root();
    +
    519  parseChildElements(elemTmp);
    +
    520 
    +
    521  auto iterPair = elemTmp.getChildren();
    +
    522  if (iterPair.first != iterPair.second)
    +
    523  doc.root().swap(*iterPair.first);
    +
    524 
    +
    525  expectToken(Token::TK_END);
    +
    526  return doc;
    +
    527  };
    +
    528 
    +
    529 private:
    +
    530  XmlParser(const XmlParser&);
    +
    531  XmlParser& operator=(const XmlParser&);
    +
    532 
    +
    533  void parseChildElements(XmlElement& parent)
    +
    534  {
    +
    535  while (token().type == Token::TK_LESS)
    +
    536  {
    +
    537  nextToken();
    +
    538 
    +
    539  expectToken(Token::TK_NAME);
    +
    540  std::string elementName = token().name;
    +
    541  nextToken();
    +
    542 
    +
    543  XmlElement& newElement = parent.addChild(elementName);
    +
    544 
    +
    545  parseAttributes(newElement);
    +
    546 
    +
    547  if (token().type == Token::TK_SLASH_GREATER) //empty element
    +
    548  {
    +
    549  nextToken();
    +
    550  continue;
    +
    551  }
    +
    552 
    +
    553  expectToken(Token::TK_GREATER);
    +
    554  std::string elementValue = scn.extractElementValue();
    +
    555  nextToken();
    +
    556 
    +
    557  //no support for mixed-mode content
    +
    558  if (token().type == Token::TK_LESS) //structured element
    +
    559  parseChildElements(newElement);
    +
    560  else //value element
    +
    561  newElement.setValue(elementValue);
    +
    562 
    +
    563  consumeToken(Token::TK_LESS_SLASH);
    +
    564 
    +
    565  if (token().type != Token::TK_NAME ||
    +
    566  elementName != token().name)
    +
    567  throw XmlParsingError(scn.posRow(), scn.posCol());
    +
    568  nextToken();
    +
    569 
    +
    570  consumeToken(Token::TK_GREATER);
    +
    571  }
    +
    572  };
    +
    573 
    +
    574  void parseAttributes(XmlElement& element)
    +
    575  {
    +
    576  while (token().type == Token::TK_NAME)
    +
    577  {
    +
    578  std::string attribName = token().name;
    +
    579  nextToken();
    +
    580 
    +
    581  consumeToken(Token::TK_EQUAL);
    +
    582  expectToken(Token::TK_QUOTE);
    +
    583  std::string attribValue = scn.extractAttributeValue();
    +
    584  nextToken();
    +
    585 
    +
    586  consumeToken(Token::TK_QUOTE);
    +
    587  element.setAttribute(attribName, attribValue);
    +
    588  }
    +
    589  }
    +
    590 
    +
    591  const Token& token() const { return tk; }
    +
    592  void nextToken() { tk = scn.nextToken(); }
    +
    593 
    +
    594  void consumeToken(Token::Type t) //throw XmlParsingError
    +
    595  {
    +
    596  expectToken(t); //throw XmlParsingError
    +
    597  nextToken();
    +
    598  }
    +
    599 
    +
    600  void expectToken(Token::Type t) //throw XmlParsingError
    +
    601  {
    +
    602  if (token().type != t)
    +
    603  throw XmlParsingError(scn.posRow(), scn.posCol());
    +
    604  }
    +
    605 
    +
    606  Scanner scn;
    +
    607  Token tk;
    +
    608 };
    +
    609 }
    +
    610 
    +
    611 inline
    +
    612 XmlDoc parse(const std::string& stream) //throw XmlParsingError
    +
    613 {
    +
    614  return implementation::XmlParser(stream).parse(); //throw XmlParsingError
    +
    615 }
    +
    616 }
    +
    617 
    +
    618 #endif //ZEN_XML_PARSER_HEADER_81248670213764583021432
    +
    XmlDoc parse(const std::string &stream)
    Load XML document from a byte stream.
    Definition: parser.h:612
    +
    std::string serialize(const XmlDoc &doc, const std::string &lineBreak="\r\n", const std::string &indent=" ")
    Save XML document as a byte stream.
    Definition: parser.h:287
    +
    size_t row
    Input file row where the parsing error occurred (zero-based)
    Definition: parser.h:39
    +
    The complete XML document.
    Definition: dom.h:249
    +
    size_t col
    Input file column where the parsing error occurred (zero-based)
    Definition: parser.h:41
    +
    Exception thrown due to an XML parsing error.
    Definition: parser.h:35
    +
    Exception base class for zen::Xml.
    Definition: error.h:13
    +
    + - -- cgit