zenxml/doc/parser_8h_source.html

00001 // **************************************************************************
-00002 // * This file is part of the zen::Xml project. It is distributed under the *
-00003 // * Boost Software License: http://www.boost.org/LICENSE_1_0.txt           *
-00004 // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved        *
-00005 // **************************************************************************
-00006
-00007 #ifndef ZEN_XML_PARSER_HEADER_81248670213764583021432
-00008 #define ZEN_XML_PARSER_HEADER_81248670213764583021432
-00009
-00010 #include <cstdio>
-00011 #include <cstddef> //ptrdiff_t; req. on Linux
-00012 #include <zen/string_traits.h>
-00013 #include "dom.h"
-00014 #include "error.h"
-00015
-00016 namespace zen
-00017 {
-00023
-00024
-00030 std::string serialize(const XmlDoc& doc,
-00031                       const std::string& lineBreak = "\r\n",
-00032                       const std::string& indent = "    "); //throw ()
-00033
-00035 struct XmlParsingError : public XmlError
-00036 {
-00037     XmlParsingError(size_t rowNo, size_t colNo) : row(rowNo), col(colNo) {}
-00039     size_t row; //beginning with 0
-00041     size_t col; //
-00042 };
-00043
-00044
-00046
-00051 void parse(const std::string& stream, XmlDoc& doc); //throw XmlParsingError
-00052
-00053
-00054
-00055
-00056
-00057
-00058
-00059
-00060
-00061
-00062
-00063
-00064
-00065
-00066
-00067
-00068
-00069
-00070
-00071
-00072 //---------------------------- implementation ----------------------------
-00073 //see: http://www.w3.org/TR/xml/
-00074
-00075 namespace implementation
-00076 {
//Convert a byte into its two-character upper-case hexadecimal representation.
//returns: pair (high nibble digit, low nibble digit), each in 0-9, A-F
inline
std::pair<char, char> hexify(unsigned char c)
{
    static const char hexDigits[] = "0123456789ABCDEF";

    const unsigned int highNibble = c / 16;
    const unsigned int lowNibble  = c % 16;
    assert(highNibble <= 15); //holds for 8-bit bytes; lowNibble is in range by construction

    return std::pair<char, char>(hexDigits[highNibble], hexDigits[lowNibble]);
}
-00089
-00090
//Combine two hexadecimal digits (0-9, a-f, A-F) into the byte they encode.
//Any other input character is a programming error: asserts, and counts as 0 when asserts are disabled.
inline
char unhexify(char high, char low)
{
    auto digitValue = [](char digit) -> int //output range: [0, 15]
    {
        //plain char comparisons are safe here: all digit characters have positive values
        if (digit >= '0' && digit <= '9')
            return digit - '0';
        if (digit >= 'A' && digit <= 'F')
            return 10 + (digit - 'A');
        if (digit >= 'a' && digit <= 'f')
            return 10 + (digit - 'a');

        assert(false);
        return 0;
    };

    const int byteValue = digitValue(high) * 16 + digitValue(low);
    return static_cast<unsigned char>(byteValue); //[!] go through unsigned char first, then to char (which may be signed)
}
-00107
-00108
-00109 template <class Predicate> inline
-00110 std::string normalize(const std::string& str, Predicate pred) //pred: unary function taking a char, return true if value shall be encoded as hex
-00111 {
-00112     std::string output;
-00113     std::for_each(str.begin(), str.end(),
-00114                   [&](char c)
-00115     {
-00116         if (c == '&')      //
-00117             output += "&amp;";
-00118         else if (c == '<') //normalization mandatory: http://www.w3.org/TR/xml/#syntax
-00119             output += "&lt;";
-00120         else if (c == '>') //
-00121             output += "&gt;";
-00122         else if (pred(c))
-00123         {
-00124             if (c == '\'')
-00125                 output += "&apos;";
-00126             else if (c == '\"')
-00127                 output += "&quot;";
-00128             else
-00129             {
-00130                 output += "&#x";
-00131                 const auto hexDigits = hexify(c); //hexify beats "printNumber<std::string>("&#x%02X;", c)" by a nice factor of 3!
-00132                 output += hexDigits.first;
-00133                 output += hexDigits.second;
-00134                 output += ';';
-00135             }
-00136         }
-00137         else
-00138             output += c;
-00139     });
-00140     return output;
-00141 }
-00142
-00143 inline
-00144 std::string normalizeName(const std::string& str)
-00145 {
-00146     return normalize(str, [](char c) { return isWhiteSpace(c) || c == '=' || c == '/' || c == '\'' || c == '\"'; });
-00147 }
-00148
-00149 inline
-00150 std::string normalizeElementValue(const std::string& str)
-00151 {
-00152     return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32; });
-00153 }
-00154
-00155 inline
-00156 std::string normalizeAttribValue(const std::string& str)
-00157 {
-00158     return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32 || c == '\'' || c == '\"'; });
-00159 }
-00160
-00161
//Test whether the range [first, last) begins with the string literal "placeholder".
//On a match "first" is advanced to the LAST character of the match (not one past it), so that a
//subsequent ++first by the caller steps over the whole entity; without a match "first" is left untouched.
template <class CharIterator, size_t N> inline
bool checkEntity(CharIterator& first, CharIterator last, const char (&placeholder)[N])
{
    assert(placeholder[N - 1] == 0);
    const ptrdiff_t entityLen = static_cast<ptrdiff_t>(N) - 1; //don't count null-terminator

    if (last - first < entityLen)
        return false; //not enough characters left

    if (!std::equal(placeholder, placeholder + entityLen, first))
        return false;

    first += entityLen - 1;
    return true;
}
-00174
-00175
-00176 namespace
-00177 {
-00178 std::string denormalize(const std::string& str)
-00179 {
-00180     std::string output;
-00181     for (auto it = str.begin(); it != str.end(); ++it)
-00182     {
-00183         const char c = *it;
-00184
-00185         if (c == '&')
-00186         {
-00187             if (checkEntity(it, str.end(), "&amp;"))
-00188                 output += '&';
-00189             else if (checkEntity(it, str.end(), "&lt;"))
-00190                 output += '<';
-00191             else if (checkEntity(it, str.end(), "&gt;"))
-00192                 output += '>';
-00193             else if (checkEntity(it, str.end(), "&apos;"))
-00194                 output += '\'';
-00195             else if (checkEntity(it, str.end(), "&quot;"))
-00196                 output += '\"';
-00197             else if (str.end() - it >= 6 &&
-00198                      it[1] == '#' &&
-00199                      it[2] == 'x' &&
-00200                      it[5] == ';')
-00201             {
-00202                 output += unhexify(it[3], it[4]); //unhexify beats "::sscanf(&it[3], "%02X", &tmp)" by a factor of 3000 for ~250000 calls!!!
-00203                 it += 5;
-00204             }
-00205             else
-00206                 output += c; //unexpected char!
-00207         }
-00208         else if (c == '\r') //map all end-of-line characters to \n http://www.w3.org/TR/xml/#sec-line-ends
-00209         {
-00210             auto itNext = it + 1;
-00211             if (itNext != str.end() && *itNext == '\n')
-00212                 ++it;
-00213             output += '\n';
-00214         }
-00215         else
-00216             output += c;
-00217     };
-00218     return output;
-00219 }
-00220
-00221
-00222 void serialize(const XmlElement& element, std::string& stream,
-00223                const std::string& lineBreak,
-00224                const std::string& indent,
-00225                size_t indentLevel)
-00226 {
-00227     const std::string& nameFmt = normalizeName(element.getNameAs<std::string>());
-00228
-00229     for (size_t i = 0; i < indentLevel; ++i)
-00230         stream += indent;
-00231
-00232     stream += '<' + nameFmt;
-00233
-00234     auto attr = element.getAttributes();
-00235     for (auto it = attr.first; it != attr.second; ++it)
-00236         stream += ' ' + normalizeName(it->first) + "=\"" + normalizeAttribValue(it->second) + '\"';
-00237
-00238     //no support for mixed-mode content
-00239     auto iterPair = element.getChildren();
-00240     if (iterPair.first != iterPair.second) //structured element
-00241     {
-00242         stream += '>' + lineBreak;
-00243
-00244         std::for_each(iterPair.first, iterPair.second,
-00245         [&](const XmlElement & el) { serialize(el, stream, lineBreak, indent, indentLevel + 1); });
-00246
-00247         for (size_t i = 0; i < indentLevel; ++i)
-00248             stream += indent;
-00249         stream += "</" + nameFmt + '>' + lineBreak;
-00250     }
-00251     else
-00252     {
-00253         std::string value;
-00254         element.getValue(value);
-00255
-00256         if (!value.empty()) //value element
-00257             stream += '>' + normalizeElementValue(value) + "</" + nameFmt + '>' + lineBreak;
-00258         else //empty element
-00259             stream += "/>" + lineBreak;
-00260     }
-00261 }
-00262
-00263 std::string serialize(const XmlDoc& doc,
-00264                       const std::string& lineBreak,
-00265                       const std::string& indent)
-00266 {
-00267     std::string version = doc.getVersionAs<std::string>();
-00268     if (!version.empty())
-00269         version = " version=\"" + normalizeAttribValue(version) + '\"';
-00270
-00271     std::string encoding = doc.getEncodingAs<std::string>();
-00272     if (!encoding.empty())
-00273         encoding = " encoding=\"" + normalizeAttribValue(encoding) + '\"';
-00274
-00275     std::string standalone = doc.getStandaloneAs<std::string>();
-00276     if (!standalone.empty())
-00277         standalone = " standalone=\"" + normalizeAttribValue(standalone) + '\"';
-00278
-00279     std::string output = "<?xml" + version + encoding + standalone + "?>" + lineBreak;
-00280     serialize(doc.root(), output, lineBreak, indent, 0);
-00281     return output;
-00282 }
-00283 }
-00284 }
-00285
-00286 inline
-00287 std::string serialize(const XmlDoc& doc,
-00288                       const std::string& lineBreak,
-00289                       const std::string& indent) { return implementation::serialize(doc, lineBreak, indent); }
-00290
-00291 /*
-00292 Grammar for XML parser
-00293 -------------------------------
-00294 document-expression:
-00295     <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
-00296     element-expression
-00297
-00298 element-expression:
-00299     <string attributes-expression/>
-00300     <string attributes-expression> pm-expression </string>
-00301
-00302 element-list-expression:
-00303     <empty>
-00304     element-expression element-list-expression
-00305
-00306 attributes-expression:
-00307     <empty>
-00308     string="string" attributes-expression
-00309
-00310 pm-expression:
-00311     string
-00312     element-list-expression
-00313 */
-00314
-00315 namespace implementation
-00316 {
//Lexical unit produced by the scanner: either one of the fixed XML symbols or a name carrying its text
struct Token
{
    enum Type
    {
        TK_LESS,          // <
        TK_GREATER,       // >
        TK_LESS_SLASH,    // </
        TK_SLASH_GREATER, // />
        TK_EQUAL,         // =
        TK_QUOTE,         // " or '
        TK_DECL_BEGIN,    // <?xml
        TK_DECL_END,      // ?>
        TK_NAME,          //any other text, see "name"
        TK_END            //end of input
    };

    //both constructors are intentionally non-explicit: a Type or a string converts implicitly into a Token
    Token(Type t) :
        type(t) {}

    Token(const std::string& txt) :
        type(TK_NAME),
        name(txt) {}

    Type type;
    std::string name; //filled if type == TK_NAME
};
-00339
-00340 class Scanner
-00341 {
-00342 public:
-00343     Scanner(const std::string& stream) : stream_(stream), pos(stream_.begin())
-00344     {
-00345         if (zen::startsWith(stream_, BYTE_ORDER_MARK_UTF8))
-00346             pos += strLength(BYTE_ORDER_MARK_UTF8);
-00347
-00348         tokens.push_back(std::make_pair("<?xml", Token::TK_DECL_BEGIN));
-00349         tokens.push_back(std::make_pair("?>",    Token::TK_DECL_END));
-00350         tokens.push_back(std::make_pair("</", Token::TK_LESS_SLASH));
-00351         tokens.push_back(std::make_pair("/>", Token::TK_SLASH_GREATER));
-00352         tokens.push_back(std::make_pair("<" , Token::TK_LESS)); //evaluate after TK_DECL_BEGIN!
-00353         tokens.push_back(std::make_pair(">" , Token::TK_GREATER));
-00354         tokens.push_back(std::make_pair("=" , Token::TK_EQUAL));
-00355         tokens.push_back(std::make_pair("\"", Token::TK_QUOTE));
-00356         tokens.push_back(std::make_pair("\'", Token::TK_QUOTE));
-00357     }
-00358
-00359     Token nextToken() //throw XmlParsingError
-00360     {
-00361         //skip whitespace
-00362         pos = std::find_if(pos, stream_.end(), [](char c) { return !zen::isWhiteSpace(c); });
-00363
-00364         if (pos == stream_.end())
-00365             return Token::TK_END;
-00366
-00367         for (auto it = tokens.begin(); it != tokens.end(); ++it)
-00368             if (startsWith(pos, it->first))
-00369             {
-00370                 pos += it->first.size();
-00371                 return it->second;
-00372             }
-00373
-00374         auto nameEnd = std::find_if(pos, stream_.end(), [](char c)
-00375         {
-00376             return c == '<'  ||
-00377                    c == '>'  ||
-00378                    c == '='  ||
-00379                    c == '/'  ||
-00380                    c == '\'' ||
-00381                    c == '\"' ||
-00382                    zen::isWhiteSpace(c);
-00383         });
-00384
-00385         if (nameEnd != pos)
-00386         {
-00387             std::string name(&*pos, nameEnd - pos);
-00388             pos = nameEnd;
-00389             return implementation::denormalize(name);
-00390         }
-00391
-00392         //unknown token
-00393         throw XmlParsingError(posRow(), posCol());
-00394     }
-00395
-00396     std::string extractElementValue()
-00397     {
-00398         auto it = std::find_if(pos, stream_.end(), [](char c)
-00399         {
-00400             return c == '<'  ||
-00401                    c == '>';
-00402         });
-00403         std::string output(pos, it);
-00404         pos = it;
-00405         return implementation::denormalize(output);
-00406     }
-00407
-00408     std::string extractAttributeValue()
-00409     {
-00410         auto it = std::find_if(pos, stream_.end(), [](char c)
-00411         {
-00412             return c == '<'  ||
-00413                    c == '>'  ||
-00414                    c == '\'' ||
-00415                    c == '\"';
-00416         });
-00417         std::string output(pos, it);
-00418         pos = it;
-00419         return implementation::denormalize(output);
-00420     }
-00421
-00422     size_t posRow() const //current row beginning with 0
-00423     {
-00424         const size_t crSum = std::count(stream_.begin(), pos, '\r'); //carriage returns
-00425         const size_t nlSum = std::count(stream_.begin(), pos, '\n'); //new lines
-00426         assert(crSum == 0 || nlSum == 0 || crSum == nlSum);
-00427         return std::max(crSum, nlSum); //be compatible with Linux/Mac/Win
-00428     }
-00429
-00430     size_t posCol() const //current col beginning with 0
-00431     {
-00432         //seek beginning of line
-00433         for (auto it = pos; it != stream_.begin(); )
-00434         {
-00435             --it;
-00436             if (*it == '\r' || *it == '\n')
-00437                 return pos - it - 1;
-00438         }
-00439         return pos - stream_.begin();
-00440     }
-00441
-00442 private:
-00443     Scanner(const Scanner&);
-00444     Scanner& operator=(const Scanner&);
-00445
-00446     bool startsWith(std::string::const_iterator it, const std::string& prefix) const
-00447     {
-00448         if (stream_.end() - it < static_cast<ptrdiff_t>(prefix.size()))
-00449             return false;
-00450         return std::equal(prefix.begin(), prefix.end(), it);
-00451     }
-00452
-00453     typedef std::vector<std::pair<std::string, Token::Type> > TokenList;
-00454     TokenList tokens;
-00455
-00456     const std::string stream_;
-00457     std::string::const_iterator pos;
-00458 };
-00459
-00460
-00461 class XmlParser
-00462 {
-00463 public:
-00464     XmlParser(const std::string& stream) :
-00465         scn(stream),
-00466         tk(scn.nextToken()) {}
-00467
-00468     void parse(XmlDoc& doc) //throw XmlParsingError
-00469     {
-00470         //declaration (optional)
-00471         if (token().type == Token::TK_DECL_BEGIN)
-00472         {
-00473             nextToken();
-00474
-00475             while (token().type == Token::TK_NAME)
-00476             {
-00477                 std::string attribName = token().name;
-00478                 nextToken();
-00479
-00480                 consumeToken(Token::TK_EQUAL);
-00481                 expectToken(Token::TK_QUOTE);
-00482                 std::string attribValue = scn.extractAttributeValue();
-00483                 nextToken();
-00484
-00485                 consumeToken(Token::TK_QUOTE);
-00486
-00487                 if (attribName == "version")
-00488                     doc.setVersion(attribValue);
-00489                 else if (attribName == "encoding")
-00490                     doc.setEncoding(attribValue);
-00491                 else if (attribName == "standalone")
-00492                     doc.setStandalone(attribValue);
-00493             }
-00494             consumeToken(Token::TK_DECL_END);
-00495         }
-00496
-00497         XmlDoc dummy;
-00498         XmlElement& elemTmp = dummy.root();
-00499         parseChildElements(elemTmp);
-00500
-00501         auto iterPair = elemTmp.getChildren();
-00502         if (iterPair.first != iterPair.second)
-00503             doc.root().swap(*iterPair.first);
-00504
-00505         expectToken(Token::TK_END);
-00506     };
-00507
-00508 private:
-00509     XmlParser(const XmlParser&);
-00510     XmlParser& operator=(const XmlParser&);
-00511
-00512     void parseChildElements(XmlElement& parent)
-00513     {
-00514         while (token().type == Token::TK_LESS)
-00515         {
-00516             nextToken();
-00517
-00518             expectToken(Token::TK_NAME);
-00519             std::string elementName = token().name;
-00520             nextToken();
-00521
-00522             XmlElement& newElement = parent.addChild(elementName);
-00523
-00524             parseAttributes(newElement);
-00525
-00526             if (token().type == Token::TK_SLASH_GREATER) //empty element
-00527             {
-00528                 nextToken();
-00529                 continue;
-00530             }
-00531
-00532             expectToken(Token::TK_GREATER);
-00533             std::string elementValue = scn.extractElementValue();
-00534             nextToken();
-00535
-00536             //no support for mixed-mode content
-00537             if (token().type == Token::TK_LESS) //structured element
-00538                 parseChildElements(newElement);
-00539             else //value element
-00540                 newElement.setValue(elementValue);
-00541
-00542             consumeToken(Token::TK_LESS_SLASH);
-00543
-00544             if (token().type != Token::TK_NAME ||
-00545                 elementName != token().name)
-00546                 throw XmlParsingError(scn.posRow(), scn.posCol());
-00547             nextToken();
-00548
-00549             consumeToken(Token::TK_GREATER);
-00550         }
-00551     };
-00552
-00553     void parseAttributes(XmlElement& element)
-00554     {
-00555         while (token().type == Token::TK_NAME)
-00556         {
-00557             std::string attribName = token().name;
-00558             nextToken();
-00559
-00560             consumeToken(Token::TK_EQUAL);
-00561             expectToken(Token::TK_QUOTE);
-00562             std::string attribValue = scn.extractAttributeValue();
-00563             nextToken();
-00564
-00565             consumeToken(Token::TK_QUOTE);
-00566             element.setAttribute(attribName, attribValue);
-00567         }
-00568     }
-00569
-00570     const Token& token() const { return tk; }
-00571     void nextToken() { tk = scn.nextToken(); }
-00572
-00573     void consumeToken(Token::Type t) //throw XmlParsingError
-00574     {
-00575         expectToken(t); //throw XmlParsingError
-00576         nextToken();
-00577     }
-00578
-00579     void expectToken(Token::Type t) //throw XmlParsingError
-00580     {
-00581         if (token().type != t)
-00582             throw XmlParsingError(scn.posRow(), scn.posCol());
-00583     }
-00584
-00585     Scanner scn;
-00586     Token tk;
-00587 };
-00588 }
-00589
-00590 inline
-00591 void parse(const std::string& stream, XmlDoc& doc) //throw XmlParsingError
-00592 {
-00593     implementation::XmlParser(stream).parse(doc);  //throw XmlParsingError
-00594 }
-00595 }
-00596
-00597 #endif //ZEN_XML_PARSER_HEADER_81248670213764583021432
-