zen::Xml
Simple C++ XML Processing
 All Classes Namespaces Functions Variables
parser.h
00001 // **************************************************************************
00002 // * This file is part of the zen::Xml project. It is distributed under the *
00003 // * Boost Software License: http://www.boost.org/LICENSE_1_0.txt           *
00004 // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved        *
00005 // **************************************************************************
00006 
00007 #ifndef ZEN_XML_PARSER_HEADER_81248670213764583021432
00008 #define ZEN_XML_PARSER_HEADER_81248670213764583021432
00009 
00010 #include <cstdio>
00011 #include <cstddef> //ptrdiff_t; req. on Linux
00012 #include <zen/string_traits.h>
00013 #include "dom.h"
00014 #include "error.h"
00015 
00016 namespace zen
00017 {
00023 
00024 
//Convert an XML document into its textual representation.
//  doc:       document to serialize
//  lineBreak: text appended after the XML declaration and after each element line (default: "\r\n")
//  indent:    text repeated once per nesting level in front of each element (default: four spaces)
//  returns:   the XML declaration followed by the serialized root element
00030 std::string serialize(const XmlDoc& doc,
00031                       const std::string& lineBreak = "\r\n",
00032                       const std::string& indent = "    "); //throw ()
00033 
00035 struct XmlParsingError : public XmlError
00036 {
00037     XmlParsingError(size_t rowNo, size_t colNo) : row(rowNo), col(colNo) {}
00039     size_t row; //beginning with 0
00041     size_t col; //
00042 };
00043 
00044 
00046 
//Parse XML text into a document object.
//  stream: the XML text to parse
//  doc:    receives the declaration attributes (version, encoding, standalone) and the parsed root element
//  throws XmlParsingError (carrying row/column of the failure) if the input cannot be parsed
00051 void parse(const std::string& stream, XmlDoc& doc); //throw XmlParsingError
00052 
00053 
00054 
00055 
00056 
00057 
00058 
00059 
00060 
00061 
00062 
00063 
00064 
00065 
00066 
00067 
00068 
00069 
00070 
00071 
00072 //---------------------------- implementation ----------------------------
00073 //see: http://www.w3.org/TR/xml/
00074 
00075 namespace implementation
00076 {
//Convert a byte into its two-digit upper-case hexadecimal representation.
//returns: (high nibble digit, low nibble digit), each one of 0-9, A-F
inline
std::pair<char, char> hexify(unsigned char c)
{
    const char digits[] = "0123456789ABCDEF"; //index [0, 15] -> hex digit

    const int highNibble = c / 16; //always within [0, 15] for an unsigned char
    const int lowNibble  = c % 16; //

    return std::make_pair(digits[highNibble], digits[lowNibble]);
}
00089 
00090 
//Convert two hexadecimal digits (0-9, a-f, A-F) back into the byte they encode.
//An invalid digit triggers an assertion and contributes the value 0.
inline
char unhexify(char high, char low)
{
    auto digitValue = [](char hex) -> int //output range: [0, 15]
    {
        if (hex >= '0' && hex <= '9') //no signed/unsigned char problem here!
            return hex - '0';
        if (hex >= 'A' && hex <= 'F')
            return 10 + (hex - 'A');
        if (hex >= 'a' && hex <= 'f')
            return 10 + (hex - 'a');

        assert(false); //not a hex digit
        return 0;
    };

    const int byteValue = digitValue(high) * 16 + digitValue(low);

    //[!] convert to unsigned char first, then to char (which may be signed)
    return static_cast<unsigned char>(byteValue);
}
00107 
00108 
00109 template <class Predicate> inline
00110 std::string normalize(const std::string& str, Predicate pred) //pred: unary function taking a char, return true if value shall be encoded as hex
00111 {
00112     std::string output;
00113     std::for_each(str.begin(), str.end(),
00114                   [&](char c)
00115     {
00116         if (c == '&')      //
00117             output += "&amp;";
00118         else if (c == '<') //normalization mandatory: http://www.w3.org/TR/xml/#syntax
00119             output += "&lt;";
00120         else if (c == '>') //
00121             output += "&gt;";
00122         else if (pred(c))
00123         {
00124             if (c == '\'')
00125                 output += "&apos;";
00126             else if (c == '\"')
00127                 output += "&quot;";
00128             else
00129             {
00130                 output += "&#x";
00131                 const auto hexDigits = hexify(c); //hexify beats "printNumber<std::string>("&#x%02X;", c)" by a nice factor of 3!
00132                 output += hexDigits.first;
00133                 output += hexDigits.second;
00134                 output += ';';
00135             }
00136         }
00137         else
00138             output += c;
00139     });
00140     return output;
00141 }
00142 
00143 inline
00144 std::string normalizeName(const std::string& str)
00145 {
00146     return normalize(str, [](char c) { return isWhiteSpace(c) || c == '=' || c == '/' || c == '\'' || c == '\"'; });
00147 }
00148 
00149 inline
00150 std::string normalizeElementValue(const std::string& str)
00151 {
00152     return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32; });
00153 }
00154 
00155 inline
00156 std::string normalizeAttribValue(const std::string& str)
00157 {
00158     return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32 || c == '\'' || c == '\"'; });
00159 }
00160 
00161 
//Test whether the range [first, last) begins with the string literal "placeholder".
//On a match "first" is advanced to the LAST character of the placeholder (not one past it),
//so that a caller's subsequent ++first steps over the entity; otherwise "first" is left untouched.
template <class CharIterator, size_t N> inline
bool checkEntity(CharIterator& first, CharIterator last, const char (&placeholder)[N])
{
    assert(placeholder[N - 1] == 0);
    const ptrdiff_t entityLen = N - 1; //don't count null-terminator

    if (last - first < entityLen)
        return false; //not enough input left

    if (!std::equal(first, first + entityLen, placeholder))
        return false;

    first += entityLen - 1;
    return true;
}
00174 
00175 
00176 namespace
00177 {
00178 std::string denormalize(const std::string& str)
00179 {
00180     std::string output;
00181     for (auto it = str.begin(); it != str.end(); ++it)
00182     {
00183         const char c = *it;
00184 
00185         if (c == '&')
00186         {
00187             if (checkEntity(it, str.end(), "&amp;"))
00188                 output += '&';
00189             else if (checkEntity(it, str.end(), "&lt;"))
00190                 output += '<';
00191             else if (checkEntity(it, str.end(), "&gt;"))
00192                 output += '>';
00193             else if (checkEntity(it, str.end(), "&apos;"))
00194                 output += '\'';
00195             else if (checkEntity(it, str.end(), "&quot;"))
00196                 output += '\"';
00197             else if (str.end() - it >= 6 &&
00198                      it[1] == '#' &&
00199                      it[2] == 'x' &&
00200                      it[5] == ';')
00201             {
00202                 output += unhexify(it[3], it[4]); //unhexify beats "::sscanf(&it[3], "%02X", &tmp)" by a factor of 3000 for ~250000 calls!!!
00203                 it += 5;
00204             }
00205             else
00206                 output += c; //unexpected char!
00207         }
00208         else if (c == '\r') //map all end-of-line characters to \n http://www.w3.org/TR/xml/#sec-line-ends
00209         {
00210             auto itNext = it + 1;
00211             if (itNext != str.end() && *itNext == '\n')
00212                 ++it;
00213             output += '\n';
00214         }
00215         else
00216             output += c;
00217     };
00218     return output;
00219 }
00220 
00221 
00222 void serialize(const XmlElement& element, std::string& stream,
00223                const std::string& lineBreak,
00224                const std::string& indent,
00225                size_t indentLevel)
00226 {
00227     const std::string& nameFmt = normalizeName(element.getNameAs<std::string>());
00228 
00229     for (size_t i = 0; i < indentLevel; ++i)
00230         stream += indent;
00231 
00232     stream += '<' + nameFmt;
00233 
00234     auto attr = element.getAttributes();
00235     for (auto it = attr.first; it != attr.second; ++it)
00236         stream += ' ' + normalizeName(it->first) + "=\"" + normalizeAttribValue(it->second) + '\"';
00237 
00238     //no support for mixed-mode content
00239     auto iterPair = element.getChildren();
00240     if (iterPair.first != iterPair.second) //structured element
00241     {
00242         stream += '>' + lineBreak;
00243 
00244         std::for_each(iterPair.first, iterPair.second,
00245         [&](const XmlElement & el) { serialize(el, stream, lineBreak, indent, indentLevel + 1); });
00246 
00247         for (size_t i = 0; i < indentLevel; ++i)
00248             stream += indent;
00249         stream += "</" + nameFmt + '>' + lineBreak;
00250     }
00251     else
00252     {
00253         std::string value;
00254         element.getValue(value);
00255 
00256         if (!value.empty()) //value element
00257             stream += '>' + normalizeElementValue(value) + "</" + nameFmt + '>' + lineBreak;
00258         else //empty element
00259             stream += "/>" + lineBreak;
00260     }
00261 }
00262 
00263 std::string serialize(const XmlDoc& doc,
00264                       const std::string& lineBreak,
00265                       const std::string& indent)
00266 {
00267     std::string version = doc.getVersionAs<std::string>();
00268     if (!version.empty())
00269         version = " version=\"" + normalizeAttribValue(version) + '\"';
00270 
00271     std::string encoding = doc.getEncodingAs<std::string>();
00272     if (!encoding.empty())
00273         encoding = " encoding=\"" + normalizeAttribValue(encoding) + '\"';
00274 
00275     std::string standalone = doc.getStandaloneAs<std::string>();
00276     if (!standalone.empty())
00277         standalone = " standalone=\"" + normalizeAttribValue(standalone) + '\"';
00278 
00279     std::string output = "<?xml" + version + encoding + standalone + "?>" + lineBreak;
00280     serialize(doc.root(), output, lineBreak, indent, 0);
00281     return output;
00282 }
00283 }
00284 }
00285 
00286 inline
00287 std::string serialize(const XmlDoc& doc,
00288                       const std::string& lineBreak,
00289                       const std::string& indent) { return implementation::serialize(doc, lineBreak, indent); }
00290 
00291 /*
00292 Grammar for XML parser
00293 -------------------------------
00294 document-expression:
00295     <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
00296     element-expression
00297 
00298 element-expression:
00299     <string attributes-expression/>
00300     <string attributes-expression> pm-expression </string>
00301 
00302 element-list-expression:
00303     <empty>
00304     element-expression element-list-expression
00305 
00306 attributes-expression:
00307     <empty>
00308     string="string" attributes-expression
00309 
00310 pm-expression:
00311     string
00312     element-list-expression
00313 */
00314 
00315 namespace implementation
00316 {
//A single lexical unit produced by the scanner.
struct Token
{
    enum Type
    {
        TK_LESS,          // <
        TK_GREATER,       // >
        TK_LESS_SLASH,    // </
        TK_SLASH_GREATER, // />
        TK_EQUAL,         // =
        TK_QUOTE,         // " or '
        TK_DECL_BEGIN,    // <?xml
        TK_DECL_END,      // ?>
        TK_NAME,          // any other text; see member "name"
        TK_END            // end of input
    };

    //both constructors are intentionally implicit: the scanner returns a Type or a std::string where a Token is expected
    Token(Type t) :
        type(t)
    {
    }

    Token(const std::string& txt) :
        type(TK_NAME),
        name(txt)
    {
    }

    Type type;
    std::string name; //filled if type == TK_NAME
};
00339 
00340 class Scanner
00341 {
00342 public:
00343     Scanner(const std::string& stream) : stream_(stream), pos(stream_.begin())
00344     {
00345         if (zen::startsWith(stream_, BYTE_ORDER_MARK_UTF8))
00346             pos += strLength(BYTE_ORDER_MARK_UTF8);
00347 
00348         tokens.push_back(std::make_pair("<?xml", Token::TK_DECL_BEGIN));
00349         tokens.push_back(std::make_pair("?>",    Token::TK_DECL_END));
00350         tokens.push_back(std::make_pair("</", Token::TK_LESS_SLASH));
00351         tokens.push_back(std::make_pair("/>", Token::TK_SLASH_GREATER));
00352         tokens.push_back(std::make_pair("<" , Token::TK_LESS)); //evaluate after TK_DECL_BEGIN!
00353         tokens.push_back(std::make_pair(">" , Token::TK_GREATER));
00354         tokens.push_back(std::make_pair("=" , Token::TK_EQUAL));
00355         tokens.push_back(std::make_pair("\"", Token::TK_QUOTE));
00356         tokens.push_back(std::make_pair("\'", Token::TK_QUOTE));
00357     }
00358 
00359     Token nextToken() //throw XmlParsingError
00360     {
00361         //skip whitespace
00362         pos = std::find_if(pos, stream_.end(), [](char c) { return !zen::isWhiteSpace(c); });
00363 
00364         if (pos == stream_.end())
00365             return Token::TK_END;
00366 
00367         for (auto it = tokens.begin(); it != tokens.end(); ++it)
00368             if (startsWith(pos, it->first))
00369             {
00370                 pos += it->first.size();
00371                 return it->second;
00372             }
00373 
00374         auto nameEnd = std::find_if(pos, stream_.end(), [](char c)
00375         {
00376             return c == '<'  ||
00377                    c == '>'  ||
00378                    c == '='  ||
00379                    c == '/'  ||
00380                    c == '\'' ||
00381                    c == '\"' ||
00382                    zen::isWhiteSpace(c);
00383         });
00384 
00385         if (nameEnd != pos)
00386         {
00387             std::string name(&*pos, nameEnd - pos);
00388             pos = nameEnd;
00389             return implementation::denormalize(name);
00390         }
00391 
00392         //unknown token
00393         throw XmlParsingError(posRow(), posCol());
00394     }
00395 
00396     std::string extractElementValue()
00397     {
00398         auto it = std::find_if(pos, stream_.end(), [](char c)
00399         {
00400             return c == '<'  ||
00401                    c == '>';
00402         });
00403         std::string output(pos, it);
00404         pos = it;
00405         return implementation::denormalize(output);
00406     }
00407 
00408     std::string extractAttributeValue()
00409     {
00410         auto it = std::find_if(pos, stream_.end(), [](char c)
00411         {
00412             return c == '<'  ||
00413                    c == '>'  ||
00414                    c == '\'' ||
00415                    c == '\"';
00416         });
00417         std::string output(pos, it);
00418         pos = it;
00419         return implementation::denormalize(output);
00420     }
00421 
00422     size_t posRow() const //current row beginning with 0
00423     {
00424         const size_t crSum = std::count(stream_.begin(), pos, '\r'); //carriage returns
00425         const size_t nlSum = std::count(stream_.begin(), pos, '\n'); //new lines
00426         assert(crSum == 0 || nlSum == 0 || crSum == nlSum);
00427         return std::max(crSum, nlSum); //be compatible with Linux/Mac/Win
00428     }
00429 
00430     size_t posCol() const //current col beginning with 0
00431     {
00432         //seek beginning of line
00433         for (auto it = pos; it != stream_.begin(); )
00434         {
00435             --it;
00436             if (*it == '\r' || *it == '\n')
00437                 return pos - it - 1;
00438         }
00439         return pos - stream_.begin();
00440     }
00441 
00442 private:
00443     Scanner(const Scanner&);
00444     Scanner& operator=(const Scanner&);
00445 
00446     bool startsWith(std::string::const_iterator it, const std::string& prefix) const
00447     {
00448         if (stream_.end() - it < static_cast<ptrdiff_t>(prefix.size()))
00449             return false;
00450         return std::equal(prefix.begin(), prefix.end(), it);
00451     }
00452 
00453     typedef std::vector<std::pair<std::string, Token::Type> > TokenList;
00454     TokenList tokens;
00455 
00456     const std::string stream_;
00457     std::string::const_iterator pos;
00458 };
00459 
00460 
00461 class XmlParser
00462 {
00463 public:
00464     XmlParser(const std::string& stream) :
00465         scn(stream),
00466         tk(scn.nextToken()) {}
00467 
00468     void parse(XmlDoc& doc) //throw XmlParsingError
00469     {
00470         //declaration (optional)
00471         if (token().type == Token::TK_DECL_BEGIN)
00472         {
00473             nextToken();
00474 
00475             while (token().type == Token::TK_NAME)
00476             {
00477                 std::string attribName = token().name;
00478                 nextToken();
00479 
00480                 consumeToken(Token::TK_EQUAL);
00481                 expectToken(Token::TK_QUOTE);
00482                 std::string attribValue = scn.extractAttributeValue();
00483                 nextToken();
00484 
00485                 consumeToken(Token::TK_QUOTE);
00486 
00487                 if (attribName == "version")
00488                     doc.setVersion(attribValue);
00489                 else if (attribName == "encoding")
00490                     doc.setEncoding(attribValue);
00491                 else if (attribName == "standalone")
00492                     doc.setStandalone(attribValue);
00493             }
00494             consumeToken(Token::TK_DECL_END);
00495         }
00496 
00497         XmlDoc dummy;
00498         XmlElement& elemTmp = dummy.root();
00499         parseChildElements(elemTmp);
00500 
00501         auto iterPair = elemTmp.getChildren();
00502         if (iterPair.first != iterPair.second)
00503             doc.root().swap(*iterPair.first);
00504 
00505         expectToken(Token::TK_END);
00506     };
00507 
00508 private:
00509     XmlParser(const XmlParser&);
00510     XmlParser& operator=(const XmlParser&);
00511 
00512     void parseChildElements(XmlElement& parent)
00513     {
00514         while (token().type == Token::TK_LESS)
00515         {
00516             nextToken();
00517 
00518             expectToken(Token::TK_NAME);
00519             std::string elementName = token().name;
00520             nextToken();
00521 
00522             XmlElement& newElement = parent.addChild(elementName);
00523 
00524             parseAttributes(newElement);
00525 
00526             if (token().type == Token::TK_SLASH_GREATER) //empty element
00527             {
00528                 nextToken();
00529                 continue;
00530             }
00531 
00532             expectToken(Token::TK_GREATER);
00533             std::string elementValue = scn.extractElementValue();
00534             nextToken();
00535 
00536             //no support for mixed-mode content
00537             if (token().type == Token::TK_LESS) //structured element
00538                 parseChildElements(newElement);
00539             else //value element
00540                 newElement.setValue(elementValue);
00541 
00542             consumeToken(Token::TK_LESS_SLASH);
00543 
00544             if (token().type != Token::TK_NAME ||
00545                 elementName != token().name)
00546                 throw XmlParsingError(scn.posRow(), scn.posCol());
00547             nextToken();
00548 
00549             consumeToken(Token::TK_GREATER);
00550         }
00551     };
00552 
00553     void parseAttributes(XmlElement& element)
00554     {
00555         while (token().type == Token::TK_NAME)
00556         {
00557             std::string attribName = token().name;
00558             nextToken();
00559 
00560             consumeToken(Token::TK_EQUAL);
00561             expectToken(Token::TK_QUOTE);
00562             std::string attribValue = scn.extractAttributeValue();
00563             nextToken();
00564 
00565             consumeToken(Token::TK_QUOTE);
00566             element.setAttribute(attribName, attribValue);
00567         }
00568     }
00569 
00570     const Token& token() const { return tk; }
00571     void nextToken() { tk = scn.nextToken(); }
00572 
00573     void consumeToken(Token::Type t) //throw XmlParsingError
00574     {
00575         expectToken(t); //throw XmlParsingError
00576         nextToken();
00577     }
00578 
00579     void expectToken(Token::Type t) //throw XmlParsingError
00580     {
00581         if (token().type != t)
00582             throw XmlParsingError(scn.posRow(), scn.posCol());
00583     }
00584 
00585     Scanner scn;
00586     Token tk;
00587 };
00588 }
00589 
00590 inline
00591 void parse(const std::string& stream, XmlDoc& doc) //throw XmlParsingError
00592 {
00593     implementation::XmlParser(stream).parse(doc);  //throw XmlParsingError
00594 }
00595 }
00596 
00597 #endif //ZEN_XML_PARSER_HEADER_81248670213764583021432