// zen::Xml
// Simple C++ XML Processing
00001 // ************************************************************************** 00002 // * This file is part of the zen::Xml project. It is distributed under the * 00003 // * Boost Software License: http://www.boost.org/LICENSE_1_0.txt * 00004 // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved * 00005 // ************************************************************************** 00006 00007 #ifndef ZEN_XML_PARSER_HEADER_81248670213764583021432 00008 #define ZEN_XML_PARSER_HEADER_81248670213764583021432 00009 00010 #include <cstdio> 00011 #include <cstddef> //ptrdiff_t; req. on Linux 00012 #include <zen/string_traits.h> 00013 #include "dom.h" 00014 #include "error.h" 00015 00016 namespace zen 00017 { 00023 00024 00030 std::string serialize(const XmlDoc& doc, 00031 const std::string& lineBreak = "\r\n", 00032 const std::string& indent = " "); //throw () 00033 00035 struct XmlParsingError : public XmlError 00036 { 00037 XmlParsingError(size_t rowNo, size_t colNo) : row(rowNo), col(colNo) {} 00039 size_t row; //beginning with 0 00041 size_t col; // 00042 }; 00043 00044 00046 00051 void parse(const std::string& stream, XmlDoc& doc); //throw XmlParsingError 00052 00053 00054 00055 00056 00057 00058 00059 00060 00061 00062 00063 00064 00065 00066 00067 00068 00069 00070 00071 00072 //---------------------------- implementation ---------------------------- 00073 //see: http://www.w3.org/TR/xml/ 00074 00075 namespace implementation 00076 { 00077 inline 00078 std::pair<char, char> hexify(unsigned char c) 00079 { 00080 auto hexifyDigit = [](int num) -> char //input [0, 15], output 0-9, A-F 00081 { 00082 assert(0 <= num&& num <= 15); //guaranteed by design below! 00083 return static_cast<char>(num <= 9 ? //no signed/unsigned char problem here! 
00084 '0' + num : 00085 'A' + (num - 10)); 00086 }; 00087 return std::make_pair(hexifyDigit(c / 16), hexifyDigit(c % 16)); 00088 } 00089 00090 00091 inline 00092 char unhexify(char high, char low) 00093 { 00094 auto unhexifyDigit = [](char hex) -> int //input 0-9, a-f, A-F; output range: [0, 15] 00095 { 00096 if ('0' <= hex && hex <= '9') //no signed/unsigned char problem here! 00097 return hex - '0'; 00098 else if ('A' <= hex && hex <= 'F') 00099 return (hex - 'A') + 10; 00100 else if ('a' <= hex && hex <= 'f') 00101 return (hex - 'a') + 10; 00102 assert(false); 00103 return 0; 00104 }; 00105 return static_cast<unsigned char>(16 * unhexifyDigit(high) + unhexifyDigit(low)); //[!] convert to unsigned char first, then to char (which may be signed) 00106 }; 00107 00108 00109 template <class Predicate> inline 00110 std::string normalize(const std::string& str, Predicate pred) //pred: unary function taking a char, return true if value shall be encoded as hex 00111 { 00112 std::string output; 00113 std::for_each(str.begin(), str.end(), 00114 [&](char c) 00115 { 00116 if (c == '&') // 00117 output += "&"; 00118 else if (c == '<') //normalization mandatory: http://www.w3.org/TR/xml/#syntax 00119 output += "<"; 00120 else if (c == '>') // 00121 output += ">"; 00122 else if (pred(c)) 00123 { 00124 if (c == '\'') 00125 output += "'"; 00126 else if (c == '\"') 00127 output += """; 00128 else 00129 { 00130 output += "&#x"; 00131 const auto hexDigits = hexify(c); //hexify beats "printNumber<std::string>("&#x%02X;", c)" by a nice factor of 3! 
00132 output += hexDigits.first; 00133 output += hexDigits.second; 00134 output += ';'; 00135 } 00136 } 00137 else 00138 output += c; 00139 }); 00140 return output; 00141 } 00142 00143 inline 00144 std::string normalizeName(const std::string& str) 00145 { 00146 return normalize(str, [](char c) { return isWhiteSpace(c) || c == '=' || c == '/' || c == '\'' || c == '\"'; }); 00147 } 00148 00149 inline 00150 std::string normalizeElementValue(const std::string& str) 00151 { 00152 return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32; }); 00153 } 00154 00155 inline 00156 std::string normalizeAttribValue(const std::string& str) 00157 { 00158 return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32 || c == '\'' || c == '\"'; }); 00159 } 00160 00161 00162 template <class CharIterator, size_t N> inline 00163 bool checkEntity(CharIterator& first, CharIterator last, const char (&placeholder)[N]) 00164 { 00165 assert(placeholder[N - 1] == 0); 00166 const ptrdiff_t strLen = N - 1; //don't count null-terminator 00167 if (last - first >= strLen && std::equal(first, first + strLen, placeholder)) 00168 { 00169 first += strLen - 1; 00170 return true; 00171 } 00172 return false; 00173 } 00174 00175 00176 namespace 00177 { 00178 std::string denormalize(const std::string& str) 00179 { 00180 std::string output; 00181 for (auto it = str.begin(); it != str.end(); ++it) 00182 { 00183 const char c = *it; 00184 00185 if (c == '&') 00186 { 00187 if (checkEntity(it, str.end(), "&")) 00188 output += '&'; 00189 else if (checkEntity(it, str.end(), "<")) 00190 output += '<'; 00191 else if (checkEntity(it, str.end(), ">")) 00192 output += '>'; 00193 else if (checkEntity(it, str.end(), "'")) 00194 output += '\''; 00195 else if (checkEntity(it, str.end(), """)) 00196 output += '\"'; 00197 else if (str.end() - it >= 6 && 00198 it[1] == '#' && 00199 it[2] == 'x' && 00200 it[5] == ';') 00201 { 00202 output += unhexify(it[3], it[4]); //unhexify beats 
"::sscanf(&it[3], "%02X", &tmp)" by a factor of 3000 for ~250000 calls!!! 00203 it += 5; 00204 } 00205 else 00206 output += c; //unexpected char! 00207 } 00208 else if (c == '\r') //map all end-of-line characters to \n http://www.w3.org/TR/xml/#sec-line-ends 00209 { 00210 auto itNext = it + 1; 00211 if (itNext != str.end() && *itNext == '\n') 00212 ++it; 00213 output += '\n'; 00214 } 00215 else 00216 output += c; 00217 }; 00218 return output; 00219 } 00220 00221 00222 void serialize(const XmlElement& element, std::string& stream, 00223 const std::string& lineBreak, 00224 const std::string& indent, 00225 size_t indentLevel) 00226 { 00227 const std::string& nameFmt = normalizeName(element.getNameAs<std::string>()); 00228 00229 for (size_t i = 0; i < indentLevel; ++i) 00230 stream += indent; 00231 00232 stream += '<' + nameFmt; 00233 00234 auto attr = element.getAttributes(); 00235 for (auto it = attr.first; it != attr.second; ++it) 00236 stream += ' ' + normalizeName(it->first) + "=\"" + normalizeAttribValue(it->second) + '\"'; 00237 00238 //no support for mixed-mode content 00239 auto iterPair = element.getChildren(); 00240 if (iterPair.first != iterPair.second) //structured element 00241 { 00242 stream += '>' + lineBreak; 00243 00244 std::for_each(iterPair.first, iterPair.second, 00245 [&](const XmlElement & el) { serialize(el, stream, lineBreak, indent, indentLevel + 1); }); 00246 00247 for (size_t i = 0; i < indentLevel; ++i) 00248 stream += indent; 00249 stream += "</" + nameFmt + '>' + lineBreak; 00250 } 00251 else 00252 { 00253 std::string value; 00254 element.getValue(value); 00255 00256 if (!value.empty()) //value element 00257 stream += '>' + normalizeElementValue(value) + "</" + nameFmt + '>' + lineBreak; 00258 else //empty element 00259 stream += "/>" + lineBreak; 00260 } 00261 } 00262 00263 std::string serialize(const XmlDoc& doc, 00264 const std::string& lineBreak, 00265 const std::string& indent) 00266 { 00267 std::string version = 
doc.getVersionAs<std::string>(); 00268 if (!version.empty()) 00269 version = " version=\"" + normalizeAttribValue(version) + '\"'; 00270 00271 std::string encoding = doc.getEncodingAs<std::string>(); 00272 if (!encoding.empty()) 00273 encoding = " encoding=\"" + normalizeAttribValue(encoding) + '\"'; 00274 00275 std::string standalone = doc.getStandaloneAs<std::string>(); 00276 if (!standalone.empty()) 00277 standalone = " standalone=\"" + normalizeAttribValue(standalone) + '\"'; 00278 00279 std::string output = "<?xml" + version + encoding + standalone + "?>" + lineBreak; 00280 serialize(doc.root(), output, lineBreak, indent, 0); 00281 return output; 00282 } 00283 } 00284 } 00285 00286 inline 00287 std::string serialize(const XmlDoc& doc, 00288 const std::string& lineBreak, 00289 const std::string& indent) { return implementation::serialize(doc, lineBreak, indent); } 00290 00291 /* 00292 Grammar for XML parser 00293 ------------------------------- 00294 document-expression: 00295 <?xml version="1.0" encoding="UTF-8" standalone="yes"?> 00296 element-expression: 00297 00298 element-expression: 00299 <string attributes-expression/> 00300 <string attributes-expression> pm-expression </string> 00301 00302 element-list-expression: 00303 <empty> 00304 element-expression element-list-expression 00305 00306 attributes-expression: 00307 <empty> 00308 string="string" attributes-expression 00309 00310 pm-expression: 00311 string 00312 element-list-expression 00313 */ 00314 00315 namespace implementation 00316 { 00317 struct Token 00318 { 00319 enum Type 00320 { 00321 TK_LESS, 00322 TK_GREATER, 00323 TK_LESS_SLASH, 00324 TK_SLASH_GREATER, 00325 TK_EQUAL, 00326 TK_QUOTE, 00327 TK_DECL_BEGIN, 00328 TK_DECL_END, 00329 TK_NAME, 00330 TK_END 00331 }; 00332 00333 Token(Type t) : type(t) {} 00334 Token(const std::string& txt) : type(TK_NAME), name(txt) {} 00335 00336 Type type; 00337 std::string name; //filled if type == TK_NAME 00338 }; 00339 00340 class Scanner 00341 { 00342 
public: 00343 Scanner(const std::string& stream) : stream_(stream), pos(stream_.begin()) 00344 { 00345 if (zen::startsWith(stream_, BYTE_ORDER_MARK_UTF8)) 00346 pos += strLength(BYTE_ORDER_MARK_UTF8); 00347 00348 tokens.push_back(std::make_pair("<?xml", Token::TK_DECL_BEGIN)); 00349 tokens.push_back(std::make_pair("?>", Token::TK_DECL_END)); 00350 tokens.push_back(std::make_pair("</", Token::TK_LESS_SLASH)); 00351 tokens.push_back(std::make_pair("/>", Token::TK_SLASH_GREATER)); 00352 tokens.push_back(std::make_pair("<" , Token::TK_LESS)); //evaluate after TK_DECL_BEGIN! 00353 tokens.push_back(std::make_pair(">" , Token::TK_GREATER)); 00354 tokens.push_back(std::make_pair("=" , Token::TK_EQUAL)); 00355 tokens.push_back(std::make_pair("\"", Token::TK_QUOTE)); 00356 tokens.push_back(std::make_pair("\'", Token::TK_QUOTE)); 00357 } 00358 00359 Token nextToken() //throw XmlParsingError 00360 { 00361 //skip whitespace 00362 pos = std::find_if(pos, stream_.end(), [](char c) { return !zen::isWhiteSpace(c); }); 00363 00364 if (pos == stream_.end()) 00365 return Token::TK_END; 00366 00367 for (auto it = tokens.begin(); it != tokens.end(); ++it) 00368 if (startsWith(pos, it->first)) 00369 { 00370 pos += it->first.size(); 00371 return it->second; 00372 } 00373 00374 auto nameEnd = std::find_if(pos, stream_.end(), [](char c) 00375 { 00376 return c == '<' || 00377 c == '>' || 00378 c == '=' || 00379 c == '/' || 00380 c == '\'' || 00381 c == '\"' || 00382 zen::isWhiteSpace(c); 00383 }); 00384 00385 if (nameEnd != pos) 00386 { 00387 std::string name(&*pos, nameEnd - pos); 00388 pos = nameEnd; 00389 return implementation::denormalize(name); 00390 } 00391 00392 //unknown token 00393 throw XmlParsingError(posRow(), posCol()); 00394 } 00395 00396 std::string extractElementValue() 00397 { 00398 auto it = std::find_if(pos, stream_.end(), [](char c) 00399 { 00400 return c == '<' || 00401 c == '>'; 00402 }); 00403 std::string output(pos, it); 00404 pos = it; 00405 return 
implementation::denormalize(output); 00406 } 00407 00408 std::string extractAttributeValue() 00409 { 00410 auto it = std::find_if(pos, stream_.end(), [](char c) 00411 { 00412 return c == '<' || 00413 c == '>' || 00414 c == '\'' || 00415 c == '\"'; 00416 }); 00417 std::string output(pos, it); 00418 pos = it; 00419 return implementation::denormalize(output); 00420 } 00421 00422 size_t posRow() const //current row beginning with 0 00423 { 00424 const size_t crSum = std::count(stream_.begin(), pos, '\r'); //carriage returns 00425 const size_t nlSum = std::count(stream_.begin(), pos, '\n'); //new lines 00426 assert(crSum == 0 || nlSum == 0 || crSum == nlSum); 00427 return std::max(crSum, nlSum); //be compatible with Linux/Mac/Win 00428 } 00429 00430 size_t posCol() const //current col beginning with 0 00431 { 00432 //seek beginning of line 00433 for (auto it = pos; it != stream_.begin(); ) 00434 { 00435 --it; 00436 if (*it == '\r' || *it == '\n') 00437 return pos - it - 1; 00438 } 00439 return pos - stream_.begin(); 00440 } 00441 00442 private: 00443 Scanner(const Scanner&); 00444 Scanner& operator=(const Scanner&); 00445 00446 bool startsWith(std::string::const_iterator it, const std::string& prefix) const 00447 { 00448 if (stream_.end() - it < static_cast<ptrdiff_t>(prefix.size())) 00449 return false; 00450 return std::equal(prefix.begin(), prefix.end(), it); 00451 } 00452 00453 typedef std::vector<std::pair<std::string, Token::Type> > TokenList; 00454 TokenList tokens; 00455 00456 const std::string stream_; 00457 std::string::const_iterator pos; 00458 }; 00459 00460 00461 class XmlParser 00462 { 00463 public: 00464 XmlParser(const std::string& stream) : 00465 scn(stream), 00466 tk(scn.nextToken()) {} 00467 00468 void parse(XmlDoc& doc) //throw XmlParsingError 00469 { 00470 //declaration (optional) 00471 if (token().type == Token::TK_DECL_BEGIN) 00472 { 00473 nextToken(); 00474 00475 while (token().type == Token::TK_NAME) 00476 { 00477 std::string attribName = 
token().name; 00478 nextToken(); 00479 00480 consumeToken(Token::TK_EQUAL); 00481 expectToken(Token::TK_QUOTE); 00482 std::string attribValue = scn.extractAttributeValue(); 00483 nextToken(); 00484 00485 consumeToken(Token::TK_QUOTE); 00486 00487 if (attribName == "version") 00488 doc.setVersion(attribValue); 00489 else if (attribName == "encoding") 00490 doc.setEncoding(attribValue); 00491 else if (attribName == "standalone") 00492 doc.setStandalone(attribValue); 00493 } 00494 consumeToken(Token::TK_DECL_END); 00495 } 00496 00497 XmlDoc dummy; 00498 XmlElement& elemTmp = dummy.root(); 00499 parseChildElements(elemTmp); 00500 00501 auto iterPair = elemTmp.getChildren(); 00502 if (iterPair.first != iterPair.second) 00503 doc.root().swap(*iterPair.first); 00504 00505 expectToken(Token::TK_END); 00506 }; 00507 00508 private: 00509 XmlParser(const XmlParser&); 00510 XmlParser& operator=(const XmlParser&); 00511 00512 void parseChildElements(XmlElement& parent) 00513 { 00514 while (token().type == Token::TK_LESS) 00515 { 00516 nextToken(); 00517 00518 expectToken(Token::TK_NAME); 00519 std::string elementName = token().name; 00520 nextToken(); 00521 00522 XmlElement& newElement = parent.addChild(elementName); 00523 00524 parseAttributes(newElement); 00525 00526 if (token().type == Token::TK_SLASH_GREATER) //empty element 00527 { 00528 nextToken(); 00529 continue; 00530 } 00531 00532 expectToken(Token::TK_GREATER); 00533 std::string elementValue = scn.extractElementValue(); 00534 nextToken(); 00535 00536 //no support for mixed-mode content 00537 if (token().type == Token::TK_LESS) //structured element 00538 parseChildElements(newElement); 00539 else //value element 00540 newElement.setValue(elementValue); 00541 00542 consumeToken(Token::TK_LESS_SLASH); 00543 00544 if (token().type != Token::TK_NAME || 00545 elementName != token().name) 00546 throw XmlParsingError(scn.posRow(), scn.posCol()); 00547 nextToken(); 00548 00549 consumeToken(Token::TK_GREATER); 00550 } 00551 }; 
00552 00553 void parseAttributes(XmlElement& element) 00554 { 00555 while (token().type == Token::TK_NAME) 00556 { 00557 std::string attribName = token().name; 00558 nextToken(); 00559 00560 consumeToken(Token::TK_EQUAL); 00561 expectToken(Token::TK_QUOTE); 00562 std::string attribValue = scn.extractAttributeValue(); 00563 nextToken(); 00564 00565 consumeToken(Token::TK_QUOTE); 00566 element.setAttribute(attribName, attribValue); 00567 } 00568 } 00569 00570 const Token& token() const { return tk; } 00571 void nextToken() { tk = scn.nextToken(); } 00572 00573 void consumeToken(Token::Type t) //throw XmlParsingError 00574 { 00575 expectToken(t); //throw XmlParsingError 00576 nextToken(); 00577 } 00578 00579 void expectToken(Token::Type t) //throw XmlParsingError 00580 { 00581 if (token().type != t) 00582 throw XmlParsingError(scn.posRow(), scn.posCol()); 00583 } 00584 00585 Scanner scn; 00586 Token tk; 00587 }; 00588 } 00589 00590 inline 00591 void parse(const std::string& stream, XmlDoc& doc) //throw XmlParsingError 00592 { 00593 implementation::XmlParser(stream).parse(doc); //throw XmlParsingError 00594 } 00595 } 00596 00597 #endif //ZEN_XML_PARSER_HEADER_81248670213764583021432