zen::Xml
Simple C++ XML Processing
 All Classes Namespaces Functions Variables Pages
parser.h
1 // **************************************************************************
2 // * This file is part of the FreeFileSync project. It is distributed under *
3 // * GNU General Public License: http://www.gnu.org/licenses/gpl.html *
4 // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved *
5 // **************************************************************************
6 
7 #ifndef ZEN_XML_PARSER_HEADER_81248670213764583021432
8 #define ZEN_XML_PARSER_HEADER_81248670213764583021432
9 
10 #include <cstdio>
11 #include <cstddef> //ptrdiff_t; req. on Linux
12 #include <zen/string_traits.h>
13 #include "dom.h"
14 #include "error.h"
15 
16 namespace zen
17 {
23 
/// Save XML document as a byte stream.
/// \param doc Input XML document
/// \param lineBreak Line break sequence appended after each serialized line
/// \param indent Text written once per nesting level in front of each element
/// \returns The document serialized as a string
30 std::string serialize(const XmlDoc& doc,
31  const std::string& lineBreak = "\r\n",
32  const std::string& indent = " "); //throw ()
33 
35 struct XmlParsingError : public XmlError
36 {
37  XmlParsingError(size_t rowNo, size_t colNo) : row(rowNo), col(colNo) {}
39  size_t row; //beginning with 0
41  size_t col; //
42 };
43 
44 
46 
/// Load XML document from a byte stream.
/// \param stream Input byte stream containing the XML text
/// \returns The parsed XML document
/// \throw XmlParsingError
51 XmlDoc parse(const std::string& stream); //throw XmlParsingError
52 
53 
54 
55 
56 
57 
58 
59 
60 
61 
62 
63 
64 
65 
66 
67 
68 
69 
70 
71 
72 //---------------------------- implementation ----------------------------
73 //see: http://www.w3.org/TR/xml/
74 
75 namespace implementation
76 {
/// Split a byte into its two upper-case hexadecimal digits.
/// \param c Byte to convert
/// \returns Pair (high digit, low digit), each out of 0-9, A-F
inline
std::pair<char, char> hexify(unsigned char c)
{
    //maps a nibble out of [0, 15] to its hex character
    auto nibbleToChar = [](int nibble) -> char
    {
        assert(0 <= nibble && nibble <= 15); //callers below only pass c / 16 and c % 16
        if (nibble <= 9)
            return static_cast<char>('0' + nibble); //plain ASCII range: no signed/unsigned char problem
        return static_cast<char>('A' + (nibble - 10));
    };

    const char highDigit = nibbleToChar(c / 16);
    const char lowDigit  = nibbleToChar(c % 16);
    return std::make_pair(highDigit, lowDigit);
}
89 
90 
/// Combine two hexadecimal digits into the byte they encode.
/// \param high Most significant digit (0-9, a-f, A-F)
/// \param low Least significant digit (0-9, a-f, A-F)
/// \returns The decoded byte; a non-hex digit asserts in debug builds and counts as 0 otherwise
inline
char unhexify(char high, char low)
{
    //maps a hex character to its value out of [0, 15]
    auto digitValue = [](char hex) -> int
    {
        if ('0' <= hex && hex <= '9') //plain ASCII range: no signed/unsigned char problem
            return hex - '0';
        if ('A' <= hex && hex <= 'F')
            return 10 + (hex - 'A');
        if ('a' <= hex && hex <= 'f')
            return 10 + (hex - 'a');

        assert(false);
        return 0;
    };

    const int byteValue = 16 * digitValue(high) + digitValue(low);
    //[!] go through unsigned char first, then to char (which may be signed)
    return static_cast<unsigned char>(byteValue);
}
107 
108 
109 template <class Predicate> inline
110 std::string normalize(const std::string& str, Predicate pred) //pred: unary function taking a char, return true if value shall be encoded as hex
111 {
112  std::string output;
113  std::for_each(str.begin(), str.end(),
114  [&](char c)
115  {
116  if (c == '&') //
117  output += "&amp;";
118  else if (c == '<') //normalization mandatory: http://www.w3.org/TR/xml/#syntax
119  output += "&lt;";
120  else if (c == '>') //
121  output += "&gt;";
122  else if (pred(c))
123  {
124  if (c == '\'')
125  output += "&apos;";
126  else if (c == '\"')
127  output += "&quot;";
128  else
129  {
130  output += "&#x";
131  const auto hexDigits = hexify(c); //hexify beats "printNumber<std::string>("&#x%02X;", c)" by a nice factor of 3!
132  output += hexDigits.first;
133  output += hexDigits.second;
134  output += ';';
135  }
136  }
137  else
138  output += c;
139  });
140  return output;
141 }
142 
143 inline
144 std::string normalizeName(const std::string& str)
145 {
146  return normalize(str, [](char c) { return isWhiteSpace(c) || c == '=' || c == '/' || c == '\'' || c == '\"'; });
147 }
148 
149 inline
150 std::string normalizeElementValue(const std::string& str)
151 {
152  return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32; });
153 }
154 
155 inline
156 std::string normalizeAttribValue(const std::string& str)
157 {
158  return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32 || c == '\'' || c == '\"'; });
159 }
160 
161 
/// Test whether the range [first, last) begins with the given entity literal.
/// \param first In/out: on a match it is advanced to the LAST character of the entity
///              (not one past it), so that a caller's loop increment steps over it;
///              left untouched otherwise
/// \param last End of the input range
/// \param placeholder Null-terminated entity literal, e.g. "&amp;"
/// \returns true if the entity was found at first
template <class CharIterator, size_t N> inline
bool checkEntity(CharIterator& first, CharIterator last, const char (&placeholder)[N])
{
    assert(placeholder[N - 1] == 0);
    const ptrdiff_t entityLen = N - 1; //don't count null-terminator

    if (last - first < entityLen) //not enough input left
        return false;

    if (!std::equal(first, first + entityLen, placeholder))
        return false;

    first += entityLen - 1;
    return true;
}
174 
175 
176 namespace
177 {
178 std::string denormalize(const std::string& str)
179 {
180  std::string output;
181  for (auto it = str.begin(); it != str.end(); ++it)
182  {
183  const char c = *it;
184 
185  if (c == '&')
186  {
187  if (checkEntity(it, str.end(), "&amp;"))
188  output += '&';
189  else if (checkEntity(it, str.end(), "&lt;"))
190  output += '<';
191  else if (checkEntity(it, str.end(), "&gt;"))
192  output += '>';
193  else if (checkEntity(it, str.end(), "&apos;"))
194  output += '\'';
195  else if (checkEntity(it, str.end(), "&quot;"))
196  output += '\"';
197  else if (str.end() - it >= 6 &&
198  it[1] == '#' &&
199  it[2] == 'x' &&
200  it[5] == ';')
201  {
202  output += unhexify(it[3], it[4]); //unhexify beats "::sscanf(&it[3], "%02X", &tmp)" by a factor of 3000 for ~250000 calls!!!
203  it += 5;
204  }
205  else
206  output += c; //unexpected char!
207  }
208  else if (c == '\r') //map all end-of-line characters to \n http://www.w3.org/TR/xml/#sec-line-ends
209  {
210  auto itNext = it + 1;
211  if (itNext != str.end() && *itNext == '\n')
212  ++it;
213  output += '\n';
214  }
215  else
216  output += c;
217  };
218  return output;
219 }
220 
221 
222 void serialize(const XmlElement& element, std::string& stream,
223  const std::string& lineBreak,
224  const std::string& indent,
225  size_t indentLevel)
226 {
227  const std::string& nameFmt = normalizeName(element.getNameAs<std::string>());
228 
229  for (size_t i = 0; i < indentLevel; ++i)
230  stream += indent;
231 
232  stream += '<' + nameFmt;
233 
234  auto attr = element.getAttributes();
235  for (auto it = attr.first; it != attr.second; ++it)
236  stream += ' ' + normalizeName(it->first) + "=\"" + normalizeAttribValue(it->second) + '\"';
237 
238  //no support for mixed-mode content
239  auto iterPair = element.getChildren();
240  if (iterPair.first != iterPair.second) //structured element
241  {
242  stream += '>' + lineBreak;
243 
244  std::for_each(iterPair.first, iterPair.second,
245  [&](const XmlElement & el) { serialize(el, stream, lineBreak, indent, indentLevel + 1); });
246 
247  for (size_t i = 0; i < indentLevel; ++i)
248  stream += indent;
249  stream += "</" + nameFmt + '>' + lineBreak;
250  }
251  else
252  {
253  std::string value;
254  element.getValue(value);
255 
256  if (!value.empty()) //value element
257  stream += '>' + normalizeElementValue(value) + "</" + nameFmt + '>' + lineBreak;
258  else //empty element
259  stream += "/>" + lineBreak;
260  }
261 }
262 
263 std::string serialize(const XmlDoc& doc,
264  const std::string& lineBreak,
265  const std::string& indent)
266 {
267  std::string version = doc.getVersionAs<std::string>();
268  if (!version.empty())
269  version = " version=\"" + normalizeAttribValue(version) + '\"';
270 
271  std::string encoding = doc.getEncodingAs<std::string>();
272  if (!encoding.empty())
273  encoding = " encoding=\"" + normalizeAttribValue(encoding) + '\"';
274 
275  std::string standalone = doc.getStandaloneAs<std::string>();
276  if (!standalone.empty())
277  standalone = " standalone=\"" + normalizeAttribValue(standalone) + '\"';
278 
279  std::string output = "<?xml" + version + encoding + standalone + "?>" + lineBreak;
280  serialize(doc.root(), output, lineBreak, indent, 0);
281  return output;
282 }
283 }
284 }
285 
286 inline
287 std::string serialize(const XmlDoc& doc,
288  const std::string& lineBreak,
289  const std::string& indent) { return implementation::serialize(doc, lineBreak, indent); }
290 
291 /*
292 Grammar for XML parser
293 -------------------------------
294 document-expression:
295  <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
296  element-expression
297 
298 element-expression:
299  <string attributes-expression/>
300  <string attributes-expression> pm-expression </string>
301 
302 element-list-expression:
303  <empty>
304  element-expression element-list-expression
305 
306 attributes-expression:
307  <empty>
308  string="string" attributes-expression
309 
310 pm-expression:
311  string
312  element-list-expression
313 */
314 
315 namespace implementation
316 {
/// A lexical unit produced by the scanner.
struct Token
{
    enum Type
    {
        TK_LESS,          // <
        TK_GREATER,       // >
        TK_LESS_SLASH,    // </
        TK_SLASH_GREATER, // />
        TK_EQUAL,         // =
        TK_QUOTE,         // " or '
        TK_DECL_BEGIN,    // <?xml
        TK_DECL_END,      // ?>
        TK_NAME,          //any other text, see "name"
        TK_END            //end of input
    };

    //both constructors are implicit on purpose: a Type or a std::string converts to a Token
    Token(Type t) :
        type(t),
        name()
    {
    }

    Token(const std::string& txt) :
        type(TK_NAME),
        name(txt)
    {
    }

    Type type;
    std::string name; //filled if type == TK_NAME
};
339 
340 class Scanner
341 {
342 public:
343  Scanner(const std::string& stream) :
344  xmlCommentBegin("<!--"),
345  xmlCommentEnd ("-->"),
346  stream_(stream),
347  pos(stream_.begin())
348  {
349  if (zen::startsWith(stream_, BYTE_ORDER_MARK_UTF8))
350  pos += strLength(BYTE_ORDER_MARK_UTF8);
351 
352  tokens.push_back(std::make_pair("<?xml", Token::TK_DECL_BEGIN));
353  tokens.push_back(std::make_pair("?>", Token::TK_DECL_END));
354  tokens.push_back(std::make_pair("</", Token::TK_LESS_SLASH));
355  tokens.push_back(std::make_pair("/>", Token::TK_SLASH_GREATER));
356  tokens.push_back(std::make_pair("<" , Token::TK_LESS)); //evaluate after TK_DECL_BEGIN!
357  tokens.push_back(std::make_pair(">" , Token::TK_GREATER));
358  tokens.push_back(std::make_pair("=" , Token::TK_EQUAL));
359  tokens.push_back(std::make_pair("\"", Token::TK_QUOTE));
360  tokens.push_back(std::make_pair("\'", Token::TK_QUOTE));
361  }
362 
363  Token nextToken() //throw XmlParsingError
364  {
365  //skip whitespace
366  pos = std::find_if(pos, stream_.end(), [](char c) { return !zen::isWhiteSpace(c); });
367 
368  if (pos == stream_.end())
369  return Token::TK_END;
370 
371  //skip XML comments
372  if (startsWith(xmlCommentBegin))
373  {
374  auto it = std::search(pos + xmlCommentBegin.size(), stream_.end(), xmlCommentEnd.begin(), xmlCommentEnd.end());
375  if (it != stream_.end())
376  {
377  pos = it + xmlCommentEnd.size();
378  return nextToken();
379  }
380  }
381 
382  for (auto it = tokens.begin(); it != tokens.end(); ++it)
383  if (startsWith(it->first))
384  {
385  pos += it->first.size();
386  return it->second;
387  }
388 
389  auto nameEnd = std::find_if(pos, stream_.end(), [](char c)
390  {
391  return c == '<' ||
392  c == '>' ||
393  c == '=' ||
394  c == '/' ||
395  c == '\'' ||
396  c == '\"' ||
397  zen::isWhiteSpace(c);
398  });
399 
400  if (nameEnd != pos)
401  {
402  std::string name(&*pos, nameEnd - pos);
403  pos = nameEnd;
404  return implementation::denormalize(name);
405  }
406 
407  //unknown token
408  throw XmlParsingError(posRow(), posCol());
409  }
410 
411  std::string extractElementValue()
412  {
413  auto it = std::find_if(pos, stream_.end(), [](char c)
414  {
415  return c == '<' ||
416  c == '>';
417  });
418  std::string output(pos, it);
419  pos = it;
420  return implementation::denormalize(output);
421  }
422 
423  std::string extractAttributeValue()
424  {
425  auto it = std::find_if(pos, stream_.end(), [](char c)
426  {
427  return c == '<' ||
428  c == '>' ||
429  c == '\'' ||
430  c == '\"';
431  });
432  std::string output(pos, it);
433  pos = it;
434  return implementation::denormalize(output);
435  }
436 
437  size_t posRow() const //current row beginning with 0
438  {
439  const size_t crSum = std::count(stream_.begin(), pos, '\r'); //carriage returns
440  const size_t nlSum = std::count(stream_.begin(), pos, '\n'); //new lines
441  assert(crSum == 0 || nlSum == 0 || crSum == nlSum);
442  return std::max(crSum, nlSum); //be compatible with Linux/Mac/Win
443  }
444 
445  size_t posCol() const //current col beginning with 0
446  {
447  //seek beginning of line
448  for (auto it = pos; it != stream_.begin(); )
449  {
450  --it;
451  if (*it == '\r' || *it == '\n')
452  return pos - it - 1;
453  }
454  return pos - stream_.begin();
455  }
456 
457 private:
458  Scanner(const Scanner&);
459  Scanner& operator=(const Scanner&);
460 
461  bool startsWith(const std::string& prefix) const
462  {
463  if (stream_.end() - pos < static_cast<ptrdiff_t>(prefix.size()))
464  return false;
465  return std::equal(prefix.begin(), prefix.end(), pos);
466  }
467 
468  typedef std::vector<std::pair<std::string, Token::Type> > TokenList;
469  TokenList tokens;
470 
471  const std::string xmlCommentBegin;
472  const std::string xmlCommentEnd;
473 
474  const std::string stream_;
475  std::string::const_iterator pos;
476 };
477 
478 
479 class XmlParser
480 {
481 public:
482  XmlParser(const std::string& stream) :
483  scn(stream),
484  tk(scn.nextToken()) {}
485 
486  XmlDoc parse() //throw XmlParsingError
487  {
488  XmlDoc doc;
489 
490  //declaration (optional)
491  if (token().type == Token::TK_DECL_BEGIN)
492  {
493  nextToken();
494 
495  while (token().type == Token::TK_NAME)
496  {
497  std::string attribName = token().name;
498  nextToken();
499 
500  consumeToken(Token::TK_EQUAL);
501  expectToken(Token::TK_QUOTE);
502  std::string attribValue = scn.extractAttributeValue();
503  nextToken();
504 
505  consumeToken(Token::TK_QUOTE);
506 
507  if (attribName == "version")
508  doc.setVersion(attribValue);
509  else if (attribName == "encoding")
510  doc.setEncoding(attribValue);
511  else if (attribName == "standalone")
512  doc.setStandalone(attribValue);
513  }
514  consumeToken(Token::TK_DECL_END);
515  }
516 
517  XmlDoc dummy;
518  XmlElement& elemTmp = dummy.root();
519  parseChildElements(elemTmp);
520 
521  auto iterPair = elemTmp.getChildren();
522  if (iterPair.first != iterPair.second)
523  doc.root().swap(*iterPair.first);
524 
525  expectToken(Token::TK_END);
526  return doc;
527  };
528 
529 private:
530  XmlParser(const XmlParser&);
531  XmlParser& operator=(const XmlParser&);
532 
533  void parseChildElements(XmlElement& parent)
534  {
535  while (token().type == Token::TK_LESS)
536  {
537  nextToken();
538 
539  expectToken(Token::TK_NAME);
540  std::string elementName = token().name;
541  nextToken();
542 
543  XmlElement& newElement = parent.addChild(elementName);
544 
545  parseAttributes(newElement);
546 
547  if (token().type == Token::TK_SLASH_GREATER) //empty element
548  {
549  nextToken();
550  continue;
551  }
552 
553  expectToken(Token::TK_GREATER);
554  std::string elementValue = scn.extractElementValue();
555  nextToken();
556 
557  //no support for mixed-mode content
558  if (token().type == Token::TK_LESS) //structured element
559  parseChildElements(newElement);
560  else //value element
561  newElement.setValue(elementValue);
562 
563  consumeToken(Token::TK_LESS_SLASH);
564 
565  if (token().type != Token::TK_NAME ||
566  elementName != token().name)
567  throw XmlParsingError(scn.posRow(), scn.posCol());
568  nextToken();
569 
570  consumeToken(Token::TK_GREATER);
571  }
572  };
573 
574  void parseAttributes(XmlElement& element)
575  {
576  while (token().type == Token::TK_NAME)
577  {
578  std::string attribName = token().name;
579  nextToken();
580 
581  consumeToken(Token::TK_EQUAL);
582  expectToken(Token::TK_QUOTE);
583  std::string attribValue = scn.extractAttributeValue();
584  nextToken();
585 
586  consumeToken(Token::TK_QUOTE);
587  element.setAttribute(attribName, attribValue);
588  }
589  }
590 
591  const Token& token() const { return tk; }
592  void nextToken() { tk = scn.nextToken(); }
593 
594  void consumeToken(Token::Type t) //throw XmlParsingError
595  {
596  expectToken(t); //throw XmlParsingError
597  nextToken();
598  }
599 
600  void expectToken(Token::Type t) //throw XmlParsingError
601  {
602  if (token().type != t)
603  throw XmlParsingError(scn.posRow(), scn.posCol());
604  }
605 
606  Scanner scn;
607  Token tk;
608 };
609 }
610 
611 inline
612 XmlDoc parse(const std::string& stream) //throw XmlParsingError
613 {
614  return implementation::XmlParser(stream).parse(); //throw XmlParsingError
615 }
616 }
617 
618 #endif //ZEN_XML_PARSER_HEADER_81248670213764583021432
XmlDoc parse(const std::string &stream)
Load XML document from a byte stream.
Definition: parser.h:612
std::string serialize(const XmlDoc &doc, const std::string &lineBreak="\r\n", const std::string &indent=" ")
Save XML document as a byte stream.
Definition: parser.h:287
size_t row
Input file row where the parsing error occurred (zero-based)
Definition: parser.h:39
The complete XML document.
Definition: dom.h:249
size_t col
Input file column where the parsing error occurred (zero-based)
Definition: parser.h:41
Exception thrown due to an XML parsing error.
Definition: parser.h:35
Exception base class for zen::Xml.
Definition: error.h:13