summaryrefslogtreecommitdiff
path: root/lib/parse_lng.h
diff options
context:
space:
mode:
authorDaniel Wilhelm <daniel@wili.li>2014-04-18 17:15:16 +0200
committerDaniel Wilhelm <daniel@wili.li>2014-04-18 17:15:16 +0200
commitbd6336c629841c6db3a6ca53a936d629d34db53b (patch)
tree3721ef997864108df175ce677a8a7d4342a6f1d2 /lib/parse_lng.h
parent4.0 (diff)
downloadFreeFileSync-bd6336c629841c6db3a6ca53a936d629d34db53b.tar.gz
FreeFileSync-bd6336c629841c6db3a6ca53a936d629d34db53b.tar.bz2
FreeFileSync-bd6336c629841c6db3a6ca53a936d629d34db53b.zip
4.1
Diffstat (limited to 'lib/parse_lng.h')
-rw-r--r--lib/parse_lng.h608
1 files changed, 608 insertions, 0 deletions
diff --git a/lib/parse_lng.h b/lib/parse_lng.h
new file mode 100644
index 00000000..811a3181
--- /dev/null
+++ b/lib/parse_lng.h
@@ -0,0 +1,608 @@
+// **************************************************************************
+// * This file is part of the FreeFileSync project. It is distributed under *
+// * GNU General Public License: http://www.gnu.org/licenses/gpl.html *
+// * Copyright (C) 2008-2011 ZenJu (zhnmju123 AT gmx.de) *
+// **************************************************************************
+
+#ifndef PARSE_LNG_HEADER_INCLUDED
+#define PARSE_LNG_HEADER_INCLUDED
+
+#include <algorithm>
+#include <cctype>
+#include <functional>
+#include <list>
+#include <map>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include <zen/utf8.h>
+
+namespace lngfile
+{
+//singular forms: one source text maps to exactly one translated text
+typedef std::map <std::string, std::string> TranslationMap; //orig |-> translation
+
+//plural forms: a (singular, plural) source pair maps to a list of translated forms
+typedef std::pair<std::string, std::string> SingularPluralPair; //source pair, e.g.: 1 house| n houses
+typedef std::vector<std::string> PluralForms; //translated forms, e.g.: 1 dom | 2 domy | 5 domów
+typedef std::map <SingularPluralPair, PluralForms> TranslationPluralMap; //(sing/plu) |-> pluralforms
+
+struct TransHeader //content of the <header> section of a language file: read by LngParser::parseHeader(), written by generateLng()
+{
+ std::string languageName; //display name: "English (UK)"
+ std::string translatorName; //"ZenJu"
+ std::string localeName; //ISO 639 language code + ISO 3166 country code, e.g. "en_GB", or "en_US"
+ std::string flagFile; //"england.png"
+ int pluralCount; //2; number of forms every translated plural item must have (checked in LngParser::parsePlural()); NOTE: no constructor => not initialized by this struct
+ std::string pluralDefinition; //"n == 1 ? 0 : 1"
+};
+
+
+struct ParsingError //thrown by LngParser when an unexpected token or a wrong number of plural forms is found
+{
+ ParsingError(size_t rowNo, size_t colNo) : row(rowNo), col(colNo) {}
+ size_t row; //at all throw sites in this file: Scanner::position().first
+ size_t col; //at all throw sites in this file: Scanner::position().second
+};
+void parseLng(const std::string& fileStream, TransHeader& header, TranslationMap& out, TranslationPluralMap& pluralOut); //throw ParsingError; parses header and all items contained in "fileStream", both maps are cleared first
+void parseHeader(const std::string& fileStream, TransHeader& header); //throw ParsingError; parses the header section only
+
+class TranslationList; //unordered list of unique translation items
+void generateLng(const TranslationList& in, const TransHeader& header, std::string& fileStream); //appends serialized header and items to "fileStream"; throw std::logic_error
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+//--------------------------- implementation ---------------------------
+class TranslationList //unordered list of unique translation items
+{
+public:
+ void addItem(const std::string& orig, const std::string& trans) //add a regular item; silently ignored if "orig" has been added before (first one wins)
+ {
+ if (!transUnique.insert(orig).second) return; //"orig" is already known
+
+ dump.push_back(RegularItem(std::make_pair(orig, trans)));
+ sequence.push_back(&dump.back()); //record insertion order across regular and plural items
+ }
+ void addPluralItem(const SingularPluralPair& orig, const PluralForms& trans) //add a plural item; silently ignored if "orig" has been added before (first one wins)
+ {
+ if (!pluralUnique.insert(orig).second) return; //"orig" is already known
+
+ dumpPlural.push_back(PluralItem(std::make_pair(orig, trans)));
+ sequence.push_back(&dumpPlural.back()); //record insertion order across regular and plural items
+ }
+
+ bool untranslatedTextExists() const //true if at least one regular item has an empty translation or one plural item has an empty list of forms
+ {
+ for (std::list<RegularItem>::const_iterator i = dump.begin(); i != dump.end(); ++i)
+ if (i->value.second.empty()) //empty translation string
+ return true;
+ for (std::list<PluralItem>::const_iterator i = dumpPlural.begin(); i != dumpPlural.end(); ++i)
+ if (i->value.second.empty()) //no plural forms at all
+ return true;
+ return false;
+ }
+
+private:
+ friend void generateLng(const TranslationList& in, const TransHeader& header, std::string& fileStream); //reads "sequence" and the nested item types
+
+ struct Item {virtual ~Item() {} }; //polymorphic base: generateLng() tells the item kinds apart via dynamic_cast
+ struct RegularItem : public Item { RegularItem(const TranslationMap ::value_type& val) : value(val) {} TranslationMap ::value_type value; }; //value: (orig, translation)
+ struct PluralItem : public Item { PluralItem (const TranslationPluralMap::value_type& val) : value(val) {} TranslationPluralMap::value_type value; }; //value: ((singular, plural), forms)
+
+ std::vector<Item*> sequence; //dynamic list of translation elements in insertion order; points into "dump"/"dumpPlural" - NOTE(review): an implicitly generated copy of this class would still point at the source object's items
+ std::list<RegularItem> dump; //manage memory: std::list keeps element addresses valid on push_back
+ std::list<PluralItem> dumpPlural; //manage memory: std::list keeps element addresses valid on push_back
+
+ std::set<TranslationMap ::key_type> transUnique; //check uniqueness
+ std::set<TranslationPluralMap::key_type> pluralUnique; //check uniqueness of plural items
+};
+
+
+struct Token //one lexical element as returned by Scanner::nextToken()
+{
+ enum Type
+ {
+ //header information
+ TK_HEADER_BEGIN, //"<header>"
+ TK_HEADER_END, //"</header>"
+ TK_LANG_NAME_BEGIN, //"<language name>"
+ TK_LANG_NAME_END, //"</language name>"
+ TK_TRANS_NAME_BEGIN, //"<translator>"
+ TK_TRANS_NAME_END, //"</translator>"
+ TK_LOCALE_NAME_BEGIN, //"<locale>"
+ TK_LOCALE_NAME_END, //"</locale>"
+ TK_FLAG_FILE_BEGIN, //"<flag file>"
+ TK_FLAG_FILE_END, //"</flag file>"
+ TK_PLURAL_COUNT_BEGIN, //"<plural forms>"
+ TK_PLURAL_COUNT_END, //"</plural forms>"
+ TK_PLURAL_DEF_BEGIN, //"<plural definition>"
+ TK_PLURAL_DEF_END, //"</plural definition>"
+
+ //item level
+ TK_SRC_BEGIN, //"<source>"
+ TK_SRC_END, //"</source>"
+ TK_TRG_BEGIN, //"<target>"
+ TK_TRG_END, //"</target>"
+ TK_TEXT, //free text between tags: no entry in KnownTokens
+ TK_PLURAL_BEGIN, //"<pluralform>"
+ TK_PLURAL_END, //"</pluralform>"
+ TK_END //end of input: no entry in KnownTokens
+ };
+
+ Token(Type t) : type(t) {} //"text" starts out empty
+ Type type;
+
+ std::string text; //filled by Scanner::nextToken() for TK_TEXT only
+};
+
+
+class KnownTokens //maps every tag-like token type to its literal text in a language file
+{
+public:
+ typedef std::map<Token::Type, std::string> TokenMap;
+
+ static const TokenMap& asList() //the map of all known tags; built once on first call (function-local static)
+ {
+ static KnownTokens inst;
+ return inst.tokens;
+ }
+
+ static std::string text(Token::Type t) //literal text of "t" or an empty string if "t" has no entry (TK_TEXT, TK_END)
+ {
+ TokenMap::const_iterator iter = asList().find(t);
+ return iter != asList().end() ? iter->second : std::string();
+ }
+
+private:
+ KnownTokens() //private: the only instance is the function-local static in asList()
+ {
+ //header information
+ tokens.insert(std::make_pair(Token::TK_HEADER_BEGIN, "<header>"));
+ tokens.insert(std::make_pair(Token::TK_HEADER_END, "</header>"));
+ tokens.insert(std::make_pair(Token::TK_LANG_NAME_BEGIN, "<language name>"));
+ tokens.insert(std::make_pair(Token::TK_LANG_NAME_END, "</language name>"));
+ tokens.insert(std::make_pair(Token::TK_TRANS_NAME_BEGIN, "<translator>"));
+ tokens.insert(std::make_pair(Token::TK_TRANS_NAME_END, "</translator>"));
+ tokens.insert(std::make_pair(Token::TK_LOCALE_NAME_BEGIN, "<locale>"));
+ tokens.insert(std::make_pair(Token::TK_LOCALE_NAME_END, "</locale>"));
+ tokens.insert(std::make_pair(Token::TK_FLAG_FILE_BEGIN, "<flag file>"));
+ tokens.insert(std::make_pair(Token::TK_FLAG_FILE_END, "</flag file>"));
+ tokens.insert(std::make_pair(Token::TK_PLURAL_COUNT_BEGIN, "<plural forms>"));
+ tokens.insert(std::make_pair(Token::TK_PLURAL_COUNT_END, "</plural forms>"));
+ tokens.insert(std::make_pair(Token::TK_PLURAL_DEF_BEGIN, "<plural definition>"));
+ tokens.insert(std::make_pair(Token::TK_PLURAL_DEF_END, "</plural definition>"));
+
+ //item level
+ tokens.insert(std::make_pair(Token::TK_SRC_BEGIN, "<source>"));
+ tokens.insert(std::make_pair(Token::TK_SRC_END, "</source>"));
+ tokens.insert(std::make_pair(Token::TK_TRG_BEGIN, "<target>"));
+ tokens.insert(std::make_pair(Token::TK_TRG_END, "</target>"));
+ tokens.insert(std::make_pair(Token::TK_PLURAL_BEGIN, "<pluralform>"));
+ tokens.insert(std::make_pair(Token::TK_PLURAL_END, "</pluralform>"));
+ }
+ TokenMap tokens; //token type |-> literal tag text
+};
+
+struct IsWhiteSpace : public std::unary_function<char, bool> //adaptable predicate (required by std::not1): true for ASCII whitespace only; NOTE(review): std::unary_function/std::not1 are deprecated in later C++ standards
+{
+ bool operator()(char c) const
+ {
+ const unsigned char usc = c; //caveat 1: std::isspace() takes an int, but expects an unsigned char
+ return usc < 128 && //caveat 2: some parts of UTF-8 chars are erroneously seen as whitespace, e.g. the a0 from "\xec\x8b\xa0" (MSVC)
+ std::isspace(usc) != 0; //[!] only reached for 7-bit values
+ }
+};
+
+class Scanner //splits the content of a language file into tokens: known tags (see KnownTokens) and the free text between them
+{
+public:
+ Scanner(const std::string& fileStream) : stream(fileStream), pos(stream.begin()) {} //works on a private copy of the input; "pos" iterates over that copy
+
+ Token nextToken() //return the token starting at the current position and advance behind it; TK_END if the input is exhausted
+ {
+ //skip whitespace
+ pos = std::find_if(pos, stream.end(), std::not1(IsWhiteSpace()));
+
+ if (pos == stream.end())
+ return Token(Token::TK_END);
+
+ for (KnownTokens::TokenMap::const_iterator i = KnownTokens::asList().begin(); i != KnownTokens::asList().end(); ++i)
+ if (startsWith(i->second))
+ {
+ pos += i->second.size();
+ return Token(i->first);
+ }
+
+ //rest must be "text": it ends at the next known tag or at the end of input; a '<' that does not start a known tag is part of the text
+ std::string::const_iterator textBegin = pos;
+ while (pos != stream.end() && !startsWithKnownTag())
+ pos = std::find(pos + 1, stream.end(), '<'); //"pos + 1" is safe: pos != stream.end() has just been checked
+
+ std::string text(textBegin, pos);
+
+ normalize(text); //remove whitespace from end etc.
+
+ if (text.empty() && pos == stream.end())
+ return Token(Token::TK_END);
+
+ Token out(Token::TK_TEXT);
+ out.text = text;
+ return out;
+ }
+
+ std::pair<size_t, size_t> position() const //current (row/col) beginning with 1; also valid after the end of input has been reached
+ {
+ //seek beginning of the current line = one behind the last line break before "pos"; never dereferences "pos" itself, which may be stream.end()
+ std::string::const_iterator lineBegin = pos;
+ while (lineBegin != stream.begin() && *(lineBegin - 1) != '\n')
+ --lineBegin;
+
+ return std::make_pair(std::count(stream.begin(), pos, '\n') + 1, pos - lineBegin + 1); //"+ 1": first row/column is 1, also on the first line
+ }
+
+private:
+ bool startsWithKnownTag() const //true if any known tag begins exactly at "pos"
+ {
+ for (KnownTokens::TokenMap::const_iterator i = KnownTokens::asList().begin(); i != KnownTokens::asList().end(); ++i)
+ if (startsWith(i->second))
+ return true;
+ return false;
+ }
+
+ bool startsWith(const std::string& prefix) const //true if the remaining input begins with "prefix"
+ {
+ if (stream.end() - pos < static_cast<int>(prefix.size())) //remaining input is too short: std::equal must not read beyond the end
+ return false;
+ return std::equal(prefix.begin(), prefix.end(), pos);
+ }
+
+ static void normalize(std::string& text) //in-place: strip trailing whitespace and convert all line breaks to '\n'
+ {
+ //remove whitespace from end
+ while (!text.empty() && IsWhiteSpace()(*text.rbegin()))
+ text.resize(text.size() - 1);
+
+ //ensure c-style line breaks
+
+ //Delimiter:
+ //----------
+ //Linux: 0xA \n
+ //Mac: 0xD \r
+ //Win: 0xD 0xA \r\n <- language files are in Windows format
+ if (text.find('\r') != std::string::npos) //nothing to convert otherwise
+ {
+ std::string tmp;
+ for (std::string::const_iterator i = text.begin(); i != text.end(); ++i)
+ if (*i == '\r')
+ {
+ std::string::const_iterator next = i + 1;
+ if (next != text.end() && *next == '\n')
+ ++i; //"\r\n" counts as a single line break: skip the '\n'
+ tmp += '\n';
+ }
+ else
+ tmp += *i;
+ text = tmp;
+ }
+ }
+
+ const std::string stream; //copy of the input; declared before "pos", which is initialized from it
+ std::string::const_iterator pos; //current read position within "stream"; NOTE(review): an implicitly generated copy of Scanner would keep iterating over the source object's "stream"
+};
+
+template <class C, class T> //C: character type of the result (must be given explicitly), T: any type that can be streamed into std::basic_ostringstream<C>
+inline
+std::basic_string<C> numberToString(const T& number) //convert number to string the C++ way
+{
+ std::basic_ostringstream<C> ss; //default-constructed stream: default formatting flags
+ ss << number;
+ return ss.str();
+}
+
+template <class T, class C> //T: numeric result type (must be given explicitly), C: character type (deduced from "str")
+inline
+T stringToNumber(const std::basic_string<C>& str) //convert string to number the C++ way
+{
+ T number = 0; //result stays 0 if "str" does not begin with a number
+ std::basic_istringstream<C>(str) >> number; //stream state is not checked: callers cannot tell a failed conversion from a parsed 0
+ return number;
+}
+
+
+class LngParser //recursive descent parser for language files with a single token of lookahead ("tk")
+{
+public:
+ LngParser(const std::string& fileStream) : scn(fileStream), tk(scn.nextToken()) {} //fetches the first token right away; relies on "scn" being declared before "tk"
+
+ void parse(TranslationMap& out, TranslationPluralMap& pluralOut, TransHeader& header) //throw ParsingError; parses the header followed by all items until end of input
+ {
+ //header
+ parseHeader(header);
+
+ //items
+ while (token().type != Token::TK_END)
+ parseRegular(out, pluralOut, header.pluralCount); //"pluralCount" as read from the header is the expected number of forms per plural item
+ }
+
+ void parseHeader(TransHeader& header) //throw ParsingError; expects the header entries in exactly the order below, each with non-empty text
+ {
+ consumeToken(Token::TK_HEADER_BEGIN);
+
+ consumeToken(Token::TK_LANG_NAME_BEGIN);
+ header.languageName = tk.text; //read first, validate after: "text" is empty if the current token is not TK_TEXT and the next line throws
+ consumeToken(Token::TK_TEXT);
+ consumeToken(Token::TK_LANG_NAME_END);
+
+ consumeToken(Token::TK_TRANS_NAME_BEGIN);
+ header.translatorName = tk.text;
+ consumeToken(Token::TK_TEXT);
+ consumeToken(Token::TK_TRANS_NAME_END);
+
+ consumeToken(Token::TK_LOCALE_NAME_BEGIN);
+ header.localeName = tk.text;
+ consumeToken(Token::TK_TEXT);
+ consumeToken(Token::TK_LOCALE_NAME_END);
+
+ consumeToken(Token::TK_FLAG_FILE_BEGIN);
+ header.flagFile = tk.text;
+ consumeToken(Token::TK_TEXT);
+ consumeToken(Token::TK_FLAG_FILE_END);
+
+ consumeToken(Token::TK_PLURAL_COUNT_BEGIN);
+ header.pluralCount = stringToNumber<int>(tk.text); //text that does not begin with a number yields 0 rather than an error
+ consumeToken(Token::TK_TEXT);
+ consumeToken(Token::TK_PLURAL_COUNT_END);
+
+ consumeToken(Token::TK_PLURAL_DEF_BEGIN);
+ header.pluralDefinition = tk.text;
+ consumeToken(Token::TK_TEXT);
+ consumeToken(Token::TK_PLURAL_DEF_END);
+
+ consumeToken(Token::TK_HEADER_END);
+ }
+
+private:
+ void parseRegular(TranslationMap& out, TranslationPluralMap& pluralOut, int formCount) //throw ParsingError; parses one <source>...</source><target>...</target> item
+ {
+ consumeToken(Token::TK_SRC_BEGIN);
+
+ if (token().type == Token::TK_PLURAL_BEGIN) //source consists of <pluralform> entries: this is a plural item
+ return parsePlural(pluralOut, formCount);
+
+ std::string original = tk.text;
+ consumeToken(Token::TK_TEXT); //source text is mandatory
+ consumeToken(Token::TK_SRC_END);
+
+ consumeToken(Token::TK_TRG_BEGIN);
+ std::string translation;
+ if (token().type == Token::TK_TEXT) //target text is optional: an empty target marks a missing translation
+ {
+ translation = token().text;
+ nextToken();
+ }
+ consumeToken(Token::TK_TRG_END);
+
+ if (!translation.empty()) //only add if translation is existing
+ out.insert(std::make_pair(original, translation)); //std::map::insert: an already existing entry for "original" is kept
+ }
+
+ void parsePlural(TranslationPluralMap& pluralOut, int formCount) //throw ParsingError; parses the remainder of a plural item: singular and plural source, then the translated forms
+ {
+ //Token::TK_SRC_BEGIN already consumed
+
+ consumeToken(Token::TK_PLURAL_BEGIN);
+ std::string engSingular = tk.text;
+ consumeToken(Token::TK_TEXT);
+ consumeToken(Token::TK_PLURAL_END);
+
+ consumeToken(Token::TK_PLURAL_BEGIN);
+ std::string engPlural = tk.text;
+ consumeToken(Token::TK_TEXT);
+ consumeToken(Token::TK_PLURAL_END);
+
+ consumeToken(Token::TK_SRC_END);
+
+ consumeToken(Token::TK_TRG_BEGIN);
+
+ PluralForms pluralList;
+ while (token().type == Token::TK_PLURAL_BEGIN) //collect all translated forms; none at all marks a missing translation
+ {
+ consumeToken(Token::TK_PLURAL_BEGIN);
+ std::string pluralForm = tk.text;
+ consumeToken(Token::TK_TEXT);
+ consumeToken(Token::TK_PLURAL_END);
+ pluralList.push_back(pluralForm);
+
+ }
+
+ if (!pluralList.empty() && static_cast<int>(pluralList.size()) != formCount) //invalid number of plural forms
+ throw ParsingError(scn.position().first, scn.position().second);
+
+ consumeToken(Token::TK_TRG_END);
+
+ if (!pluralList.empty()) //only add if translation is existing
+ pluralOut.insert(std::make_pair(SingularPluralPair(engSingular, engPlural), pluralList)); //std::map::insert: an already existing entry for this pair is kept
+ }
+
+
+ void nextToken() { tk = scn.nextToken(); } //advance the lookahead token
+ const Token& token() const { return tk; } //current lookahead token
+
+ void consumeToken(Token::Type t) //throw ParsingError unless the current token has type "t", otherwise advance
+ {
+ if (token().type != t)
+ throw ParsingError(scn.position().first, scn.position().second); //scanner position: already behind the current (= offending) token
+ nextToken();
+ }
+
+ Scanner scn; //must be declared before "tk": see constructor
+ Token tk; //lookahead: the token returned by the most recent call to scn.nextToken()
+};
+
+
+inline
+void parseLng(const std::string& fileStream, TransHeader& header, TranslationMap& out, TranslationPluralMap& pluralOut) //throw ParsingError
+{
+ out.clear(); //on ParsingError the maps contain only the items parsed up to the error
+ pluralOut.clear();
+
+ //skip UTF-8 Byte Ordering Mark: substr(3) presumably matches the length of zen::BYTE_ORDER_MARK_UTF8, which is defined outside this file
+ LngParser prs(zen::startsWith(fileStream, zen::BYTE_ORDER_MARK_UTF8) ? fileStream.substr(3) : fileStream);
+ prs.parse(out, pluralOut, header);
+}
+
+
+inline
+void parseHeader(const std::string& fileStream, TransHeader& header) //throw ParsingError; reads the header section only, the items following it are not parsed
+{
+ //skip UTF-8 Byte Ordering Mark: substr(3) presumably matches the length of zen::BYTE_ORDER_MARK_UTF8, which is defined outside this file
+ LngParser prs(zen::startsWith(fileStream, zen::BYTE_ORDER_MARK_UTF8) ? fileStream.substr(3) : fileStream);
+ prs.parseHeader(header);
+}
+
+
+inline
+void formatMultiLineText(std::string& text) //in-place: make multi-line text begin and end with a line break; single-line text is not changed
+{
+ if (text.find('\n') != std::string::npos) //multiple lines
+ {
+ if (*text.begin() != '\n') //safe: text contains '\n' and therefore is not empty
+ text = '\n' + text;
+ if (*text.rbegin() != '\n')
+ text += '\n';
+ }
+}
+
+
+const std::string LB = "\n"; //line break as written by generateLng()
+const std::string TAB = "\t"; //indentation of the header entries as written by generateLng()
+
+
+inline void generateLng(const TranslationList& in, const TransHeader& header, std::string& fileStream) //append header and all items of "in" to "fileStream"; throw std::logic_error; "inline": defined in a header => must not violate the one definition rule
+{
+ //header
+ fileStream += KnownTokens::text(Token::TK_HEADER_BEGIN) + LB;
+
+ fileStream += TAB + KnownTokens::text(Token::TK_LANG_NAME_BEGIN);
+ fileStream += header.languageName;
+ fileStream += KnownTokens::text(Token::TK_LANG_NAME_END) + LB;
+
+ fileStream += TAB + KnownTokens::text(Token::TK_TRANS_NAME_BEGIN);
+ fileStream += header.translatorName;
+ fileStream += KnownTokens::text(Token::TK_TRANS_NAME_END) + LB;
+
+ fileStream += TAB + KnownTokens::text(Token::TK_LOCALE_NAME_BEGIN);
+ fileStream += header.localeName;
+ fileStream += KnownTokens::text(Token::TK_LOCALE_NAME_END) + LB;
+
+ fileStream += TAB + KnownTokens::text(Token::TK_FLAG_FILE_BEGIN);
+ fileStream += header.flagFile;
+ fileStream += KnownTokens::text(Token::TK_FLAG_FILE_END) + LB;
+
+ fileStream += TAB + KnownTokens::text(Token::TK_PLURAL_COUNT_BEGIN);
+ fileStream += numberToString<char>(header.pluralCount);
+ fileStream += KnownTokens::text(Token::TK_PLURAL_COUNT_END) + LB;
+
+ fileStream += TAB + KnownTokens::text(Token::TK_PLURAL_DEF_BEGIN);
+ fileStream += header.pluralDefinition;
+ fileStream += KnownTokens::text(Token::TK_PLURAL_DEF_END) + LB;
+
+ fileStream += KnownTokens::text(Token::TK_HEADER_END) + LB;
+
+ fileStream += LB;
+
+
+ //items: written in the order in which they have been added to the list
+ for (std::vector<TranslationList::Item*>::const_iterator i = in.sequence.begin(); i != in.sequence.end(); ++i)
+ {
+ const TranslationList::RegularItem* regular = dynamic_cast<const TranslationList::RegularItem*>(*i); //at most one of the two casts succeeds
+ const TranslationList::PluralItem* plural = dynamic_cast<const TranslationList::PluralItem*>(*i);
+
+ if (regular)
+ {
+ std::string original = regular->value.first; //work on copies: formatMultiLineText() modifies its argument
+ std::string translation = regular->value.second;
+
+ formatMultiLineText(original);
+ formatMultiLineText(translation);
+
+ fileStream += KnownTokens::text(Token::TK_SRC_BEGIN);
+ fileStream += original;
+ fileStream += KnownTokens::text(Token::TK_SRC_END) + LB;
+
+ fileStream += KnownTokens::text(Token::TK_TRG_BEGIN);
+ fileStream += translation; //may be empty: item without translation
+ fileStream += KnownTokens::text(Token::TK_TRG_END) + LB + LB;
+
+ }
+ else if (plural)
+ {
+ std::string engSingular = plural->value.first.first; //work on copies: formatMultiLineText() modifies its argument
+ std::string engPlural = plural->value.first.second;
+ const PluralForms& forms = plural->value.second;
+
+ formatMultiLineText(engSingular);
+ formatMultiLineText(engPlural);
+
+ fileStream += KnownTokens::text(Token::TK_SRC_BEGIN) + LB;
+ fileStream += KnownTokens::text(Token::TK_PLURAL_BEGIN);
+ fileStream += engSingular;
+ fileStream += KnownTokens::text(Token::TK_PLURAL_END) + LB;
+ fileStream += KnownTokens::text(Token::TK_PLURAL_BEGIN);
+ fileStream += engPlural;
+ fileStream += KnownTokens::text(Token::TK_PLURAL_END) + LB;
+ fileStream += KnownTokens::text(Token::TK_SRC_END) + LB;
+
+ fileStream += KnownTokens::text(Token::TK_TRG_BEGIN);
+ if (!forms.empty()) fileStream += LB; //no forms: keep "<target></target>" on a single line
+
+ for (PluralForms::const_iterator j = forms.begin(); j != forms.end(); ++j)
+ {
+ std::string plForm = *j; //copy: formatMultiLineText() modifies its argument
+ formatMultiLineText(plForm);
+
+ fileStream += KnownTokens::text(Token::TK_PLURAL_BEGIN);
+ fileStream += plForm;
+ fileStream += KnownTokens::text(Token::TK_PLURAL_END) + LB;
+ }
+ fileStream += KnownTokens::text(Token::TK_TRG_END) + LB + LB;
+ }
+ else //item type unknown to this function, e.g. a new class derived from TranslationList::Item
+ {
+ throw std::logic_error("that's what you get for brittle design ;)");
+ }
+ }
+}
+}
+
+#endif //PARSE_LNG_HEADER_INCLUDED
bgstack15