// **************************************************************************
// * This file is part of the FreeFileSync project. It is distributed under *
// * GNU General Public License: http://www.gnu.org/licenses/gpl.html       *
// * Copyright (C) ZenJu (zhnmju123 AT gmx DOT de) - All Rights Reserved    *
// **************************************************************************

#ifndef PARSE_LNG_HEADER_INCLUDED
#define PARSE_LNG_HEADER_INCLUDED

#include <algorithm>
#include <cctype>
#include <cstddef>
#include <functional>
#include <list>
#include <map>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

namespace lngfile
{
//singular forms
typedef std::map<std::string, std::string> TranslationMap; //orig |-> translation

//plural forms
typedef std::pair<std::string, std::string> SingularPluralPair; //1 house| n houses
typedef std::vector<std::string> PluralForms;                   //1 dom | 2 domy | 5 domów
typedef std::map<SingularPluralPair, PluralForms> TranslationPluralMap; //(sing/plu) |-> pluralforms

//content of the header section of a language file
struct TransHeader
{
    std::string languageName;     //display name: "English (UK)"
    std::string translatorName;   //"ZenJu"
    std::string localeName;       //ISO 639 language code + ISO 3166 country code, e.g. "en_GB", or "en_US"
    std::string flagFile;         //"england.png"
    int pluralCount;              //2
    std::string pluralDefinition; //"n == 1 ? 0 : 1"
};

//thrown by the parsing functions; carries the scanner position (row/col, both beginning with 1)
struct ParsingError
{
    ParsingError(size_t rowNo, size_t colNo) : row(rowNo), col(colNo) {}
    size_t row;
    size_t col;
};

//parse a complete language file: fills header, "out" and "pluralOut" (both maps are cleared first);
//items without translation are skipped
void parseLng(const std::string& fileStream, TransHeader& header, TranslationMap& out, TranslationPluralMap& pluralOut); //throw ParsingError

//parse only the header section of a language file
void parseHeader(const std::string& fileStream, TransHeader& header); //throw ParsingError

class TranslationList; //unordered list of unique translation items

//append the textual representation of header + all items of "in" to fileStream
void generateLng(const TranslationList& in, const TransHeader& header, std::string& fileStream);


//--------------------------- implementation ---------------------------

class TranslationList //unordered list of unique translation items
{
public:
    TranslationList() {}

    //"sequence" holds pointers into this object's own lists, so a member-wise copy would
    //point into the source object => copy the data, then relink against the copied lists
    TranslationList(const TranslationList& other) :
        dump(other.dump),
        dumpPlural(other.dumpPlural),
        transUnique(other.transUnique),
        pluralUnique(other.pluralUnique)
    {
        relinkSequence(other);
    }

    TranslationList& operator=(const TranslationList& other)
    {
        TranslationList tmp(other);
        swap(tmp); //std::list::swap keeps element addresses valid
        return *this;
    }

    //add a regular item; ignored if an item with the same original text already exists
    void addItem(const std::string& orig, const std::string& trans)
    {
        if (!transUnique.insert(orig).second)
            return;

        dump.push_back(RegularItem(std::make_pair(orig, trans)));
        sequence.push_back(&dump.back());
    }

    //add a plural item; ignored if an item with the same singular/plural pair already exists
    void addPluralItem(const SingularPluralPair& orig, const PluralForms& trans)
    {
        if (!pluralUnique.insert(orig).second)
            return;

        dumpPlural.push_back(PluralItem(std::make_pair(orig, trans)));
        sequence.push_back(&dumpPlural.back());
    }

    //true if at least one regular or plural item has an empty translation
    bool untranslatedTextExists() const
    {
        for (std::list<RegularItem>::const_iterator i = dump.begin(); i != dump.end(); ++i)
            if (i->value.second.empty())
                return true;

        for (std::list<PluralItem>::const_iterator i = dumpPlural.begin(); i != dumpPlural.end(); ++i)
            if (i->value.second.empty())
                return true;

        return false;
    }

private:
    friend void generateLng(const TranslationList& in, const TransHeader& header, std::string& fileStream);

    struct Item { virtual ~Item() {} };

    struct RegularItem : public Item
    {
        RegularItem(const TranslationMap::value_type& val) : value(val) {}
        TranslationMap::value_type value;
    };

    struct PluralItem : public Item
    {
        PluralItem(const TranslationPluralMap::value_type& val) : value(val) {}
        TranslationPluralMap::value_type value;
    };

    //rebuild "sequence" in the order of other.sequence, but pointing at our own list elements:
    //both lists are filled in insertion order, so the k-th regular (plural) entry of the
    //sequence is the k-th element of dump (dumpPlural)
    void relinkSequence(const TranslationList& other)
    {
        sequence.clear();
        sequence.reserve(other.sequence.size());

        std::list<RegularItem>::iterator nextRegular = dump.begin();
        std::list<PluralItem>::iterator  nextPlural  = dumpPlural.begin();

        for (std::vector<Item*>::const_iterator i = other.sequence.begin(); i != other.sequence.end(); ++i)
        {
            if (dynamic_cast<const RegularItem*>(*i))
            {
                sequence.push_back(&*nextRegular);
                ++nextRegular;
            }
            else
            {
                sequence.push_back(&*nextPlural);
                ++nextPlural;
            }
        }
    }

    void swap(TranslationList& other)
    {
        sequence    .swap(other.sequence);
        dump        .swap(other.dump);
        dumpPlural  .swap(other.dumpPlural);
        transUnique .swap(other.transUnique);
        pluralUnique.swap(other.pluralUnique);
    }

    std::vector<Item*> sequence;       //items in insertion order; pointers into dump/dumpPlural
    std::list<RegularItem> dump;       //manage memory (std::list => stable element addresses)
    std::list<PluralItem>  dumpPlural; //manage memory

    std::set<TranslationMap::key_type>       transUnique;  //check uniqueness
    std::set<TranslationPluralMap::key_type> pluralUnique; //
};


//lexical element of a language file
struct Token
{
    enum Type
    {
        //header information
        TK_HEADER_BEGIN,
        TK_HEADER_END,
        TK_LANG_NAME_BEGIN,
        TK_LANG_NAME_END,
        TK_TRANS_NAME_BEGIN,
        TK_TRANS_NAME_END,
        TK_LOCALE_NAME_BEGIN,
        TK_LOCALE_NAME_END,
        TK_FLAG_FILE_BEGIN,
        TK_FLAG_FILE_END,
        TK_PLURAL_COUNT_BEGIN,
        TK_PLURAL_COUNT_END,
        TK_PLURAL_DEF_BEGIN,
        TK_PLURAL_DEF_END,

        //item level
        TK_SRC_BEGIN,
        TK_SRC_END,
        TK_TRG_BEGIN,
        TK_TRG_END,
        TK_TEXT,
        TK_PLURAL_BEGIN,
        TK_PLURAL_END,
        TK_END
    };

    Token(Type t) : type(t) {}

    Type type;
    std::string text; //only filled for TK_TEXT
};


//mapping "token type |-> tag text" for all tokens that have a fixed textual representation
class KnownTokens
{
public:
    typedef std::map<Token::Type, std::string> TokenMap;

    static const TokenMap& asList()
    {
        static KnownTokens inst;
        return inst.tokens;
    }

    //tag text of the given token type; empty string if the type has none (TK_TEXT, TK_END)
    static std::string text(Token::Type t)
    {
        const TokenMap& known = asList();
        TokenMap::const_iterator iter = known.find(t);
        return iter != known.end() ? iter->second : std::string();
    }

private:
    KnownTokens()
    {
        //header information
        tokens.insert(std::make_pair(Token::TK_HEADER_BEGIN,       "<header>"));
        tokens.insert(std::make_pair(Token::TK_HEADER_END,         "</header>"));
        tokens.insert(std::make_pair(Token::TK_LANG_NAME_BEGIN,    "<language name>"));
        tokens.insert(std::make_pair(Token::TK_LANG_NAME_END,      "</language name>"));
        tokens.insert(std::make_pair(Token::TK_TRANS_NAME_BEGIN,   "<translator>"));
        tokens.insert(std::make_pair(Token::TK_TRANS_NAME_END,     "</translator>"));
        tokens.insert(std::make_pair(Token::TK_LOCALE_NAME_BEGIN,  "<locale>"));
        tokens.insert(std::make_pair(Token::TK_LOCALE_NAME_END,    "</locale>"));
        tokens.insert(std::make_pair(Token::TK_FLAG_FILE_BEGIN,    "<flag file>"));
        tokens.insert(std::make_pair(Token::TK_FLAG_FILE_END,      "</flag file>"));
        tokens.insert(std::make_pair(Token::TK_PLURAL_COUNT_BEGIN, "<plural forms>"));
        tokens.insert(std::make_pair(Token::TK_PLURAL_COUNT_END,   "</plural forms>"));
        tokens.insert(std::make_pair(Token::TK_PLURAL_DEF_BEGIN,   "<plural definition>"));
        tokens.insert(std::make_pair(Token::TK_PLURAL_DEF_END,     "</plural definition>"));

        //item level
        tokens.insert(std::make_pair(Token::TK_SRC_BEGIN,    "<source>"));
        tokens.insert(std::make_pair(Token::TK_SRC_END,      "</source>"));
        tokens.insert(std::make_pair(Token::TK_TRG_BEGIN,    "<target>"));
        tokens.insert(std::make_pair(Token::TK_TRG_END,      "</target>"));
        tokens.insert(std::make_pair(Token::TK_PLURAL_BEGIN, "<pluralform>"));
        tokens.insert(std::make_pair(Token::TK_PLURAL_END,   "</pluralform>"));
    }

    TokenMap tokens;
};


//predicate: is "c" an ASCII whitespace character?
struct IsWhiteSpace
{
    //adaptable-functor typedefs spelled out by hand: std::unary_function is gone in newer standards
    typedef char argument_type;
    typedef bool result_type;

    bool operator()(char c) const
    {
        const unsigned char usc = static_cast<unsigned char>(c); //caveat 1: std::isspace() takes an int, but expects an unsigned char
        return usc < 128 &&          //caveat 2: some parts of UTF-8 chars are erroneously seen as whitespace, e.g. the a0 from "\xec\x8b\xa0" (MSVC)
               std::isspace(usc) != 0; //[!]
    }
};


//splits the file content into known tags and free text
class Scanner
{
public:
    Scanner(const std::string& fileStream) : stream(fileStream), pos(stream.begin()) {}

    Token nextToken()
    {
        //skip whitespace
        while (pos != stream.end() && IsWhiteSpace()(*pos))
            ++pos;

        if (pos == stream.end())
            return Token(Token::TK_END);

        const KnownTokens::TokenMap& known = KnownTokens::asList();
        for (KnownTokens::TokenMap::const_iterator i = known.begin(); i != known.end(); ++i)
            if (startsWith(i->second))
            {
                pos += i->second.size();
                return Token(i->first);
            }

        //rest must be "text": everything up to the next known tag (a '<' that does not start a tag is part of the text)
        const std::string::const_iterator textBegin = pos;
        while (pos != stream.end() && !startsWithKnownTag())
            pos = std::find(pos + 1, stream.end(), '<');

        std::string text(textBegin, pos);
        normalize(text); //remove whitespace from end etc.

        if (text.empty() && pos == stream.end())
            return Token(Token::TK_END);

        Token out(Token::TK_TEXT);
        out.text = text;
        return out;
    }

    std::pair<size_t, size_t> position() const //current (row/col) beginning with 1
    {
        //seek beginning of current line: one past the last line break located *before* pos
        //(pos itself is never dereferenced: it may be stream.end())
        std::string::const_iterator lineBegin = pos;
        while (lineBegin != stream.begin() && *(lineBegin - 1) != '\n')
            --lineBegin;

        const size_t row = static_cast<size_t>(std::count(stream.begin(), pos, '\n')) + 1;
        const size_t col = static_cast<size_t>(pos - lineBegin) + 1;
        return std::make_pair(row, col);
    }

private:
    bool startsWithKnownTag() const
    {
        const KnownTokens::TokenMap& known = KnownTokens::asList();
        for (KnownTokens::TokenMap::const_iterator i = known.begin(); i != known.end(); ++i)
            if (startsWith(i->second))
                return true;
        return false;
    }

    //does the unread part of the stream begin with "prefix"?
    bool startsWith(const std::string& prefix) const
    {
        if (stream.end() - pos < static_cast<std::ptrdiff_t>(prefix.size()))
            return false;
        return std::equal(prefix.begin(), prefix.end(), pos);
    }

    static void normalize(std::string& text)
    {
        //remove whitespace from end
        while (!text.empty() && IsWhiteSpace()(*text.rbegin()))
            text.resize(text.size() - 1);

        //ensure c-style line breaks

        //Delimiter:
        //----------
        //Linux: 0xA        \n
        //Mac:   0xD        \r
        //Win:   0xD 0xA    \r\n    <- language files are in Windows format
        if (text.find('\r') == std::string::npos)
            return;

        std::string unified;
        unified.reserve(text.size());
        for (std::string::const_iterator i = text.begin(); i != text.end(); ++i)
        {
            if (*i != '\r')
            {
                unified += *i;
                continue;
            }

            //"\r\n" as well as a lone "\r" become a single "\n"
            const std::string::const_iterator next = i + 1;
            if (next != text.end() && *next == '\n')
                ++i;
            unified += '\n';
        }
        text.swap(unified);
    }

    const std::string stream;
    std::string::const_iterator pos; //points into "stream"
};


template <class C, class T>
inline
std::basic_string<C> numberToString(const T& number) //convert number to string the C++ way
{
    std::basic_ostringstream<C> ss;
    ss << number;
    return ss.str();
}


template <class T, class C>
inline
T stringToNumber(const std::basic_string<C>& str) //convert string to number the C++ way; 0 if str is no number
{
    T number = 0;
    std::basic_istringstream<C>(str) >> number;
    return number;
}


//recursive-descent parser on top of Scanner; always holds one token of look-ahead
class LngParser
{
public:
    LngParser(const std::string& fileStream) : scn(fileStream), tk(scn.nextToken()) {}

    void parse(TranslationMap& out, TranslationPluralMap& pluralOut, TransHeader& header) //throw ParsingError
    {
        //header
        parseHeader(header);

        //items
        while (token().type != Token::TK_END)
            parseRegular(out, pluralOut, header.pluralCount);
    }

    void parseHeader(TransHeader& header) //throw ParsingError
    {
        consumeToken(Token::TK_HEADER_BEGIN);

        header.languageName     = parseElement(Token::TK_LANG_NAME_BEGIN,   Token::TK_LANG_NAME_END);
        header.translatorName   = parseElement(Token::TK_TRANS_NAME_BEGIN,  Token::TK_TRANS_NAME_END);
        header.localeName       = parseElement(Token::TK_LOCALE_NAME_BEGIN, Token::TK_LOCALE_NAME_END);
        header.flagFile         = parseElement(Token::TK_FLAG_FILE_BEGIN,   Token::TK_FLAG_FILE_END);
        header.pluralCount      = stringToNumber<int>(parseElement(Token::TK_PLURAL_COUNT_BEGIN, Token::TK_PLURAL_COUNT_END));
        header.pluralDefinition = parseElement(Token::TK_PLURAL_DEF_BEGIN,  Token::TK_PLURAL_DEF_END);

        consumeToken(Token::TK_HEADER_END);
    }

private:
    //<beginTag> text <endTag>; the text is mandatory
    std::string parseElement(Token::Type beginTag, Token::Type endTag)
    {
        consumeToken(beginTag);
        const std::string content = tk.text;
        consumeToken(Token::TK_TEXT);
        consumeToken(endTag);
        return content;
    }

    void parseRegular(TranslationMap& out, TranslationPluralMap& pluralOut, int formCount)
    {
        consumeToken(Token::TK_SRC_BEGIN);

        if (token().type == Token::TK_PLURAL_BEGIN)
        {
            parsePlural(pluralOut, formCount);
            return;
        }

        const std::string original = tk.text;
        consumeToken(Token::TK_TEXT);
        consumeToken(Token::TK_SRC_END);

        consumeToken(Token::TK_TRG_BEGIN);
        std::string translation;
        if (token().type == Token::TK_TEXT) //translation is optional
        {
            translation = token().text;
            nextToken();
        }
        consumeToken(Token::TK_TRG_END);

        if (!translation.empty()) //only add if translation is existing
            out.insert(std::make_pair(original, translation));
    }

    void parsePlural(TranslationPluralMap& pluralOut, int formCount)
    {
        //Token::TK_SRC_BEGIN already consumed
        const std::string engSingular = parseElement(Token::TK_PLURAL_BEGIN, Token::TK_PLURAL_END);
        const std::string engPlural   = parseElement(Token::TK_PLURAL_BEGIN, Token::TK_PLURAL_END);
        consumeToken(Token::TK_SRC_END);

        consumeToken(Token::TK_TRG_BEGIN);
        PluralForms pluralList;
        while (token().type == Token::TK_PLURAL_BEGIN)
            pluralList.push_back(parseElement(Token::TK_PLURAL_BEGIN, Token::TK_PLURAL_END));

        if (!pluralList.empty() && static_cast<int>(pluralList.size()) != formCount) //invalid number of plural forms
            throwError();

        consumeToken(Token::TK_TRG_END);

        if (!pluralList.empty()) //only add if translation is existing
            pluralOut.insert(std::make_pair(SingularPluralPair(engSingular, engPlural), pluralList));
    }

    void nextToken() { tk = scn.nextToken(); }
    const Token& token() const { return tk; }

    void throwError() const
    {
        const std::pair<size_t, size_t> rowCol = scn.position();
        throw ParsingError(rowCol.first, rowCol.second);
    }

    //advance if the current token has type t, else throw ParsingError
    void consumeToken(Token::Type t)
    {
        if (token().type != t)
            throwError();
        nextToken();
    }

    Scanner scn;
    Token tk; //look-ahead
};


//return the file content without a leading UTF-8 Byte Ordering Mark (EF BB BF), if there is one
inline
std::string removeUtf8Bom(const std::string& fileStream)
{
    const char bom[] = "\xEF\xBB\xBF";
    const size_t bomLen = sizeof(bom) - 1;

    if (fileStream.compare(0, bomLen, bom) == 0)
        return fileStream.substr(bomLen);
    return fileStream;
}


inline
void parseLng(const std::string& fileStream, TransHeader& header, TranslationMap& out, TranslationPluralMap& pluralOut) //throw ParsingError
{
    out.clear();
    pluralOut.clear();

    LngParser prs(removeUtf8Bom(fileStream));
    prs.parse(out, pluralOut, header);
}


inline
void parseHeader(const std::string& fileStream, TransHeader& header) //throw ParsingError
{
    LngParser prs(removeUtf8Bom(fileStream));
    prs.parseHeader(header);
}


//put multi-line text on lines of its own by framing it with line breaks
inline
void formatMultiLineText(std::string& text)
{
    if (text.find('\n') != std::string::npos) //multiple lines
    {
        if (*text.begin() != '\n')
            text = '\n' + text;
        if (*text.rbegin() != '\n')
            text += '\n';
    }
}


const std::string LB  = "\n";
const std::string TAB = "\t";


//one indented header line: TAB <beginTag> value <endTag> LB
inline
void appendHeaderLine(std::string& fileStream, Token::Type beginTag, const std::string& value, Token::Type endTag)
{
    fileStream += TAB + KnownTokens::text(beginTag);
    fileStream += value;
    fileStream += KnownTokens::text(endTag) + LB;
}


//one plural form element followed by a line break
inline
void appendPluralForm(std::string& fileStream, std::string form)
{
    formatMultiLineText(form);
    fileStream += KnownTokens::text(Token::TK_PLURAL_BEGIN);
    fileStream += form;
    fileStream += KnownTokens::text(Token::TK_PLURAL_END) + LB;
}


//defined in a header => must be inline like parseLng()/parseHeader(), otherwise each including
//translation unit emits its own definition
inline
void generateLng(const TranslationList& in, const TransHeader& header, std::string& fileStream)
{
    //header
    fileStream += KnownTokens::text(Token::TK_HEADER_BEGIN) + LB;

    appendHeaderLine(fileStream, Token::TK_LANG_NAME_BEGIN,    header.languageName,                     Token::TK_LANG_NAME_END);
    appendHeaderLine(fileStream, Token::TK_TRANS_NAME_BEGIN,   header.translatorName,                   Token::TK_TRANS_NAME_END);
    appendHeaderLine(fileStream, Token::TK_LOCALE_NAME_BEGIN,  header.localeName,                       Token::TK_LOCALE_NAME_END);
    appendHeaderLine(fileStream, Token::TK_FLAG_FILE_BEGIN,    header.flagFile,                         Token::TK_FLAG_FILE_END);
    appendHeaderLine(fileStream, Token::TK_PLURAL_COUNT_BEGIN, numberToString<char>(header.pluralCount), Token::TK_PLURAL_COUNT_END);
    appendHeaderLine(fileStream, Token::TK_PLURAL_DEF_BEGIN,   header.pluralDefinition,                 Token::TK_PLURAL_DEF_END);

    fileStream += KnownTokens::text(Token::TK_HEADER_END) + LB;
    fileStream += LB;

    //items, in the order they were added
    for (std::vector<TranslationList::Item*>::const_iterator i = in.sequence.begin(); i != in.sequence.end(); ++i)
    {
        const TranslationList::RegularItem* regular = dynamic_cast<const TranslationList::RegularItem*>(*i);
        const TranslationList::PluralItem*  plural  = dynamic_cast<const TranslationList::PluralItem*>(*i);

        if (regular)
        {
            std::string original    = regular->value.first;
            std::string translation = regular->value.second;

            formatMultiLineText(original);
            formatMultiLineText(translation);

            fileStream += KnownTokens::text(Token::TK_SRC_BEGIN);
            fileStream += original;
            fileStream += KnownTokens::text(Token::TK_SRC_END) + LB;

            fileStream += KnownTokens::text(Token::TK_TRG_BEGIN);
            fileStream += translation;
            fileStream += KnownTokens::text(Token::TK_TRG_END) + LB + LB;
        }
        else if (plural)
        {
            const PluralForms& forms = plural->value.second;

            fileStream += KnownTokens::text(Token::TK_SRC_BEGIN) + LB;
            appendPluralForm(fileStream, plural->value.first.first);  //english singular
            appendPluralForm(fileStream, plural->value.first.second); //english plural
            fileStream += KnownTokens::text(Token::TK_SRC_END) + LB;

            fileStream += KnownTokens::text(Token::TK_TRG_BEGIN);
            if (!forms.empty())
                fileStream += LB;

            for (PluralForms::const_iterator j = forms.begin(); j != forms.end(); ++j)
                appendPluralForm(fileStream, *j);

            fileStream += KnownTokens::text(Token::TK_TRG_END) + LB + LB;
        }
        else
        {
            throw std::logic_error("that's what you get for brittle design ;)");
        }
    }
}
}

#endif //PARSE_LNG_HEADER_INCLUDED