8.9

author: Daniel Wilhelm <shieldwed@outlook.com> 2017-02-13 21:25:04 -0700
committer: Daniel Wilhelm <shieldwed@outlook.com> 2017-02-13 21:25:04 -0700
commit: 9d071d2a2cec9a7662a02669488569a017f0ea35 (patch)
tree: c83a623fbdff098339b66d21ea2e81f3f67344ae /zen/utf.h
parent: 8.8 (diff)
download: FreeFileSync-9d071d2a2cec9a7662a02669488569a017f0ea35.tar.gz
FreeFileSync-9d071d2a2cec9a7662a02669488569a017f0ea35.tar.bz2
FreeFileSync-9d071d2a2cec9a7662a02669488569a017f0ea35.zip
1 files changed, 476 insertions, 476 deletions
diff --git a/zen/utf.h b/zen/utf.h
index 1544c9ab..41fdf58c 100644..100755
--- a/zen/utf.h
+++ b/zen/utf.h
@@ -1,476 +1,476 @@
-// *****************************************************************************
-// * This file is part of the FreeFileSync project. It is distributed under    *
-// * GNU General Public License: http://www.gnu.org/licenses/gpl-3.0           *
-// * Copyright (C) Zenju (zenju AT freefilesync DOT org) - All Rights Reserved *
-// *****************************************************************************
-
-#ifndef UTF_H_01832479146991573473545
-#define UTF_H_01832479146991573473545
-
-#include <cstdint>
-#include <iterator>
-#include "string_tools.h" //copyStringTo
-
-namespace zen
-{
-//convert all(!) char- and wchar_t-based "string-like" objects applying a UTF8 conversions (but only if necessary!)
-template <class TargetString, class SourceString>
-TargetString utfCvrtTo(const SourceString& str);
-
-const char BYTE_ORDER_MARK_UTF8[] = "\xEF\xBB\xBF";
-
-template <class CharString>
-bool isValidUtf8(const CharString& str); //check for UTF-8 encoding errors
-
-//---- explicit conversion: wide <-> utf8 ----
-template <class CharString, class WideString>
-CharString wideToUtf8(const WideString& str); //example: std::string tmp = wideToUtf8<std::string>(L"abc");
-
-template <class WideString, class CharString>
-WideString utf8ToWide(const CharString& str); //std::wstring tmp = utf8ToWide<std::wstring>("abc");
-
-//access unicode characters in UTF-encoded string (char- or wchar_t-based)
-template <class UtfString>
-size_t unicodeLength(const UtfString& str); //return number of code points for UTF-encoded string
-
-template <class UtfString>
-size_t findUnicodePos(const UtfString& str, size_t unicodePos); //return position of unicode char in UTF-encoded string
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-//----------------------- implementation ----------------------------------
-namespace implementation
-{
-using CodePoint = std::uint32_t;
-using Char16    = std::uint16_t;
-using Char8     = unsigned char;
-
-const CodePoint LEAD_SURROGATE      = 0xd800;
-const CodePoint TRAIL_SURROGATE     = 0xdc00; //== LEAD_SURROGATE_MAX + 1
-const CodePoint TRAIL_SURROGATE_MAX = 0xdfff;
-
-const CodePoint REPLACEMENT_CHAR    = 0xfffd;
-const CodePoint CODE_POINT_MAX      = 0x10ffff;
-
-
-template <class Function> inline
-void codePointToUtf16(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char16
-{
-    //http://en.wikipedia.org/wiki/UTF-16
-
-    if (cp < LEAD_SURROGATE)
-        writeOutput(static_cast<Char16>(cp));
-    else if (cp <= TRAIL_SURROGATE_MAX) //invalid code point
-        codePointToUtf16(REPLACEMENT_CHAR, writeOutput); //resolves to 1-character utf16
-    else if (cp < 0x10000)
-        writeOutput(static_cast<Char16>(cp));
-    else if (cp <= CODE_POINT_MAX)
-    {
-        cp -= 0x10000;
-        writeOutput(LEAD_SURROGATE  + static_cast<Char16>(cp >> 10));
-        writeOutput(TRAIL_SURROGATE + static_cast<Char16>(cp & 0x3ff));
-    }
-    else //invalid code point
-        codePointToUtf16(REPLACEMENT_CHAR, writeOutput); //resolves to 1-character utf16
-}
-
-
-inline
-size_t getUtf16Len(Char16 ch) //ch must be first code unit! returns 0 on error!
-{
-    if (ch < LEAD_SURROGATE)
-        return 1;
-    else if (ch < TRAIL_SURROGATE)
-        return 2;
-    else if (ch <= TRAIL_SURROGATE_MAX)
-        return 0; //unexpected trail surrogate!
-    else
-        return 1;
-}
-
-
-template <class CharIterator, class Function> inline
-void utf16ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint
-{
-    static_assert(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 2, "");
-
-    for ( ; first != last; ++first)
-    {
-        CodePoint cp = static_cast<Char16>(*first);
-        switch (getUtf16Len(static_cast<Char16>(cp)))
-        {
-            case 0: //invalid utf16 character
-                cp = REPLACEMENT_CHAR;
-                break;
-            case 1:
-                break;
-            case 2:
-                if (++first != last) //trail surrogate expected!
-                {
-                    const Char16 ch = static_cast<Char16>(*first);
-                    if (TRAIL_SURROGATE <= ch && ch <= TRAIL_SURROGATE_MAX) //trail surrogate expected!
-                    {
-                        cp = ((cp - LEAD_SURROGATE) << 10) + (ch - TRAIL_SURROGATE) + 0x10000;
-                        break;
-                    }
-                }
-                --first;
-                cp = REPLACEMENT_CHAR;
-                break;
-        }
-        writeOutput(cp);
-    }
-}
-
-
-template <class Function> inline
-void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char8
-{
-    //http://en.wikipedia.org/wiki/UTF-8
-    //assert(cp < LEAD_SURROGATE || TRAIL_SURROGATE_MAX < cp); //code points [0xd800, 0xdfff] are reserved for UTF-16 and *should* not be encoded in UTF-8
-
-    if (cp < 0x80)
-        writeOutput(static_cast<Char8>(cp));
-    else if (cp < 0x800)
-    {
-        writeOutput(static_cast<Char8>((cp >> 6  ) | 0xc0));
-        writeOutput(static_cast<Char8>((cp & 0x3f) | 0x80));
-    }
-    else if (cp < 0x10000)
-    {
-        writeOutput(static_cast<Char8>( (cp >> 12       ) | 0xe0));
-        writeOutput(static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80));
-        writeOutput(static_cast<Char8>( (cp & 0x3f      ) | 0x80));
-    }
-    else if (cp <= CODE_POINT_MAX)
-    {
-        writeOutput(static_cast<Char8>( (cp >> 18        ) | 0xf0));
-        writeOutput(static_cast<Char8>(((cp >> 12) & 0x3f) | 0x80));
-        writeOutput(static_cast<Char8>(((cp >> 6)  & 0x3f) | 0x80));
-        writeOutput(static_cast<Char8>( (cp & 0x3f       ) | 0x80));
-    }
-    else //invalid code point
-        codePointToUtf8(REPLACEMENT_CHAR, writeOutput); //resolves to 3-byte utf8
-}
-
-
-inline
-size_t getUtf8Len(unsigned char ch) //ch must be first code unit! returns 0 on error!
-{
-    if (ch < 0x80)
-        return 1;
-    if (ch >> 5 == 0x6)
-        return 2;
-    if (ch >> 4 == 0xe)
-        return 3;
-    if (ch >> 3 == 0x1e)
-        return 4;
-    return 0; //innvalid begin of UTF8 encoding
-}
-
-
-template <class CharIterator> inline
-bool decodeTrail(CharIterator& first, CharIterator last, CodePoint& cp) //decode trailing surrogate byte
-{
-    if (++first != last) //trail surrogate expected!
-    {
-        const Char8 ch = static_cast<Char8>(*first);
-        if (ch >> 6 == 0x2) //trail surrogate expected!
-        {
-            cp = (cp << 6) + (ch & 0x3f);
-            return true;
-        }
-    }
-    --first;
-    cp = REPLACEMENT_CHAR;
-    return false;
-}
-
-template <class CharIterator, class Function> inline
-void utf8ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint
-{
-    static_assert(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 1, "");
-
-    for ( ; first != last; ++first)
-    {
-        CodePoint cp = static_cast<Char8>(*first);
-        switch (getUtf8Len(static_cast<Char8>(cp)))
-        {
-            case 0: //invalid utf8 character
-                cp = REPLACEMENT_CHAR;
-                break;
-            case 1:
-                break;
-            case 2:
-                cp &= 0x1f;
-                decodeTrail(first, last, cp);
-                break;
-            case 3:
-                cp &= 0xf;
-                if (decodeTrail(first, last, cp))
-                    decodeTrail(first, last, cp);
-                break;
-            case 4:
-                cp &= 0x7;
-                if (decodeTrail(first, last, cp))
-                    if (decodeTrail(first, last, cp))
-                        decodeTrail(first, last, cp);
-                if (cp > CODE_POINT_MAX) cp = REPLACEMENT_CHAR;
-                break;
-        }
-        writeOutput(cp);
-    }
-}
-
-
-template <class CharString> inline
-size_t unicodeLength(const CharString& str, char) //utf8
-{
-    using CharType = typename GetCharType<CharString>::Type;
-
-    const CharType*       strFirst  = strBegin(str);
-    const CharType* const strLast   = strFirst + strLength(str);
-
-    size_t len = 0;
-    while (strFirst < strLast) //[!]
-    {
-        ++len;
-        size_t utf8len = getUtf8Len(*strFirst);
-        if (utf8len == 0) ++utf8len; //invalid utf8 character
-        strFirst += utf8len;
-    }
-    return len;
-}
-
-
-template <class WideString> inline
-size_t unicodeLengthWide(const WideString& str, Int2Type<2>) //windows: utf16-wchar_t
-{
-    using CharType = typename GetCharType<WideString>::Type;
-
-    const CharType*       strFirst = strBegin(str);
-    const CharType* const strLast  = strFirst + strLength(str);
-
-    size_t len = 0;
-    while (strFirst < strLast) //[!]
-    {
-        ++len;
-        size_t utf16len = getUtf16Len(*strFirst);
-        if (utf16len == 0) ++utf16len; //invalid utf16 character
-        strFirst += utf16len;
-    }
-    return len;
-}
-
-
-template <class WideString> inline
-size_t unicodeLengthWide(const WideString& str, Int2Type<4>) //other OS: utf32-wchar_t
-{
-    return strLength(str);
-}
-
-
-template <class WideString> inline
-size_t unicodeLength(const WideString& str, wchar_t)
-{
-    return unicodeLengthWide(str, Int2Type<sizeof(wchar_t)>());
-}
-}
-
-
-template <class UtfString> inline
-size_t unicodeLength(const UtfString& str) //return number of code points
-{
-    return implementation::unicodeLength(str, typename GetCharType<UtfString>::Type());
-}
-
-
-namespace implementation
-{
-template <class CharString> inline
-size_t findUnicodePos(const CharString& str, size_t unicodePos, char) //utf8-char
-{
-    using CharType = typename GetCharType<CharString>::Type;
-
-    const CharType* strFirst = strBegin(str);
-    const size_t strLen = strLength(str);
-
-    size_t utfPos = 0;
-    while (unicodePos-- > 0)
-    {
-        if (utfPos >= strLen)
-            return strLen;
-
-        size_t utf8len = getUtf8Len(strFirst[utfPos]);
-        if (utf8len == 0) ++utf8len; //invalid utf8 character
-        utfPos += utf8len;
-    }
-    if (utfPos >= strLen)
-        return strLen;
-    return utfPos;
-}
-
-
-template <class WideString> inline
-size_t findUnicodePosWide(const WideString& str, size_t unicodePos, Int2Type<2>) //windows: utf16-wchar_t
-{
-    using CharType = typename GetCharType<WideString>::Type;
-
-    const CharType* strFirst = strBegin(str);
-    const size_t strLen = strLength(str);
-
-    size_t utfPos = 0;
-    while (unicodePos-- > 0)
-    {
-        if (utfPos >= strLen)
-            return strLen;
-
-        size_t utf16len = getUtf16Len(strFirst[utfPos]);
-        if (utf16len == 0) ++utf16len; //invalid utf16 character
-        utfPos += utf16len;
-    }
-    if (utfPos >= strLen)
-        return strLen;
-    return utfPos;
-}
-
-
-template <class WideString> inline
-size_t findUnicodePosWide(const WideString& str, size_t unicodePos, Int2Type<4>) //other OS: utf32-wchar_t
-{
-    return std::min(strLength(str), unicodePos);
-}
-
-
-template <class UtfString> inline
-size_t findUnicodePos(const UtfString& str, size_t unicodePos, wchar_t)
-{
-    return findUnicodePosWide(str, unicodePos, Int2Type<sizeof(wchar_t)>());
-}
-}
-
-
-template <class UtfString> inline
-size_t findUnicodePos(const UtfString& str, size_t unicodePos) //return position of unicode char in UTF-encoded string
-{
-    return implementation::findUnicodePos(str, unicodePos, typename GetCharType<UtfString>::Type());
-}
-
-//-------------------------------------------------------------------------------------------
-
-namespace implementation
-{
-template <class WideString, class CharString> inline
-WideString utf8ToWide(const CharString& str, Int2Type<2>) //windows: convert utf8 to utf16-wchar_t
-{
-    WideString output;
-    utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
-    [&](CodePoint cp) { codePointToUtf16(cp, [&](Char16 c) { output += static_cast<wchar_t>(c); }); });
-    return output;
-}
-
-
-template <class WideString, class CharString> inline
-WideString utf8ToWide(const CharString& str, Int2Type<4>) //other OS: convert utf8 to utf32-wchar_t
-{
-    WideString output;
-    utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
-    [&](CodePoint cp) { output += static_cast<wchar_t>(cp); });
-    return output;
-}
-
-
-template <class CharString, class WideString> inline
-CharString wideToUtf8(const WideString& str, Int2Type<2>) //windows: convert utf16-wchar_t to utf8
-{
-    CharString output;
-    utf16ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
-    [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); });
-    return output;
-}
-
-
-template <class CharString, class WideString> inline
-CharString wideToUtf8(const WideString& str, Int2Type<4>) //other OS: convert utf32-wchar_t to utf8
-{
-    CharString output;
-    std::for_each(strBegin(str), strBegin(str) + strLength(str),
-    [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); });
-    return output;
-}
-}
-
-
-template <class CharString> inline
-bool isValidUtf8(const CharString& str)
-{
-    using namespace implementation;
-    bool valid = true;
-    utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
-                    [&](CodePoint cp)
-    {
-        if (cp == REPLACEMENT_CHAR)
-            valid = false; //perf: should we use an (expensive) exception for iteration break?
-    });
-    return valid;
-}
-
-
-template <class WideString, class CharString> inline
-WideString utf8ToWide(const CharString& str)
-{
-    static_assert(IsSameType<typename GetCharType<CharString>::Type, char   >::value, "");
-    static_assert(IsSameType<typename GetCharType<WideString>::Type, wchar_t>::value, "");
-
-    return implementation::utf8ToWide<WideString>(str, Int2Type<sizeof(wchar_t)>());
-}
-
-
-template <class CharString, class WideString> inline
-CharString wideToUtf8(const WideString& str)
-{
-    static_assert(IsSameType<typename GetCharType<CharString>::Type, char   >::value, "");
-    static_assert(IsSameType<typename GetCharType<WideString>::Type, wchar_t>::value, "");
-
-    return implementation::wideToUtf8<CharString>(str, Int2Type<sizeof(wchar_t)>());
-}
-
-//-------------------------------------------------------------------------------------------
-
-template <class TargetString, class SourceString> inline
-TargetString utfCvrtTo(const SourceString& str, char, wchar_t) { return utf8ToWide<TargetString>(str); }
-
-template <class TargetString, class SourceString> inline
-TargetString utfCvrtTo(const SourceString& str, wchar_t, char) { return wideToUtf8<TargetString>(str); }
-
-template <class TargetString, class SourceString> inline
-TargetString utfCvrtTo(const SourceString& str, char, char) { return copyStringTo<TargetString>(str); }
-
-template <class TargetString, class SourceString> inline
-TargetString utfCvrtTo(const SourceString& str, wchar_t, wchar_t) { return copyStringTo<TargetString>(str); }
-
-template <class TargetString, class SourceString> inline
-TargetString utfCvrtTo(const SourceString& str)
-{
-    return utfCvrtTo<TargetString>(str,
-                                   typename GetCharType<SourceString>::Type(),
-                                   typename GetCharType<TargetString>::Type());
-}
-}
-
-#endif //UTF_H_01832479146991573473545
+// *****************************************************************************
+// * This file is part of the FreeFileSync project. It is distributed under    *
+// * GNU General Public License: http://www.gnu.org/licenses/gpl-3.0           *
+// * Copyright (C) Zenju (zenju AT freefilesync DOT org) - All Rights Reserved *
+// *****************************************************************************
+
+#ifndef UTF_H_01832479146991573473545
+#define UTF_H_01832479146991573473545
+
+#include <cstdint>
+#include <iterator>
+#include "string_tools.h" //copyStringTo
+
+namespace zen
+{
+//convert all(!) char- and wchar_t-based "string-like" objects applying a UTF8 conversions (but only if necessary!)
+template <class TargetString, class SourceString>
+TargetString utfCvrtTo(const SourceString& str);
+
+const char BYTE_ORDER_MARK_UTF8[] = "\xEF\xBB\xBF";
+
+template <class CharString>
+bool isValidUtf8(const CharString& str); //check for UTF-8 encoding errors
+
+//---- explicit conversion: wide <-> utf8 ----
+template <class CharString, class WideString>
+CharString wideToUtf8(const WideString& str); //example: std::string tmp = wideToUtf8<std::string>(L"abc");
+
+template <class WideString, class CharString>
+WideString utf8ToWide(const CharString& str); //std::wstring tmp = utf8ToWide<std::wstring>("abc");
+
+//access unicode characters in UTF-encoded string (char- or wchar_t-based)
+template <class UtfString>
+size_t unicodeLength(const UtfString& str); //return number of code points for UTF-encoded string
+
+template <class UtfString>
+size_t findUnicodePos(const UtfString& str, size_t unicodePos); //return position of unicode char in UTF-encoded string
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+//----------------------- implementation ----------------------------------
+namespace implementation
+{
+using CodePoint = uint32_t;
+using Char16    = uint16_t;
+using Char8     = unsigned char;
+
+const CodePoint LEAD_SURROGATE      = 0xd800;
+const CodePoint TRAIL_SURROGATE     = 0xdc00; //== LEAD_SURROGATE_MAX + 1
+const CodePoint TRAIL_SURROGATE_MAX = 0xdfff;
+
+const CodePoint REPLACEMENT_CHAR    = 0xfffd;
+const CodePoint CODE_POINT_MAX      = 0x10ffff;
+
+
+template <class Function> inline
+void codePointToUtf16(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char16
+{
+    //http://en.wikipedia.org/wiki/UTF-16
+
+    if (cp < LEAD_SURROGATE)
+        writeOutput(static_cast<Char16>(cp));
+    else if (cp <= TRAIL_SURROGATE_MAX) //invalid code point
+        codePointToUtf16(REPLACEMENT_CHAR, writeOutput); //resolves to 1-character utf16
+    else if (cp < 0x10000)
+        writeOutput(static_cast<Char16>(cp));
+    else if (cp <= CODE_POINT_MAX)
+    {
+        cp -= 0x10000;
+        writeOutput(LEAD_SURROGATE  + static_cast<Char16>(cp >> 10));
+        writeOutput(TRAIL_SURROGATE + static_cast<Char16>(cp & 0x3ff));
+    }
+    else //invalid code point
+        codePointToUtf16(REPLACEMENT_CHAR, writeOutput); //resolves to 1-character utf16
+}
+
+
+inline
+size_t getUtf16Len(Char16 ch) //ch must be first code unit! returns 0 on error!
+{
+    if (ch < LEAD_SURROGATE)
+        return 1;
+    else if (ch < TRAIL_SURROGATE)
+        return 2;
+    else if (ch <= TRAIL_SURROGATE_MAX)
+        return 0; //unexpected trail surrogate!
+    else
+        return 1;
+}
+
+
+template <class CharIterator, class Function> inline
+void utf16ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint
+{
+    static_assert(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 2, "");
+
+    for ( ; first != last; ++first)
+    {
+        CodePoint cp = static_cast<Char16>(*first);
+        switch (getUtf16Len(static_cast<Char16>(cp)))
+        {
+            case 0: //invalid utf16 character
+                cp = REPLACEMENT_CHAR;
+                break;
+            case 1:
+                break;
+            case 2:
+                if (++first != last) //trail surrogate expected!
+                {
+                    const Char16 ch = static_cast<Char16>(*first);
+                    if (TRAIL_SURROGATE <= ch && ch <= TRAIL_SURROGATE_MAX) //trail surrogate expected!
+                    {
+                        cp = ((cp - LEAD_SURROGATE) << 10) + (ch - TRAIL_SURROGATE) + 0x10000;
+                        break;
+                    }
+                }
+                --first;
+                cp = REPLACEMENT_CHAR;
+                break;
+        }
+        writeOutput(cp);
+    }
+}
+
+
+template <class Function> inline
+void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char8
+{
+    //http://en.wikipedia.org/wiki/UTF-8
+    //assert(cp < LEAD_SURROGATE || TRAIL_SURROGATE_MAX < cp); //code points [0xd800, 0xdfff] are reserved for UTF-16 and *should* not be encoded in UTF-8
+
+    if (cp < 0x80)
+        writeOutput(static_cast<Char8>(cp));
+    else if (cp < 0x800)
+    {
+        writeOutput(static_cast<Char8>((cp >> 6  ) | 0xc0));
+        writeOutput(static_cast<Char8>((cp & 0x3f) | 0x80));
+    }
+    else if (cp < 0x10000)
+    {
+        writeOutput(static_cast<Char8>( (cp >> 12       ) | 0xe0));
+        writeOutput(static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80));
+        writeOutput(static_cast<Char8>( (cp & 0x3f      ) | 0x80));
+    }
+    else if (cp <= CODE_POINT_MAX)
+    {
+        writeOutput(static_cast<Char8>( (cp >> 18        ) | 0xf0));
+        writeOutput(static_cast<Char8>(((cp >> 12) & 0x3f) | 0x80));
+        writeOutput(static_cast<Char8>(((cp >> 6)  & 0x3f) | 0x80));
+        writeOutput(static_cast<Char8>( (cp & 0x3f       ) | 0x80));
+    }
+    else //invalid code point
+        codePointToUtf8(REPLACEMENT_CHAR, writeOutput); //resolves to 3-byte utf8
+}
+
+
+inline
+size_t getUtf8Len(unsigned char ch) //ch must be first code unit! returns 0 on error!
+{
+    if (ch < 0x80)
+        return 1;
+    if (ch >> 5 == 0x6)
+        return 2;
+    if (ch >> 4 == 0xe)
+        return 3;
+    if (ch >> 3 == 0x1e)
+        return 4;
+    return 0; //innvalid begin of UTF8 encoding
+}
+
+
+template <class CharIterator> inline
+bool decodeTrail(CharIterator& first, CharIterator last, CodePoint& cp) //decode trailing surrogate byte
+{
+    if (++first != last) //trail surrogate expected!
+    {
+        const Char8 ch = static_cast<Char8>(*first);
+        if (ch >> 6 == 0x2) //trail surrogate expected!
+        {
+            cp = (cp << 6) + (ch & 0x3f);
+            return true;
+        }
+    }
+    --first;
+    cp = REPLACEMENT_CHAR;
+    return false;
+}
+
+template <class CharIterator, class Function> inline
+void utf8ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint
+{
+    static_assert(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 1, "");
+
+    for ( ; first != last; ++first)
+    {
+        CodePoint cp = static_cast<Char8>(*first);
+        switch (getUtf8Len(static_cast<Char8>(cp)))
+        {
+            case 0: //invalid utf8 character
+                cp = REPLACEMENT_CHAR;
+                break;
+            case 1:
+                break;
+            case 2:
+                cp &= 0x1f;
+                decodeTrail(first, last, cp);
+                break;
+            case 3:
+                cp &= 0xf;
+                if (decodeTrail(first, last, cp))
+                    decodeTrail(first, last, cp);
+                break;
+            case 4:
+                cp &= 0x7;
+                if (decodeTrail(first, last, cp))
+                    if (decodeTrail(first, last, cp))
+                        decodeTrail(first, last, cp);
+                if (cp > CODE_POINT_MAX) cp = REPLACEMENT_CHAR;
+                break;
+        }
+        writeOutput(cp);
+    }
+}
+
+
+template <class CharString> inline
+size_t unicodeLength(const CharString& str, char) //utf8
+{
+    using CharType = typename GetCharType<CharString>::Type;
+
+    const CharType*       strFirst  = strBegin(str);
+    const CharType* const strLast   = strFirst + strLength(str);
+
+    size_t len = 0;
+    while (strFirst < strLast) //[!]
+    {
+        ++len;
+        size_t utf8len = getUtf8Len(*strFirst);
+        if (utf8len == 0) ++utf8len; //invalid utf8 character
+        strFirst += utf8len;
+    }
+    return len;
+}
+
+
+template <class WideString> inline
+size_t unicodeLengthWide(const WideString& str, Int2Type<2>) //windows: utf16-wchar_t
+{
+    using CharType = typename GetCharType<WideString>::Type;
+
+    const CharType*       strFirst = strBegin(str);
+    const CharType* const strLast  = strFirst + strLength(str);
+
+    size_t len = 0;
+    while (strFirst < strLast) //[!]
+    {
+        ++len;
+        size_t utf16len = getUtf16Len(*strFirst);
+        if (utf16len == 0) ++utf16len; //invalid utf16 character
+        strFirst += utf16len;
+    }
+    return len;
+}
+
+
+template <class WideString> inline
+size_t unicodeLengthWide(const WideString& str, Int2Type<4>) //other OS: utf32-wchar_t
+{
+    return strLength(str);
+}
+
+
+template <class WideString> inline
+size_t unicodeLength(const WideString& str, wchar_t)
+{
+    return unicodeLengthWide(str, Int2Type<sizeof(wchar_t)>());
+}
+}
+
+
+template <class UtfString> inline
+size_t unicodeLength(const UtfString& str) //return number of code points
+{
+    return implementation::unicodeLength(str, typename GetCharType<UtfString>::Type());
+}
+
+
+namespace implementation
+{
+template <class CharString> inline
+size_t findUnicodePos(const CharString& str, size_t unicodePos, char) //utf8-char
+{
+    using CharType = typename GetCharType<CharString>::Type;
+
+    const CharType* strFirst = strBegin(str);
+    const size_t strLen = strLength(str);
+
+    size_t utfPos = 0;
+    while (unicodePos-- > 0)
+    {
+        if (utfPos >= strLen)
+            return strLen;
+
+        size_t utf8len = getUtf8Len(strFirst[utfPos]);
+        if (utf8len == 0) ++utf8len; //invalid utf8 character
+        utfPos += utf8len;
+    }
+    if (utfPos >= strLen)
+        return strLen;
+    return utfPos;
+}
+
+
+template <class WideString> inline
+size_t findUnicodePosWide(const WideString& str, size_t unicodePos, Int2Type<2>) //windows: utf16-wchar_t
+{
+    using CharType = typename GetCharType<WideString>::Type;
+
+    const CharType* strFirst = strBegin(str);
+    const size_t strLen = strLength(str);
+
+    size_t utfPos = 0;
+    while (unicodePos-- > 0)
+    {
+        if (utfPos >= strLen)
+            return strLen;
+
+        size_t utf16len = getUtf16Len(strFirst[utfPos]);
+        if (utf16len == 0) ++utf16len; //invalid utf16 character
+        utfPos += utf16len;
+    }
+    if (utfPos >= strLen)
+        return strLen;
+    return utfPos;
+}
+
+
+template <class WideString> inline
+size_t findUnicodePosWide(const WideString& str, size_t unicodePos, Int2Type<4>) //other OS: utf32-wchar_t
+{
+    return std::min(strLength(str), unicodePos);
+}
+
+
+template <class UtfString> inline
+size_t findUnicodePos(const UtfString& str, size_t unicodePos, wchar_t)
+{
+    return findUnicodePosWide(str, unicodePos, Int2Type<sizeof(wchar_t)>());
+}
+}
+
+
+template <class UtfString> inline
+size_t findUnicodePos(const UtfString& str, size_t unicodePos) //return position of unicode char in UTF-encoded string
+{
+    return implementation::findUnicodePos(str, unicodePos, typename GetCharType<UtfString>::Type());
+}
+
+//-------------------------------------------------------------------------------------------
+
+namespace implementation
+{
+template <class WideString, class CharString> inline
+WideString utf8ToWide(const CharString& str, Int2Type<2>) //windows: convert utf8 to utf16-wchar_t
+{
+    WideString output;
+    utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
+    [&](CodePoint cp) { codePointToUtf16(cp, [&](Char16 c) { output += static_cast<wchar_t>(c); }); });
+    return output;
+}
+
+
+template <class WideString, class CharString> inline
+WideString utf8ToWide(const CharString& str, Int2Type<4>) //other OS: convert utf8 to utf32-wchar_t
+{
+    WideString output;
+    utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
+    [&](CodePoint cp) { output += static_cast<wchar_t>(cp); });
+    return output;
+}
+
+
+template <class CharString, class WideString> inline
+CharString wideToUtf8(const WideString& str, Int2Type<2>) //windows: convert utf16-wchar_t to utf8
+{
+    CharString output;
+    utf16ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
+    [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); });
+    return output;
+}
+
+
+template <class CharString, class WideString> inline
+CharString wideToUtf8(const WideString& str, Int2Type<4>) //other OS: convert utf32-wchar_t to utf8
+{
+    CharString output;
+    std::for_each(strBegin(str), strBegin(str) + strLength(str),
+    [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); });
+    return output;
+}
+}
+
+
+template <class CharString> inline
+bool isValidUtf8(const CharString& str)
+{
+    using namespace implementation;
+    bool valid = true;
+    utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
+                    [&](CodePoint cp)
+    {
+        if (cp == REPLACEMENT_CHAR)
+            valid = false; //perf: should we use an (expensive) exception for iteration break?
+    });
+    return valid;
+}
+
+
+template <class WideString, class CharString> inline
+WideString utf8ToWide(const CharString& str)
+{
+    static_assert(IsSameType<typename GetCharType<CharString>::Type, char   >::value, "");
+    static_assert(IsSameType<typename GetCharType<WideString>::Type, wchar_t>::value, "");
+
+    return implementation::utf8ToWide<WideString>(str, Int2Type<sizeof(wchar_t)>());
+}
+
+
+template <class CharString, class WideString> inline
+CharString wideToUtf8(const WideString& str)
+{
+    static_assert(IsSameType<typename GetCharType<CharString>::Type, char   >::value, "");
+    static_assert(IsSameType<typename GetCharType<WideString>::Type, wchar_t>::value, "");
+
+    return implementation::wideToUtf8<CharString>(str, Int2Type<sizeof(wchar_t)>());
+}
+
+//-------------------------------------------------------------------------------------------
+
+template <class TargetString, class SourceString> inline
+TargetString utfCvrtTo(const SourceString& str, char, wchar_t) { return utf8ToWide<TargetString>(str); }
+
+template <class TargetString, class SourceString> inline
+TargetString utfCvrtTo(const SourceString& str, wchar_t, char) { return wideToUtf8<TargetString>(str); }
+
+template <class TargetString, class SourceString> inline
+TargetString utfCvrtTo(const SourceString& str, char, char) { return copyStringTo<TargetString>(str); }
+
+template <class TargetString, class SourceString> inline
+TargetString utfCvrtTo(const SourceString& str, wchar_t, wchar_t) { return copyStringTo<TargetString>(str); }
+
+template <class TargetString, class SourceString> inline
+TargetString utfCvrtTo(const SourceString& str)
+{
+    return utfCvrtTo<TargetString>(str,
+                                   typename GetCharType<SourceString>::Type(),
+                                   typename GetCharType<TargetString>::Type());
+}
+}
+
+#endif //UTF_H_01832479146991573473545
author	Daniel Wilhelm <shieldwed@outlook.com>	2017-02-13 21:25:04 -0700
committer	Daniel Wilhelm <shieldwed@outlook.com>	2017-02-13 21:25:04 -0700
commit	9d071d2a2cec9a7662a02669488569a017f0ea35 (patch)
tree	c83a623fbdff098339b66d21ea2e81f3f67344ae /zen/utf.h
parent	8.8 (diff)
download	FreeFileSync-9d071d2a2cec9a7662a02669488569a017f0ea35.tar.gz FreeFileSync-9d071d2a2cec9a7662a02669488569a017f0ea35.tar.bz2 FreeFileSync-9d071d2a2cec9a7662a02669488569a017f0ea35.zip