summary | refs | log | tree | commit | diff
path: root/zen/utf.h
diff options
context:
space:
mode:
author: Daniel Wilhelm <daniel@wili.li> 2014-04-18 17:24:35 +0200
committer: Daniel Wilhelm <daniel@wili.li> 2014-04-18 17:24:35 +0200
commit: 460091fb0b2ff114cc741372f15bb43b702ea3b1 (patch)
tree: 0562c2eda4c66969c6e6d0910080db9f5b0def3e /zen/utf.h
parent: 5.15 (diff)
download: FreeFileSync-460091fb0b2ff114cc741372f15bb43b702ea3b1.tar.gz
FreeFileSync-460091fb0b2ff114cc741372f15bb43b702ea3b1.tar.bz2
FreeFileSync-460091fb0b2ff114cc741372f15bb43b702ea3b1.zip
5.16
Diffstat (limited to 'zen/utf.h')
-rw-r--r-- zen/utf.h 199
1 files changed, 99 insertions, 100 deletions
diff --git a/zen/utf.h b/zen/utf.h
index 27804a21..8da588cd 100644
--- a/zen/utf.h
+++ b/zen/utf.h
@@ -50,18 +50,6 @@ size_t findUnicodePos(const UtfString& str, size_t unicodePos); //return positio
-
-
-
-
-
-
-
-
-
-
-
-
//----------------------- implementation ----------------------------------
namespace implementation
{
@@ -69,45 +57,47 @@ typedef std::uint_fast32_t CodePoint; //must be at least four bytes
typedef std::uint_fast16_t Char16; //we need an unsigned type
typedef unsigned char Char8;
-const CodePoint CODE_POINT_MAX = 0x10ffff;
+const CodePoint LEAD_SURROGATE = 0xd800;
+const CodePoint TRAIL_SURROGATE = 0xdc00; //== LEAD_SURROGATE_MAX + 1
+const CodePoint TRAIL_SURROGATE_MAX = 0xdfff;
-const CodePoint HIGH_SURROGATE = 0xd800;
-const CodePoint HIGH_SURROGATE_MAX = 0xdbff;
-
-const CodePoint LOW_SURROGATE = 0xdc00;
-const CodePoint LOW_SURROGATE_MAX = 0xdfff;
+const CodePoint REPLACEMENT_CHAR = 0xfffd;
+const CodePoint CODE_POINT_MAX = 0x10ffff;
template <class Function> inline
void codePointToUtf16(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char16
{
//http://en.wikipedia.org/wiki/UTF-16
- assert(cp < HIGH_SURROGATE || LOW_SURROGATE_MAX < cp); //code points [0xd800, 0xdfff] are not allowed for UTF-16
- assert(cp <= CODE_POINT_MAX);
- if (cp < 0x10000)
+ if (cp < LEAD_SURROGATE)
writeOutput(static_cast<Char16>(cp));
- else
+ else if (cp <= TRAIL_SURROGATE_MAX) //invalid code point
+ codePointToUtf16(REPLACEMENT_CHAR, writeOutput); //resolves to 1-character utf16
+ else if (cp < 0x10000)
+ writeOutput(static_cast<Char16>(cp));
+ else if (cp <= CODE_POINT_MAX)
{
cp -= 0x10000;
- writeOutput(static_cast<Char16>((cp >> 10) + HIGH_SURROGATE));
- writeOutput(static_cast<Char16>((cp & 0x3ff) + LOW_SURROGATE));
+ writeOutput(LEAD_SURROGATE + static_cast<Char16>(cp >> 10));
+ writeOutput(TRAIL_SURROGATE + static_cast<Char16>(cp & 0x3ff));
}
+ else //invalid code point
+ codePointToUtf16(REPLACEMENT_CHAR, writeOutput); //resolves to 1-character utf16
}
inline
-size_t getUtf16Len(Char16 ch) //ch must be first code unit!
+size_t getUtf16Len(Char16 ch) //ch must be first code unit! returns 0 on error!
{
- const CodePoint cp = ch;
-
- if (HIGH_SURROGATE <= cp && cp <= HIGH_SURROGATE_MAX)
+ if (ch < LEAD_SURROGATE)
+ return 1;
+ else if (ch < TRAIL_SURROGATE)
return 2;
+ else if (ch <= TRAIL_SURROGATE_MAX)
+ return 0; //unexpected trail surrogate!
else
- {
- assert(cp < LOW_SURROGATE || LOW_SURROGATE_MAX < cp); //NO low surrogate expected
return 1;
- }
}
@@ -119,19 +109,27 @@ void utf16ToCodePoint(CharIterator first, CharIterator last, Function writeOutpu
for ( ; first != last; ++first)
{
CodePoint cp = static_cast<Char16>(*first);
- if (HIGH_SURROGATE <= cp && cp <= HIGH_SURROGATE_MAX)
+ switch (getUtf16Len(static_cast<Char16>(cp)))
{
- if (++first == last)
- {
- assert(false); //low surrogate expected
- return;
- }
- assert(LOW_SURROGATE <= static_cast<Char16>(*first) && static_cast<Char16>(*first) <= LOW_SURROGATE_MAX); //low surrogate expected
- cp = ((cp - HIGH_SURROGATE) << 10) + static_cast<Char16>(*first) - LOW_SURROGATE + 0x10000;
+ case 0: //invalid utf16 character
+ cp = REPLACEMENT_CHAR;
+ break;
+ case 1:
+ break;
+ case 2:
+ if (++first != last) //trail surrogate expected!
+ {
+ const Char16 ch = static_cast<Char16>(*first);
+ if (TRAIL_SURROGATE <= ch && ch <= TRAIL_SURROGATE_MAX) //trail surrogate expected!
+ {
+ cp = ((cp - LEAD_SURROGATE) << 10) + (ch - TRAIL_SURROGATE) + 0x10000;
+ break;
+ }
+ }
+ --first;
+ cp = REPLACEMENT_CHAR;
+ break;
}
- else
- assert(cp < LOW_SURROGATE || LOW_SURROGATE_MAX < cp); //NO low surrogate expected
-
writeOutput(cp);
}
}
@@ -141,6 +139,7 @@ template <class Function> inline
void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char8
{
//http://en.wikipedia.org/wiki/UTF-8
+ //assert(cp < LEAD_SURROGATE || TRAIL_SURROGATE_MAX < cp); //code points [0xd800, 0xdfff] are reserved for UTF-16 and *should* not be encoded in UTF-8
if (cp < 0x80)
writeOutput(static_cast<Char8>(cp));
@@ -151,23 +150,24 @@ void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a un
}
else if (cp < 0x10000)
{
- writeOutput(static_cast<Char8>((cp >> 12 ) | 0xe0));
+ writeOutput(static_cast<Char8>( (cp >> 12 ) | 0xe0));
writeOutput(static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80));
- writeOutput(static_cast<Char8>((cp & 0x3f ) | 0x80));
+ writeOutput(static_cast<Char8>( (cp & 0x3f ) | 0x80));
}
- else
+ else if (cp <= CODE_POINT_MAX)
{
- assert(cp <= CODE_POINT_MAX);
- writeOutput(static_cast<Char8>((cp >> 18 ) | 0xf0));
+ writeOutput(static_cast<Char8>( (cp >> 18 ) | 0xf0));
writeOutput(static_cast<Char8>(((cp >> 12) & 0x3f) | 0x80));
writeOutput(static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80));
- writeOutput(static_cast<Char8>((cp & 0x3f ) | 0x80));
+ writeOutput(static_cast<Char8>( (cp & 0x3f ) | 0x80));
}
+ else //invalid code point
+ codePointToUtf8(REPLACEMENT_CHAR, writeOutput); //resolves to 3-byte utf8
}
inline
-size_t getUtf8Len(unsigned char ch) //ch must be first code unit!
+size_t getUtf8Len(unsigned char ch) //ch must be first code unit! returns 0 on error!
{
if (ch < 0x80)
return 1;
@@ -177,12 +177,27 @@ size_t getUtf8Len(unsigned char ch) //ch must be first code unit!
return 3;
if (ch >> 3 == 0x1e)
return 4;
-
- assert(false); //no valid begin of UTF8 encoding
- return 1;
+ return 0; //invalid begin of UTF8 encoding
}
+template <class CharIterator> inline
+bool decodeTrail(CharIterator& first, CharIterator last, CodePoint& cp) //decode one UTF-8 continuation (trail) byte
+{
+ if (++first != last) //continuation byte expected!
+ {
+ const Char8 ch = static_cast<Char8>(*first);
+ if (ch >> 6 == 0x2) //continuation byte (10xxxxxx) expected!
+ {
+ cp = (cp << 6) + (ch & 0x3f);
+ return true;
+ }
+ }
+ --first;
+ cp = REPLACEMENT_CHAR;
+ return false;
+}
+
template <class CharIterator, class Function> inline
void utf8ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint
{
@@ -190,57 +205,32 @@ void utf8ToCodePoint(CharIterator first, CharIterator last, Function writeOutput
for ( ; first != last; ++first)
{
- auto getChar = [&](Char8& ch) -> bool
- {
- if (++first == last)
- {
- assert(false); //low surrogate expected
- return false;
- }
- ch = static_cast<Char8>(*first);
- assert(ch >> 6 == 0x2);
- return true;
- };
-
- Char8 ch = static_cast<Char8>(*first);
- switch (getUtf8Len(ch))
+ CodePoint cp = static_cast<Char8>(*first);
+ switch (getUtf8Len(static_cast<Char8>(cp)))
{
+ case 0: //invalid utf8 character
+ cp = REPLACEMENT_CHAR;
+ break;
case 1:
- writeOutput(ch);
break;
case 2:
- {
- CodePoint cp = (ch & 0x1f) << 6;
- if (!getChar(ch)) return;
- cp += ch & 0x3f;
- writeOutput(cp);
- }
- break;
+ cp &= 0x1f;
+ decodeTrail(first, last, cp);
+ break;
case 3:
- {
- CodePoint cp = (ch & 0xf) << 12;
- if (!getChar(ch)) return;
- cp += (ch & 0x3f) << 6;
- if (!getChar(ch)) return;
- cp += ch & 0x3f;
- writeOutput(cp);
- }
- break;
+ cp &= 0xf;
+ if (decodeTrail(first, last, cp))
+ decodeTrail(first, last, cp);
+ break;
case 4:
- {
- CodePoint cp = (ch & 0x7) << 18;
- if (!getChar(ch)) return;
- cp += (ch & 0x3f) << 12;
- if (!getChar(ch)) return;
- cp += (ch & 0x3f) << 6;
- if (!getChar(ch)) return;
- cp += ch & 0x3f;
- writeOutput(cp);
- }
- break;
- default:
- assert(false);
+ cp &= 0x7;
+ if (decodeTrail(first, last, cp))
+ if (decodeTrail(first, last, cp))
+ decodeTrail(first, last, cp);
+ if (cp > CODE_POINT_MAX) cp = REPLACEMENT_CHAR;
+ break;
}
+ writeOutput(cp);
}
}
@@ -257,7 +247,10 @@ size_t unicodeLength(const CharString& str, char) //utf8
while (strFirst < strLast) //[!]
{
++len;
- strFirst += getUtf8Len(*strFirst); //[!]
+
+ size_t utf8len = getUtf8Len(*strFirst);
+ if (utf8len == 0) ++utf8len; //invalid utf8 character
+ strFirst += utf8len;
}
return len;
}
@@ -275,7 +268,9 @@ size_t unicodeLengthWide(const WideString& str, Int2Type<2>) //windows: utf16-wc
while (strFirst < strLast) //[!]
{
++len;
- strFirst += getUtf16Len(*strFirst); //[!]
+ size_t utf16len = getUtf16Len(*strFirst);
+ if (utf16len == 0) ++utf16len; //invalid utf16 character
+ strFirst += utf16len;
}
return len;
}
@@ -316,7 +311,9 @@ size_t findUnicodePos(const CharString& str, size_t unicodePos, char) //utf8-cha
size_t utfPos = 0;
while (unicodePos-- > 0)
{
- utfPos += getUtf8Len(strFirst[utfPos]);
+ size_t utf8len = getUtf8Len(strFirst[utfPos]);
+ if (utf8len == 0) ++utf8len; //invalid utf8 character
+ utfPos += utf8len;
if (utfPos >= strLen)
return strLen;
@@ -336,7 +333,9 @@ size_t findUnicodePosWide(const WideString& str, size_t unicodePos, Int2Type<2>)
size_t utfPos = 0;
while (unicodePos-- > 0)
{
- utfPos += getUtf16Len(strFirst[utfPos]);
+ size_t utf16len = getUtf16Len(strFirst[utfPos]);
+ if (utf16len == 0) ++utf16len; //invalid utf16 character
+ utfPos += utf16len;
if (utfPos >= strLen)
return strLen;
bgstack15