diff options
Diffstat (limited to 'zen/utf.h')
-rw-r--r-- | zen/utf.h | 199 |
1 files changed, 99 insertions, 100 deletions
@@ -50,18 +50,6 @@ size_t findUnicodePos(const UtfString& str, size_t unicodePos); //return positio - - - - - - - - - - - - //----------------------- implementation ---------------------------------- namespace implementation { @@ -69,45 +57,47 @@ typedef std::uint_fast32_t CodePoint; //must be at least four bytes typedef std::uint_fast16_t Char16; //we need an unsigned type typedef unsigned char Char8; -const CodePoint CODE_POINT_MAX = 0x10ffff; +const CodePoint LEAD_SURROGATE = 0xd800; +const CodePoint TRAIL_SURROGATE = 0xdc00; //== LEAD_SURROGATE_MAX + 1 +const CodePoint TRAIL_SURROGATE_MAX = 0xdfff; -const CodePoint HIGH_SURROGATE = 0xd800; -const CodePoint HIGH_SURROGATE_MAX = 0xdbff; - -const CodePoint LOW_SURROGATE = 0xdc00; -const CodePoint LOW_SURROGATE_MAX = 0xdfff; +const CodePoint REPLACEMENT_CHAR = 0xfffd; +const CodePoint CODE_POINT_MAX = 0x10ffff; template <class Function> inline void codePointToUtf16(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char16 { //http://en.wikipedia.org/wiki/UTF-16 - assert(cp < HIGH_SURROGATE || LOW_SURROGATE_MAX < cp); //code points [0xd800, 0xdfff] are not allowed for UTF-16 - assert(cp <= CODE_POINT_MAX); - if (cp < 0x10000) + if (cp < LEAD_SURROGATE) writeOutput(static_cast<Char16>(cp)); - else + else if (cp <= TRAIL_SURROGATE_MAX) //invalid code point + codePointToUtf16(REPLACEMENT_CHAR, writeOutput); //resolves to 1-character utf16 + else if (cp < 0x10000) + writeOutput(static_cast<Char16>(cp)); + else if (cp <= CODE_POINT_MAX) { cp -= 0x10000; - writeOutput(static_cast<Char16>((cp >> 10) + HIGH_SURROGATE)); - writeOutput(static_cast<Char16>((cp & 0x3ff) + LOW_SURROGATE)); + writeOutput(LEAD_SURROGATE + static_cast<Char16>(cp >> 10)); + writeOutput(TRAIL_SURROGATE + static_cast<Char16>(cp & 0x3ff)); } + else //invalid code point + codePointToUtf16(REPLACEMENT_CHAR, writeOutput); //resolves to 1-character utf16 } inline -size_t getUtf16Len(Char16 ch) //ch must be first code unit! 
+size_t getUtf16Len(Char16 ch) //ch must be first code unit! returns 0 on error! { - const CodePoint cp = ch; - - if (HIGH_SURROGATE <= cp && cp <= HIGH_SURROGATE_MAX) + if (ch < LEAD_SURROGATE) + return 1; + else if (ch < TRAIL_SURROGATE) return 2; + else if (ch <= TRAIL_SURROGATE_MAX) + return 0; //unexpected trail surrogate! else - { - assert(cp < LOW_SURROGATE || LOW_SURROGATE_MAX < cp); //NO low surrogate expected return 1; - } } @@ -119,19 +109,27 @@ void utf16ToCodePoint(CharIterator first, CharIterator last, Function writeOutpu for ( ; first != last; ++first) { CodePoint cp = static_cast<Char16>(*first); - if (HIGH_SURROGATE <= cp && cp <= HIGH_SURROGATE_MAX) + switch (getUtf16Len(static_cast<Char16>(cp))) { - if (++first == last) - { - assert(false); //low surrogate expected - return; - } - assert(LOW_SURROGATE <= static_cast<Char16>(*first) && static_cast<Char16>(*first) <= LOW_SURROGATE_MAX); //low surrogate expected - cp = ((cp - HIGH_SURROGATE) << 10) + static_cast<Char16>(*first) - LOW_SURROGATE + 0x10000; + case 0: //invalid utf16 character + cp = REPLACEMENT_CHAR; + break; + case 1: + break; + case 2: + if (++first != last) //trail surrogate expected! + { + const Char16 ch = static_cast<Char16>(*first); + if (TRAIL_SURROGATE <= ch && ch <= TRAIL_SURROGATE_MAX) //trail surrogate expected! 
+ { + cp = ((cp - LEAD_SURROGATE) << 10) + (ch - TRAIL_SURROGATE) + 0x10000; + break; + } + } + --first; + cp = REPLACEMENT_CHAR; + break; } - else - assert(cp < LOW_SURROGATE || LOW_SURROGATE_MAX < cp); //NO low surrogate expected - writeOutput(cp); } } @@ -141,6 +139,7 @@ template <class Function> inline void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char8 { //http://en.wikipedia.org/wiki/UTF-8 + //assert(cp < LEAD_SURROGATE || TRAIL_SURROGATE_MAX < cp); //code points [0xd800, 0xdfff] are reserved for UTF-16 and *should* not be encoded in UTF-8 if (cp < 0x80) writeOutput(static_cast<Char8>(cp)); @@ -151,23 +150,24 @@ void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a un } else if (cp < 0x10000) { - writeOutput(static_cast<Char8>((cp >> 12 ) | 0xe0)); + writeOutput(static_cast<Char8>( (cp >> 12 ) | 0xe0)); writeOutput(static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80)); - writeOutput(static_cast<Char8>((cp & 0x3f ) | 0x80)); + writeOutput(static_cast<Char8>( (cp & 0x3f ) | 0x80)); } - else + else if (cp <= CODE_POINT_MAX) { - assert(cp <= CODE_POINT_MAX); - writeOutput(static_cast<Char8>((cp >> 18 ) | 0xf0)); + writeOutput(static_cast<Char8>( (cp >> 18 ) | 0xf0)); writeOutput(static_cast<Char8>(((cp >> 12) & 0x3f) | 0x80)); writeOutput(static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80)); - writeOutput(static_cast<Char8>((cp & 0x3f ) | 0x80)); + writeOutput(static_cast<Char8>( (cp & 0x3f ) | 0x80)); } + else //invalid code point + codePointToUtf8(REPLACEMENT_CHAR, writeOutput); //resolves to 3-byte utf8 } inline -size_t getUtf8Len(unsigned char ch) //ch must be first code unit! +size_t getUtf8Len(unsigned char ch) //ch must be first code unit! returns 0 on error! { if (ch < 0x80) return 1; @@ -177,12 +177,27 @@ size_t getUtf8Len(unsigned char ch) //ch must be first code unit! 
return 3; if (ch >> 3 == 0x1e) return 4; - - assert(false); //no valid begin of UTF8 encoding - return 1; + return 0; //invalid begin of UTF8 encoding } +template <class CharIterator> inline +bool decodeTrail(CharIterator& first, CharIterator last, CodePoint& cp) //decode trailing surrogate byte +{ + if (++first != last) //trail surrogate expected! + { + const Char8 ch = static_cast<Char8>(*first); + if (ch >> 6 == 0x2) //trail surrogate expected! + { + cp = (cp << 6) + (ch & 0x3f); + return true; + } + } + --first; + cp = REPLACEMENT_CHAR; + return false; +} + template <class CharIterator, class Function> inline void utf8ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint { @@ -190,57 +205,32 @@ void utf8ToCodePoint(CharIterator first, CharIterator last, Function writeOutput for ( ; first != last; ++first) { - auto getChar = [&](Char8& ch) -> bool - { - if (++first == last) - { - assert(false); //low surrogate expected - return false; - } - ch = static_cast<Char8>(*first); - assert(ch >> 6 == 0x2); - return true; - }; - - Char8 ch = static_cast<Char8>(*first); - switch (getUtf8Len(ch)) + CodePoint cp = static_cast<Char8>(*first); + switch (getUtf8Len(static_cast<Char8>(cp))) { + case 0: //invalid utf8 character + cp = REPLACEMENT_CHAR; + break; case 1: - writeOutput(ch); break; case 2: - { - CodePoint cp = (ch & 0x1f) << 6; - if (!getChar(ch)) return; - cp += ch & 0x3f; - writeOutput(cp); - } - break; + cp &= 0x1f; + decodeTrail(first, last, cp); + break; case 3: - { - CodePoint cp = (ch & 0xf) << 12; - if (!getChar(ch)) return; - cp += (ch & 0x3f) << 6; - if (!getChar(ch)) return; - cp += ch & 0x3f; - writeOutput(cp); - } - break; + cp &= 0xf; + if (decodeTrail(first, last, cp)) + decodeTrail(first, last, cp); + break; case 4: - { - CodePoint cp = (ch & 0x7) << 18; - if (!getChar(ch)) return; - cp += (ch & 0x3f) << 12; - if (!getChar(ch)) return; - cp += (ch & 0x3f) << 6; - if 
(!getChar(ch)) return; - cp += ch & 0x3f; - writeOutput(cp); - } - break; - default: - assert(false); + cp &= 0x7; + if (decodeTrail(first, last, cp)) + if (decodeTrail(first, last, cp)) + decodeTrail(first, last, cp); + if (cp > CODE_POINT_MAX) cp = REPLACEMENT_CHAR; + break; } + writeOutput(cp); } } @@ -257,7 +247,10 @@ size_t unicodeLength(const CharString& str, char) //utf8 while (strFirst < strLast) //[!] { ++len; - strFirst += getUtf8Len(*strFirst); //[!] + + size_t utf8len = getUtf8Len(*strFirst); + if (utf8len == 0) ++utf8len; //invalid utf8 character + strFirst += utf8len; } return len; } @@ -275,7 +268,9 @@ size_t unicodeLengthWide(const WideString& str, Int2Type<2>) //windows: utf16-wc while (strFirst < strLast) //[!] { ++len; - strFirst += getUtf16Len(*strFirst); //[!] + size_t utf16len = getUtf16Len(*strFirst); + if (utf16len == 0) ++utf16len; //invalid utf16 character + strFirst += utf16len; } return len; } @@ -316,7 +311,9 @@ size_t findUnicodePos(const CharString& str, size_t unicodePos, char) //utf8-cha size_t utfPos = 0; while (unicodePos-- > 0) { - utfPos += getUtf8Len(strFirst[utfPos]); + size_t utf8len = getUtf8Len(strFirst[utfPos]); + if (utf8len == 0) ++utf8len; //invalid utf8 character + utfPos += utf8len; if (utfPos >= strLen) return strLen; @@ -336,7 +333,9 @@ size_t findUnicodePosWide(const WideString& str, size_t unicodePos, Int2Type<2>) size_t utfPos = 0; while (unicodePos-- > 0) { - utfPos += getUtf16Len(strFirst[utfPos]); + size_t utf16len = getUtf16Len(strFirst[utfPos]); + if (utf16len == 0) ++utf16len; //invalid utf16 character + utfPos += utf16len; if (utfPos >= strLen) return strLen; |