1 files changed, 99 insertions, 100 deletions
diff --git a/zen/utf.h b/zen/utf.h
index 27804a21..8da588cd 100644
--- a/zen/utf.h
+++ b/zen/utf.h
@@ -50,18 +50,6 @@ size_t findUnicodePos(const UtfString& str, size_t unicodePos); //return positio
 
 
 
-
-
-
-
-
-
-
-
-
-
-
-
 //----------------------- implementation ----------------------------------
 namespace implementation
 {
@@ -69,45 +57,47 @@ typedef std::uint_fast32_t CodePoint; //must be at least four bytes
 typedef std::uint_fast16_t Char16;    //we need an unsigned type
 typedef unsigned char Char8;
 
-const CodePoint CODE_POINT_MAX     = 0x10ffff;
+const CodePoint LEAD_SURROGATE      = 0xd800; //first lead (high) surrogate; start of the range [0xd800, 0xdfff] that is rejected as code point
+const CodePoint TRAIL_SURROGATE     = 0xdc00; //first trail (low) surrogate; lead surrogates end at 0xdbff == TRAIL_SURROGATE - 1
+const CodePoint TRAIL_SURROGATE_MAX = 0xdfff; //last trail (low) surrogate
 
-const CodePoint HIGH_SURROGATE     = 0xd800;
-const CodePoint HIGH_SURROGATE_MAX = 0xdbff;
-
-const CodePoint LOW_SURROGATE      = 0xdc00;
-const CodePoint LOW_SURROGATE_MAX  = 0xdfff;
+const CodePoint REPLACEMENT_CHAR    = 0xfffd;   //substituted whenever invalid input is encountered during encoding/decoding
+const CodePoint CODE_POINT_MAX      = 0x10ffff; //largest code point accepted by the encoders below
 
 
 template <class Function> inline
 void codePointToUtf16(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char16
 {
     //http://en.wikipedia.org/wiki/UTF-16
-    assert(cp < HIGH_SURROGATE || LOW_SURROGATE_MAX < cp); //code points [0xd800, 0xdfff] are not allowed for UTF-16
-    assert(cp <= CODE_POINT_MAX);
 
-    if (cp < 0x10000)
+    if (cp < LEAD_SURROGATE) //below the surrogate range: single code unit
         writeOutput(static_cast<Char16>(cp));
-    else
+    else if (cp <= TRAIL_SURROGATE_MAX) //invalid code point: [0xd800, 0xdfff] is reserved for surrogates
+        codePointToUtf16(REPLACEMENT_CHAR, writeOutput); //REPLACEMENT_CHAR (0xfffd) takes the single-unit branch below => recursion ends after one level
+    else if (cp < 0x10000) //above the surrogate range, but still a single code unit
+        writeOutput(static_cast<Char16>(cp));
+    else if (cp <= CODE_POINT_MAX) //needs a surrogate pair
     {
         cp -= 0x10000;
-        writeOutput(static_cast<Char16>((cp >> 10) + HIGH_SURROGATE));
-        writeOutput(static_cast<Char16>((cp & 0x3ff) + LOW_SURROGATE));
+        writeOutput(LEAD_SURROGATE  + static_cast<Char16>(cp >> 10));   //upper 10 bits of the offset
+        writeOutput(TRAIL_SURROGATE + static_cast<Char16>(cp & 0x3ff)); //lower 10 bits of the offset
     }
+    else //invalid code point: beyond CODE_POINT_MAX
+        codePointToUtf16(REPLACEMENT_CHAR, writeOutput); //writes REPLACEMENT_CHAR as a single code unit (see above)
 }
 
 
 inline
-size_t getUtf16Len(Char16 ch) //ch must be first code unit!
+size_t getUtf16Len(Char16 ch) //ch must be first code unit! returns the number of code units of the sequence started by ch (1 or 2), or 0 on error!
 {
-    const CodePoint cp = ch;
-
-    if (HIGH_SURROGATE <= cp && cp <= HIGH_SURROGATE_MAX)
+    if (ch < LEAD_SURROGATE) //below the surrogate range
+        return 1;
+    else if (ch < TRAIL_SURROGATE) //lead surrogate: a second code unit is expected to follow
         return 2;
+    else if (ch <= TRAIL_SURROGATE_MAX)
+        return 0; //unexpected trail surrogate: cannot start a sequence!
     else
-    {
-        assert(cp < LOW_SURROGATE || LOW_SURROGATE_MAX < cp); //NO low surrogate expected
         return 1;
-    }
 }
 
 
@@ -119,19 +109,27 @@ void utf16ToCodePoint(CharIterator first, CharIterator last, Function writeOutpu
     for ( ; first != last; ++first)
     {
         CodePoint cp = static_cast<Char16>(*first);
-        if (HIGH_SURROGATE <= cp && cp <= HIGH_SURROGATE_MAX)
+        switch (getUtf16Len(static_cast<Char16>(cp)))
         {
-            if (++first == last)
-            {
-                assert(false); //low surrogate expected
-                return;
-            }
-            assert(LOW_SURROGATE <= static_cast<Char16>(*first) && static_cast<Char16>(*first) <= LOW_SURROGATE_MAX); //low surrogate expected
-            cp = ((cp - HIGH_SURROGATE) << 10) + static_cast<Char16>(*first) - LOW_SURROGATE + 0x10000;
+            case 0: //invalid utf16 character
+                cp = REPLACEMENT_CHAR;
+                break;
+            case 1:
+                break;
+            case 2:
+                if (++first != last) //trail surrogate expected!
+                {
+                    const Char16 ch = static_cast<Char16>(*first);
+                    if (TRAIL_SURROGATE <= ch && ch <= TRAIL_SURROGATE_MAX) //trail surrogate expected!
+                    {
+                        cp = ((cp - LEAD_SURROGATE) << 10) + (ch - TRAIL_SURROGATE) + 0x10000;
+                        break;
+                    }
+                }
+                --first;
+                cp = REPLACEMENT_CHAR;
+                break;
         }
-        else
-            assert(cp < LOW_SURROGATE || LOW_SURROGATE_MAX < cp); //NO low surrogate expected
-
         writeOutput(cp);
     }
 }
@@ -141,6 +139,7 @@ template <class Function> inline
 void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char8
 {
     //http://en.wikipedia.org/wiki/UTF-8
+    //assert(cp < LEAD_SURROGATE || TRAIL_SURROGATE_MAX < cp); //code points [0xd800, 0xdfff] are reserved for UTF-16 and *should* not be encoded in UTF-8
 
     if (cp < 0x80)
         writeOutput(static_cast<Char8>(cp));
@@ -151,23 +150,24 @@ void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a un
     }
     else if (cp < 0x10000)
     {
-        writeOutput(static_cast<Char8>((cp >> 12        ) | 0xe0));
+        writeOutput(static_cast<Char8>( (cp >> 12       ) | 0xe0));
         writeOutput(static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80));
-        writeOutput(static_cast<Char8>((cp & 0x3f       ) | 0x80));
+        writeOutput(static_cast<Char8>( (cp & 0x3f      ) | 0x80));
     }
-    else
+    else if (cp <= CODE_POINT_MAX)
     {
-        assert(cp <= CODE_POINT_MAX);
-        writeOutput(static_cast<Char8>((cp >> 18         ) | 0xf0));
+        writeOutput(static_cast<Char8>( (cp >> 18        ) | 0xf0));
         writeOutput(static_cast<Char8>(((cp >> 12) & 0x3f) | 0x80));
         writeOutput(static_cast<Char8>(((cp >> 6)  & 0x3f) | 0x80));
-        writeOutput(static_cast<Char8>((cp & 0x3f        ) | 0x80));
+        writeOutput(static_cast<Char8>( (cp & 0x3f       ) | 0x80));
     }
+    else //invalid code point
+        codePointToUtf8(REPLACEMENT_CHAR, writeOutput); //resolves to 3-byte utf8
 }
 
 
 inline
-size_t getUtf8Len(unsigned char ch) //ch must be first code unit!
+size_t getUtf8Len(unsigned char ch) //ch must be first code unit! returns 0 on error!
 {
     if (ch < 0x80)
         return 1;
@@ -177,12 +177,27 @@ size_t getUtf8Len(unsigned char ch) //ch must be first code unit!
         return 3;
     if (ch >> 3 == 0x1e)
         return 4;
-
-    assert(false); //no valid begin of UTF8 encoding
-    return 1;
+    return 0; //invalid begin of UTF8 encoding
 }
 
 
+template <class CharIterator> inline
+bool decodeTrail(CharIterator& first, CharIterator last, CodePoint& cp) //consume one UTF-8 continuation byte and append its 6 payload bits to cp; on failure: "first" is left unchanged, cp is set to REPLACEMENT_CHAR, returns false
+{
+    if (++first != last) //continuation byte expected!
+    {
+        const Char8 ch = static_cast<Char8>(*first);
+        if (ch >> 6 == 0x2) //continuation bytes have the bit pattern 10xxxxxx
+        {
+            cp = (cp << 6) + (ch & 0x3f); //make room for and add the 6 payload bits
+            return true;
+        }
+    }
+    --first; //undo the advance: the rejected code unit (if any) is left for the caller to process
+    cp = REPLACEMENT_CHAR;
+    return false;
+}
+
 template <class CharIterator, class Function> inline
 void utf8ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint
 {
@@ -190,57 +205,32 @@ void utf8ToCodePoint(CharIterator first, CharIterator last, Function writeOutput
 
     for ( ; first != last; ++first)
     {
-        auto getChar = [&](Char8& ch) -> bool
-        {
-            if (++first == last)
-            {
-                assert(false); //low surrogate expected
-                return false;
-            }
-            ch = static_cast<Char8>(*first);
-            assert(ch >> 6 == 0x2);
-            return true;
-        };
-
-        Char8 ch = static_cast<Char8>(*first);
-        switch (getUtf8Len(ch))
+        CodePoint cp = static_cast<Char8>(*first);
+        switch (getUtf8Len(static_cast<Char8>(cp)))
         {
+            case 0: //invalid utf8 character
+                cp = REPLACEMENT_CHAR;
+                break;
             case 1:
-                writeOutput(ch);
                 break;
             case 2:
-            {
-                CodePoint cp = (ch & 0x1f) << 6;
-                if (!getChar(ch)) return;
-                cp += ch & 0x3f;
-                writeOutput(cp);
-            }
-            break;
+                cp &= 0x1f;
+                decodeTrail(first, last, cp);
+                break;
             case 3:
-            {
-                CodePoint cp = (ch & 0xf) << 12;
-                if (!getChar(ch)) return;
-                cp += (ch & 0x3f) << 6;
-                if (!getChar(ch)) return;
-                cp += ch & 0x3f;
-                writeOutput(cp);
-            }
-            break;
+                cp &= 0xf;
+                if (decodeTrail(first, last, cp))
+                    decodeTrail(first, last, cp);
+                break;
             case 4:
-            {
-                CodePoint cp = (ch & 0x7) << 18;
-                if (!getChar(ch)) return;
-                cp += (ch & 0x3f) << 12;
-                if (!getChar(ch)) return;
-                cp += (ch & 0x3f) << 6;
-                if (!getChar(ch)) return;
-                cp += ch & 0x3f;
-                writeOutput(cp);
-            }
-            break;
-            default:
-                assert(false);
+                cp &= 0x7;
+                if (decodeTrail(first, last, cp))
+                    if (decodeTrail(first, last, cp))
+                        decodeTrail(first, last, cp);
+                if (cp > CODE_POINT_MAX) cp = REPLACEMENT_CHAR;
+                break;
         }
+        writeOutput(cp);
     }
 }
 
@@ -257,7 +247,10 @@ size_t unicodeLength(const CharString& str, char) //utf8
     while (strFirst < strLast) //[!]
     {
         ++len;
-        strFirst += getUtf8Len(*strFirst); //[!]
+
+        size_t utf8len = getUtf8Len(*strFirst);
+        if (utf8len == 0) ++utf8len; //invalid utf8 character
+        strFirst += utf8len;
     }
     return len;
 }
@@ -275,7 +268,9 @@ size_t unicodeLengthWide(const WideString& str, Int2Type<2>) //windows: utf16-wc
     while (strFirst < strLast) //[!]
     {
         ++len;
-        strFirst += getUtf16Len(*strFirst); //[!]
+        size_t utf16len = getUtf16Len(*strFirst);
+        if (utf16len == 0) ++utf16len; //invalid utf16 character
+        strFirst += utf16len;
     }
     return len;
 }
@@ -316,7 +311,9 @@ size_t findUnicodePos(const CharString& str, size_t unicodePos, char) //utf8-cha
     size_t utfPos = 0;
     while (unicodePos-- > 0)
     {
-        utfPos += getUtf8Len(strFirst[utfPos]);
+        size_t utf8len = getUtf8Len(strFirst[utfPos]);
+        if (utf8len == 0) ++utf8len; //invalid utf8 character
+        utfPos += utf8len;
 
         if (utfPos >= strLen)
             return strLen;
@@ -336,7 +333,9 @@ size_t findUnicodePosWide(const WideString& str, size_t unicodePos, Int2Type<2>)
     size_t utfPos = 0;
     while (unicodePos-- > 0)
     {
-        utfPos += getUtf16Len(strFirst[utfPos]);
+        size_t utf16len = getUtf16Len(strFirst[utfPos]);
+        if (utf16len == 0) ++utf16len; //invalid utf16 character
+        utfPos += utf16len;
 
         if (utfPos >= strLen)
             return strLen;