From 3ba62ef1de77153e5a8c7bad4451b96f6a1678b0 Mon Sep 17 00:00:00 2001 From: Daniel Wilhelm Date: Sun, 12 Mar 2017 22:00:35 -0600 Subject: 8.10 --- zen/utf.h | 421 +++++++++++++++++++++++++------------------------------------- 1 file changed, 166 insertions(+), 255 deletions(-) (limited to 'zen/utf.h') diff --git a/zen/utf.h b/zen/utf.h index 41fdf58c..ab8fda50 100755 --- a/zen/utf.h +++ b/zen/utf.h @@ -10,40 +10,25 @@ #include #include #include "string_tools.h" //copyStringTo +#include "optional.h" namespace zen { //convert all(!) char- and wchar_t-based "string-like" objects applying a UTF8 conversions (but only if necessary!) template -TargetString utfCvrtTo(const SourceString& str); +TargetString utfTo(const SourceString& str); const char BYTE_ORDER_MARK_UTF8[] = "\xEF\xBB\xBF"; -template -bool isValidUtf8(const CharString& str); //check for UTF-8 encoding errors - -//---- explicit conversion: wide <-> utf8 ---- -template -CharString wideToUtf8(const WideString& str); //example: std::string tmp = wideToUtf8(L"abc"); - -template -WideString utf8ToWide(const CharString& str); //std::wstring tmp = utf8ToWide("abc"); +template +bool isValidUtf(const UtfString& str); //check for UTF-8 encoding errors //access unicode characters in UTF-encoded string (char- or wchar_t-based) template size_t unicodeLength(const UtfString& str); //return number of code points for UTF-encoded string template -size_t findUnicodePos(const UtfString& str, size_t unicodePos); //return position of unicode char in UTF-encoded string - - - - - - - - - +UtfString getUnicodeSubstring(const UtfString& str, size_t uniPosFirst, size_t uniPosLast); @@ -58,7 +43,7 @@ namespace implementation { using CodePoint = uint32_t; using Char16 = uint16_t; -using Char8 = unsigned char; +using Char8 = uint8_t; const CodePoint LEAD_SURROGATE = 0xd800; const CodePoint TRAIL_SURROGATE = 0xdc00; //== LEAD_SURROGATE_MAX + 1 @@ -72,7 +57,6 @@ template inline void codePointToUtf16(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char16 { //http://en.wikipedia.org/wiki/UTF-16 - if (cp < LEAD_SURROGATE) writeOutput(static_cast(cp)); else if (cp <= TRAIL_SURROGATE_MAX) //invalid code point @@ -82,8 +66,8 @@ void codePointToUtf16(CodePoint cp, Function writeOutput) //"writeOutput" is a u else if (cp <= CODE_POINT_MAX) { cp -= 0x10000; - writeOutput(LEAD_SURROGATE + static_cast(cp >> 10)); - writeOutput(TRAIL_SURROGATE + static_cast(cp & 0x3ff)); + writeOutput(static_cast( LEAD_SURROGATE + (cp >> 10))); + writeOutput(static_cast(TRAIL_SURROGATE + (cp & 0x3ff))); } else //invalid code point codePointToUtf16(REPLACEMENT_CHAR, writeOutput); //resolves to 1-character utf16 @@ -104,15 +88,19 @@ size_t getUtf16Len(Char16 ch) //ch must be first code unit! returns 0 on error! } -template inline -void utf16ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint +class Utf16Decoder { - static_assert(sizeof(typename std::iterator_traits::value_type) == 2, ""); +public: + Utf16Decoder(const Char16* str, size_t len) : it_(str), last_(str + len) {} - for ( ; first != last; ++first) + Opt getNext() { - CodePoint cp = static_cast(*first); - switch (getUtf16Len(static_cast(cp))) + if (it_ == last_) + return NoValue(); + + const Char16 ch = *it_++; + CodePoint cp = ch; + switch (getUtf16Len(ch)) { case 0: //invalid utf16 character cp = REPLACEMENT_CHAR; @@ -120,23 +108,33 @@ void utf16ToCodePoint(CharIterator first, CharIterator last, Function writeOutpu case 1: break; case 2: - if (++first != last) //trail surrogate expected! - { - const Char16 ch = static_cast(*first); - if (TRAIL_SURROGATE <= ch && ch <= TRAIL_SURROGATE_MAX) //trail surrogate expected! - { - cp = ((cp - LEAD_SURROGATE) << 10) + (ch - TRAIL_SURROGATE) + 0x10000; - break; - } - } - --first; - cp = REPLACEMENT_CHAR; + decodeTrail(cp); break; } - writeOutput(cp); + return cp; + } + +private: + void decodeTrail(CodePoint& cp) + { + if (it_ != last_) //trail surrogate expected! + { + const Char16 ch = *it_; + if (TRAIL_SURROGATE <= ch && ch <= TRAIL_SURROGATE_MAX) //trail surrogate expected! + { + cp = ((cp - LEAD_SURROGATE) << 10) + (ch - TRAIL_SURROGATE) + 0x10000; + ++it_; + return; + } + } + cp = REPLACEMENT_CHAR; } -} + const Char16* it_; + const Char16* const last_; +}; + +//---------------------------------------------------------------------------------------------------------------- template inline void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char8 @@ -155,14 +153,14 @@ void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a un { writeOutput(static_cast( (cp >> 12 ) | 0xe0)); writeOutput(static_cast(((cp >> 6) & 0x3f) | 0x80)); - writeOutput(static_cast( (cp & 0x3f ) | 0x80)); + writeOutput(static_cast( (cp & 0x3f) | 0x80)); } else if (cp <= CODE_POINT_MAX) { writeOutput(static_cast( (cp >> 18 ) | 0xf0)); writeOutput(static_cast(((cp >> 12) & 0x3f) | 0x80)); writeOutput(static_cast(((cp >> 6) & 0x3f) | 0x80)); - writeOutput(static_cast( (cp & 0x3f ) | 0x80)); + writeOutput(static_cast( (cp & 0x3f) | 0x80)); } else //invalid code point codePointToUtf8(REPLACEMENT_CHAR, writeOutput); //resolves to 3-byte utf8 @@ -170,7 +168,7 @@ void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a un inline -size_t getUtf8Len(unsigned char ch) //ch must be first code unit! returns 0 on error! +size_t getUtf8Len(Char8 ch) //ch must be first code unit! returns 0 on error! { if (ch < 0x80) return 1; @@ -184,32 +182,19 @@ size_t getUtf8Len(unsigned char ch) //ch must be first code unit! returns 0 on e } -template inline -bool decodeTrail(CharIterator& first, CharIterator last, CodePoint& cp) //decode trailing surrogate byte -{ - if (++first != last) //trail surrogate expected! - { - const Char8 ch = static_cast(*first); - if (ch >> 6 == 0x2) //trail surrogate expected! - { - cp = (cp << 6) + (ch & 0x3f); - return true; - } - } - --first; - cp = REPLACEMENT_CHAR; - return false; -} - -template inline -void utf8ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint +class Utf8Decoder { - static_assert(sizeof(typename std::iterator_traits::value_type) == 1, ""); +public: + Utf8Decoder(const Char8* str, size_t len) : it_(str), last_(str + len) {} - for ( ; first != last; ++first) + Opt getNext() { - CodePoint cp = static_cast(*first); - switch (getUtf8Len(static_cast(cp))) + if (it_ == last_) + return NoValue(); + + const Char8 ch = *it_++; + CodePoint cp = ch; + switch (getUtf8Len(ch)) { case 0: //invalid utf8 character cp = REPLACEMENT_CHAR; @@ -218,258 +203,184 @@ void utf8ToCodePoint(CharIterator first, CharIterator last, Function writeOutput break; case 2: cp &= 0x1f; - decodeTrail(first, last, cp); + decodeTrail(cp); break; case 3: cp &= 0xf; - if (decodeTrail(first, last, cp)) - decodeTrail(first, last, cp); + if (decodeTrail(cp)) + decodeTrail(cp); break; case 4: cp &= 0x7; - if (decodeTrail(first, last, cp)) - if (decodeTrail(first, last, cp)) - decodeTrail(first, last, cp); + if (decodeTrail(cp)) + if (decodeTrail(cp)) + decodeTrail(cp); if (cp > CODE_POINT_MAX) cp = REPLACEMENT_CHAR; break; } - writeOutput(cp); + return cp; } -} - - -template inline -size_t unicodeLength(const CharString& str, char) //utf8 -{ - using CharType = typename GetCharType::Type; - const CharType* strFirst = strBegin(str); - const CharType* const strLast = strFirst + strLength(str); - - size_t len = 0; - while (strFirst < strLast) //[!] +private: + bool decodeTrail(CodePoint& cp) { - ++len; - size_t utf8len = getUtf8Len(*strFirst); - if (utf8len == 0) ++utf8len; //invalid utf8 character - strFirst += utf8len; + if (it_ != last_) //trail surrogate expected! + { + const Char8 ch = *it_; + if (ch >> 6 == 0x2) //trail surrogate expected! + { + cp = (cp << 6) + (ch & 0x3f); + ++it_; + return true; + } + } + cp = REPLACEMENT_CHAR; + return false; } - return len; -} + const Char8* it_; + const Char8* const last_; +}; -template inline -size_t unicodeLengthWide(const WideString& str, Int2Type<2>) //windows: utf16-wchar_t -{ - using CharType = typename GetCharType::Type; +//---------------------------------------------------------------------------------------------------------------- - const CharType* strFirst = strBegin(str); - const CharType* const strLast = strFirst + strLength(str); +template inline void codePointToUtf(CodePoint cp, Function writeOutput, Int2Type<1>) { codePointToUtf8 (cp, writeOutput); } //UTF8-char +template inline void codePointToUtf(CodePoint cp, Function writeOutput, Int2Type<2>) { codePointToUtf16(cp, writeOutput); } //Windows: UTF16-wchar_t +template inline void codePointToUtf(CodePoint cp, Function writeOutput, Int2Type<4>) { writeOutput(cp); } //other OS: UTF32-wchar_t - size_t len = 0; - while (strFirst < strLast) //[!] - { - ++len; - size_t utf16len = getUtf16Len(*strFirst); - if (utf16len == 0) ++utf16len; //invalid utf16 character - strFirst += utf16len; - } - return len; -} - - -template inline -size_t unicodeLengthWide(const WideString& str, Int2Type<4>) //other OS: utf32-wchar_t +template inline +void codePointToUtf(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a CharType { - return strLength(str); + return codePointToUtf(cp, writeOutput, Int2Type()); } +//---------------------------------------------------------------------------------------------------------------- -template inline -size_t unicodeLength(const WideString& str, wchar_t) -{ - return unicodeLengthWide(str, Int2Type()); -} -} +template +class UtfDecoderImpl; -template inline -size_t unicodeLength(const UtfString& str) //return number of code points +template +class UtfDecoderImpl //UTF8-char { - return implementation::unicodeLength(str, typename GetCharType::Type()); -} +public: + UtfDecoderImpl(const CharType* str, size_t len) : decoder_(reinterpret_cast(str), len) {} + Opt getNext() { return decoder_.getNext(); } +private: + Utf8Decoder decoder_; +}; -namespace implementation -{ -template inline -size_t findUnicodePos(const CharString& str, size_t unicodePos, char) //utf8-char +template +class UtfDecoderImpl //Windows: UTF16-wchar_t { - using CharType = typename GetCharType::Type; - - const CharType* strFirst = strBegin(str); - const size_t strLen = strLength(str); - - size_t utfPos = 0; - while (unicodePos-- > 0) - { - if (utfPos >= strLen) - return strLen; +public: + UtfDecoderImpl(const CharType* str, size_t len) : decoder_(reinterpret_cast(str), len) {} + Opt getNext() { return decoder_.getNext(); } +private: + Utf16Decoder decoder_; +}; - size_t utf8len = getUtf8Len(strFirst[utfPos]); - if (utf8len == 0) ++utf8len; //invalid utf8 character - utfPos += utf8len; - } - if (utfPos >= strLen) - return strLen; - return utfPos; -} - -template inline -size_t findUnicodePosWide(const WideString& str, size_t unicodePos, Int2Type<2>) //windows: utf16-wchar_t +template +class UtfDecoderImpl //other OS: UTF32-wchar_t { - using CharType = typename GetCharType::Type; - - const CharType* strFirst = strBegin(str); - const size_t strLen = strLength(str); - - size_t utfPos = 0; - while (unicodePos-- > 0) +public: + UtfDecoderImpl(const CharType* str, size_t len) : it_(reinterpret_cast(str)), last_(it_ + len) {} + Opt getNext() { - if (utfPos >= strLen) - return strLen; - - size_t utf16len = getUtf16Len(strFirst[utfPos]); - if (utf16len == 0) ++utf16len; //invalid utf16 character - utfPos += utf16len; + if (it_ == last_) + return NoValue(); + return *it_++; } - if (utfPos >= strLen) - return strLen; - return utfPos; -} +private: + const CodePoint* it_; + const CodePoint* last_; +}; -template inline -size_t findUnicodePosWide(const WideString& str, size_t unicodePos, Int2Type<4>) //other OS: utf32-wchar_t -{ - return std::min(strLength(str), unicodePos); -} - - -template inline -size_t findUnicodePos(const UtfString& str, size_t unicodePos, wchar_t) -{ - return findUnicodePosWide(str, unicodePos, Int2Type()); -} -} - - -template inline -size_t findUnicodePos(const UtfString& str, size_t unicodePos) //return position of unicode char in UTF-encoded string -{ - return implementation::findUnicodePos(str, unicodePos, typename GetCharType::Type()); +template +using UtfDecoder = UtfDecoderImpl; } //------------------------------------------------------------------------------------------- -namespace implementation -{ -template inline -WideString utf8ToWide(const CharString& str, Int2Type<2>) //windows: convert utf8 to utf16-wchar_t +template inline +bool isValidUtf(const UtfString& str) { - WideString output; - utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str), - [&](CodePoint cp) { codePointToUtf16(cp, [&](Char16 c) { output += static_cast(c); }); }); - return output; -} + using namespace implementation; + UtfDecoder::Type> decoder(strBegin(str), strLength(str)); + while (Opt cp = decoder.getNext()) + if (*cp == REPLACEMENT_CHAR) + return false; -template inline -WideString utf8ToWide(const CharString& str, Int2Type<4>) //other OS: convert utf8 to utf32-wchar_t -{ - WideString output; - utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str), - [&](CodePoint cp) { output += static_cast(cp); }); - return output; + return true; } -template inline -CharString wideToUtf8(const WideString& str, Int2Type<2>) //windows: convert utf16-wchar_t to utf8 +template inline +size_t unicodeLength(const UtfString& str) //return number of code points (+ correctly handle broken UTF encoding) { - CharString output; - utf16ToCodePoint(strBegin(str), strBegin(str) + strLength(str), - [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); }); - return output; + size_t uniLen = 0; + implementation::UtfDecoder::Type> decoder(strBegin(str), strLength(str)); + while (decoder.getNext()) + ++uniLen; + return uniLen; } -template inline -CharString wideToUtf8(const WideString& str, Int2Type<4>) //other OS: convert utf32-wchar_t to utf8 +template inline +UtfString getUnicodeSubstring(const UtfString& str, size_t uniPosFirst, size_t uniPosLast) //return position of unicode char in UTF-encoded string { - CharString output; - std::for_each(strBegin(str), strBegin(str) + strLength(str), - [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); }); + assert(uniPosFirst <= uniPosLast && uniPosLast <= unicodeLength(str)); + using namespace implementation; + using CharType = typename GetCharType::Type; + UtfString output; + if (uniPosFirst >= uniPosLast) //optimize for empty range + return output; + + UtfDecoder decoder(strBegin(str), strLength(str)); + for (size_t uniPos = 0; Opt cp = decoder.getNext(); ++uniPos) //[!] declaration in condition part of the for-loop + if (uniPosFirst <= uniPos) + { + if (uniPos >= uniPosLast) + break; + codePointToUtf(*cp, [&](CharType c) { output += c; }); + } return output; } -} +//------------------------------------------------------------------------------------------- -template inline -bool isValidUtf8(const CharString& str) +namespace implementation { - using namespace implementation; - bool valid = true; - utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str), - [&](CodePoint cp) - { - if (cp == REPLACEMENT_CHAR) - valid = false; //perf: should we use an (expensive) exception for iteration break? - }); - return valid; -} - - -template inline -WideString utf8ToWide(const CharString& str) +template inline +TargetString utfTo(const SourceString& str, FalseType) { - static_assert(IsSameType::Type, char >::value, ""); - static_assert(IsSameType::Type, wchar_t>::value, ""); + using CharSrc = typename GetCharType::Type; + using CharTrg = typename GetCharType::Type; + static_assert(sizeof(CharSrc) != sizeof(CharTrg), "no UTF-conversion needed"); - return implementation::utf8ToWide(str, Int2Type()); -} + TargetString output; + UtfDecoder decoder(strBegin(str), strLength(str)); + while (Opt cp = decoder.getNext()) + codePointToUtf(*cp, [&](CharTrg c) { output += c; }); -template inline -CharString wideToUtf8(const WideString& str) -{ - static_assert(IsSameType::Type, char >::value, ""); - static_assert(IsSameType::Type, wchar_t>::value, ""); - - return implementation::wideToUtf8(str, Int2Type()); + return output; } -//------------------------------------------------------------------------------------------- template inline -TargetString utfCvrtTo(const SourceString& str, char, wchar_t) { return utf8ToWide(str); } - -template inline -TargetString utfCvrtTo(const SourceString& str, wchar_t, char) { return wideToUtf8(str); } - -template inline -TargetString utfCvrtTo(const SourceString& str, char, char) { return copyStringTo(str); } +TargetString utfTo(const SourceString& str, TrueType) { return copyStringTo(str); } +} -template inline -TargetString utfCvrtTo(const SourceString& str, wchar_t, wchar_t) { return copyStringTo(str); } template inline -TargetString utfCvrtTo(const SourceString& str) +TargetString utfTo(const SourceString& str) { - return utfCvrtTo(str, - typename GetCharType::Type(), - typename GetCharType::Type()); + return implementation::utfTo(str, StaticBool::Type) == sizeof(typename GetCharType::Type)>()); } } -- cgit