// ***************************************************************************** // * This file is part of the FreeFileSync project. It is distributed under * // * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0 * // * Copyright (C) Zenju (zenju AT freefilesync DOT org) - All Rights Reserved * // ***************************************************************************** #ifndef UTF_H_01832479146991573473545 #define UTF_H_01832479146991573473545 #include "string_tools.h" //copyStringTo namespace zen { //convert all(!) char- and wchar_t-based "string-like" objects applying UTF conversions (but only if necessary!) template TargetString utfTo(const SourceString& str); const char BYTE_ORDER_MARK_UTF8[] = "\xEF\xBB\xBF"; template bool isValidUtf(const UtfString& str); //check for UTF-8 encoding errors //access unicode characters in UTF-encoded string (char- or wchar_t-based) template size_t unicodeLength(const UtfString& str); //return number of code points for UTF-encoded string template UtfString getUnicodeSubstring(const UtfString& str, size_t uniPosFirst, size_t uniPosLast); //----------------------- implementation ---------------------------------- namespace impl { using CodePoint = uint32_t; using Char16 = uint16_t; using Char8 = uint8_t; const CodePoint LEAD_SURROGATE = 0xd800; //1101 1000 0000 0000 LEAD_SURROGATE_MAX = TRAIL_SURROGATE - 1 const CodePoint TRAIL_SURROGATE = 0xdc00; //1101 1100 0000 0000 const CodePoint TRAIL_SURROGATE_MAX = 0xdfff; const CodePoint REPLACEMENT_CHAR = 0xfffd; const CodePoint CODE_POINT_MAX = 0x10ffff; static_assert(LEAD_SURROGATE + TRAIL_SURROGATE + TRAIL_SURROGATE_MAX + REPLACEMENT_CHAR + CODE_POINT_MAX == 1348603); template inline void codePointToUtf16(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char16 { //https://en.wikipedia.org/wiki/UTF-16 if (cp < LEAD_SURROGATE) writeOutput(static_cast(cp)); else if (cp <= TRAIL_SURROGATE_MAX) //invalid code point writeOutput(static_cast(REPLACEMENT_CHAR)); else if (cp <= 0xffff) writeOutput(static_cast(cp)); else if (cp <= CODE_POINT_MAX) { cp -= 0x10000; writeOutput(static_cast( LEAD_SURROGATE + (cp >> 10))); writeOutput(static_cast(TRAIL_SURROGATE + (cp & 0b11'1111'1111))); } else //invalid code point writeOutput(static_cast(REPLACEMENT_CHAR)); } class Utf16Decoder { public: Utf16Decoder(const Char16* str, size_t len) : it_(str), last_(str + len) {} std::optional getNext() { if (it_ == last_) return {}; const Char16 ch = *it_++; CodePoint cp = ch; if (ch < LEAD_SURROGATE || ch > TRAIL_SURROGATE_MAX) //single Char16, no surrogates ; else if (ch < TRAIL_SURROGATE) //two Char16: lead and trail surrogates decodeTrail(cp); //no range check needed: cp is inside [U+010000, U+10FFFF] by construction else //unexpected trail surrogate cp = REPLACEMENT_CHAR; return cp; } private: void decodeTrail(CodePoint& cp) { if (it_ != last_) //trail surrogate expected! { const Char16 ch = *it_; if (TRAIL_SURROGATE <= ch && ch <= TRAIL_SURROGATE_MAX) //trail surrogate expected! { cp = ((cp - LEAD_SURROGATE) << 10) + (ch - TRAIL_SURROGATE) + 0x10000; ++it_; return; } } cp = REPLACEMENT_CHAR; } const Char16* it_; const Char16* const last_; }; //---------------------------------------------------------------------------------------------------------------- template inline void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char8 { /* https://en.wikipedia.org/wiki/UTF-8 "high and low surrogate halves used by UTF-16 (U+D800 through U+DFFF) and code points not encodable by UTF-16 (those after U+10FFFF) [...] must be treated as an invalid byte sequence" */ if (cp <= 0b111'1111) writeOutput(static_cast(cp)); else if (cp <= 0b0111'1111'1111) { writeOutput(static_cast((cp >> 6) | 0b1100'0000)); //110x xxxx writeOutput(static_cast((cp & 0b11'1111) | 0b1000'0000)); //10xx xxxx } else if (cp <= 0b1111'1111'1111'1111) { if (LEAD_SURROGATE <= cp && cp <= TRAIL_SURROGATE_MAX) //[0xd800, 0xdfff] codePointToUtf8(REPLACEMENT_CHAR, writeOutput); else { writeOutput(static_cast( (cp >> 12) | 0b1110'0000)); //1110 xxxx writeOutput(static_cast(((cp >> 6) & 0b11'1111) | 0b1000'0000)); //10xx xxxx writeOutput(static_cast( (cp & 0b11'1111) | 0b1000'0000)); //10xx xxxx } } else if (cp <= CODE_POINT_MAX) { writeOutput(static_cast( (cp >> 18) | 0b1111'0000)); //1111 0xxx writeOutput(static_cast(((cp >> 12) & 0b11'1111) | 0b1000'0000)); //10xx xxxx writeOutput(static_cast(((cp >> 6) & 0b11'1111) | 0b1000'0000)); //10xx xxxx writeOutput(static_cast( (cp & 0b11'1111) | 0b1000'0000)); //10xx xxxx } else //invalid code point codePointToUtf8(REPLACEMENT_CHAR, writeOutput); //resolves to 3-byte UTF8 } class Utf8Decoder { public: Utf8Decoder(const Char8* str, size_t len) : it_(str), last_(str + len) {} std::optional getNext() { if (it_ == last_) return std::nullopt; const Char8 ch = *it_++; CodePoint cp = ch; if (ch < 0x80) //1 byte ; else if (ch >> 5 == 0b110) //2 bytes { cp &= 0b1'1111; if (decodeTrail(cp)) if (cp <= 0b111'1111) //overlong encoding: "correct encoding of a code point uses only the minimum number of bytes required" cp = REPLACEMENT_CHAR; } else if (ch >> 4 == 0b1110) //3 bytes { cp &= 0b1111; if (decodeTrail(cp) && decodeTrail(cp)) if (cp <= 0b0111'1111'1111 || (LEAD_SURROGATE <= cp && cp <= TRAIL_SURROGATE_MAX)) //[0xd800, 0xdfff] are invalid code points cp = REPLACEMENT_CHAR; } else if (ch >> 3 == 0b11110) //4 bytes { cp &= 0b111; if (decodeTrail(cp) && decodeTrail(cp) && decodeTrail(cp)) if (cp <= 0b1111'1111'1111'1111 || cp > CODE_POINT_MAX) cp = REPLACEMENT_CHAR; } else //invalid begin of UTF8 encoding cp = REPLACEMENT_CHAR; return cp; } private: bool decodeTrail(CodePoint& cp) { if (it_ != last_) //trail surrogate expected! { const Char8 ch = *it_; if (ch >> 6 == 0b10) //trail surrogate expected! { cp = (cp << 6) + (ch & 0b11'1111); ++it_; return true; } } cp = REPLACEMENT_CHAR; return false; } const Char8* it_; const Char8* const last_; }; //---------------------------------------------------------------------------------------------------------------- template inline void codePointToUtfImpl(CodePoint cp, Function writeOutput, std::integral_constant) { codePointToUtf8 (cp, writeOutput); } //UTF8-char template inline void codePointToUtfImpl(CodePoint cp, Function writeOutput, std::integral_constant) { codePointToUtf16(cp, writeOutput); } //Windows: UTF16-wchar_t template inline void codePointToUtfImpl(CodePoint cp, Function writeOutput, std::integral_constant) { writeOutput(cp); } //other OS: UTF32-wchar_t //---------------------------------------------------------------------------------------------------------------- template class UtfDecoderImpl; template class UtfDecoderImpl //UTF8-char { public: UtfDecoderImpl(const CharType* str, size_t len) : decoder_(reinterpret_cast(str), len) {} std::optional getNext() { return decoder_.getNext(); } private: Utf8Decoder decoder_; }; template class UtfDecoderImpl //Windows: UTF16-wchar_t { public: UtfDecoderImpl(const CharType* str, size_t len) : decoder_(reinterpret_cast(str), len) {} std::optional getNext() { return decoder_.getNext(); } private: Utf16Decoder decoder_; }; template class UtfDecoderImpl //other OS: UTF32-wchar_t { public: UtfDecoderImpl(const CharType* str, size_t len) : it_(reinterpret_cast(str)), last_(it_ + len) {} std::optional getNext() { if (it_ == last_) return {}; return *it_++; } private: const CodePoint* it_; const CodePoint* last_; }; } template using UtfDecoder = impl::UtfDecoderImpl; template inline void codePointToUtf(impl::CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a CharType { return impl::codePointToUtfImpl(cp, writeOutput, std::integral_constant()); } //------------------------------------------------------------------------------------------- template inline bool isValidUtf(const UtfString& str) { using namespace impl; UtfDecoder> decoder(strBegin(str), strLength(str)); while (const std::optional cp = decoder.getNext()) if (*cp == REPLACEMENT_CHAR) return false; return true; } template inline size_t unicodeLength(const UtfString& str) //return number of code points (+ correctly handle broken UTF encoding) { size_t uniLen = 0; UtfDecoder> decoder(strBegin(str), strLength(str)); while (decoder.getNext()) ++uniLen; return uniLen; } template inline UtfString getUnicodeSubstring(const UtfString& str, size_t uniPosFirst, size_t uniPosLast) //return position of unicode char in UTF-encoded string { assert(uniPosFirst <= uniPosLast && uniPosLast <= unicodeLength(str)); using namespace impl; using CharType = GetCharTypeT; UtfString output; assert(uniPosFirst <= uniPosLast); if (uniPosFirst >= uniPosLast) //optimize for empty range return output; UtfDecoder decoder(strBegin(str), strLength(str)); for (size_t uniPos = 0; std::optional cp = decoder.getNext(); ++uniPos) //[!] declaration in condition part of the for-loop if (uniPos >= uniPosFirst) { if (uniPos >= uniPosLast) break; codePointToUtf(*cp, [&](CharType c) { output += c; }); } return output; } //------------------------------------------------------------------------------------------- namespace impl { template inline TargetString utfTo(const SourceString& str, std::true_type) { return copyStringTo(str); } template inline TargetString utfTo(const SourceString& str, std::false_type) { using CharSrc = GetCharTypeT; using CharTrg = GetCharTypeT; static_assert(sizeof(CharSrc) != sizeof(CharTrg)); TargetString output; UtfDecoder decoder(strBegin(str), strLength(str)); while (const std::optional cp = decoder.getNext()) codePointToUtf(*cp, [&](CharTrg c) { output += c; }); return output; } } template inline TargetString utfTo(const SourceString& str) { return impl::utfTo(str, std::bool_constant) == sizeof(GetCharTypeT)>()); } } #endif //UTF_H_01832479146991573473545