// ************************************************************************** // * This file is part of the zen::Xml project. It is distributed under the * // * Boost Software License: http://www.boost.org/LICENSE_1_0.txt * // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved * // ************************************************************************** #ifndef STRING_UTF8_HEADER_01832479146991573473545 #define STRING_UTF8_HEADER_01832479146991573473545 #include #include #include "string_tools.h" //copyStringTo namespace zen { //convert all(!) char- and wchar_t-based "string-like" objects applying a UTF8 conversions (but only if necessary!) template TargetString utfCvrtTo(const SourceString& str); const char BYTE_ORDER_MARK_UTF8[] = "\xEF\xBB\xBF"; //---- explicit conversion: wide <-> utf8 ---- template CharString wideToUtf8(const WideString& str); //example: std::string tmp = wideToUtf8(L"abc"); template WideString utf8ToWide(const CharString& str); //std::wstring tmp = utf8ToWide("abc"); //access unicode characters in UTF-encoded string (char- or wchar_t-based) template size_t unicodeLength(const UtfString& str); //return number of code points for UTF-encoded string template size_t findUnicodePos(const UtfString& str, size_t unicodePos); //return position of unicode char in UTF-encoded string //----------------------- implementation ---------------------------------- namespace implementation { typedef std::uint_fast32_t CodePoint; //must be at least four bytes typedef std::uint_fast16_t Char16; //we need an unsigned type typedef unsigned char Char8; const CodePoint LEAD_SURROGATE = 0xd800; const CodePoint TRAIL_SURROGATE = 0xdc00; //== LEAD_SURROGATE_MAX + 1 const CodePoint TRAIL_SURROGATE_MAX = 0xdfff; const CodePoint REPLACEMENT_CHAR = 0xfffd; const CodePoint CODE_POINT_MAX = 0x10ffff; template inline void codePointToUtf16(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char16 { //http://en.wikipedia.org/wiki/UTF-16 if (cp < LEAD_SURROGATE) writeOutput(static_cast(cp)); else if (cp <= TRAIL_SURROGATE_MAX) //invalid code point codePointToUtf16(REPLACEMENT_CHAR, writeOutput); //resolves to 1-character utf16 else if (cp < 0x10000) writeOutput(static_cast(cp)); else if (cp <= CODE_POINT_MAX) { cp -= 0x10000; writeOutput(LEAD_SURROGATE + static_cast(cp >> 10)); writeOutput(TRAIL_SURROGATE + static_cast(cp & 0x3ff)); } else //invalid code point codePointToUtf16(REPLACEMENT_CHAR, writeOutput); //resolves to 1-character utf16 } inline size_t getUtf16Len(Char16 ch) //ch must be first code unit! returns 0 on error! { if (ch < LEAD_SURROGATE) return 1; else if (ch < TRAIL_SURROGATE) return 2; else if (ch <= TRAIL_SURROGATE_MAX) return 0; //unexpected trail surrogate! else return 1; } template inline void utf16ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint { assert_static(sizeof(typename std::iterator_traits::value_type) == 2); for ( ; first != last; ++first) { CodePoint cp = static_cast(*first); switch (getUtf16Len(static_cast(cp))) { case 0: //invalid utf16 character cp = REPLACEMENT_CHAR; break; case 1: break; case 2: if (++first != last) //trail surrogate expected! { const Char16 ch = static_cast(*first); if (TRAIL_SURROGATE <= ch && ch <= TRAIL_SURROGATE_MAX) //trail surrogate expected! { cp = ((cp - LEAD_SURROGATE) << 10) + (ch - TRAIL_SURROGATE) + 0x10000; break; } } --first; cp = REPLACEMENT_CHAR; break; } writeOutput(cp); } } template inline void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char8 { //http://en.wikipedia.org/wiki/UTF-8 //assert(cp < LEAD_SURROGATE || TRAIL_SURROGATE_MAX < cp); //code points [0xd800, 0xdfff] are reserved for UTF-16 and *should* not be encoded in UTF-8 if (cp < 0x80) writeOutput(static_cast(cp)); else if (cp < 0x800) { writeOutput(static_cast((cp >> 6 ) | 0xc0)); writeOutput(static_cast((cp & 0x3f) | 0x80)); } else if (cp < 0x10000) { writeOutput(static_cast( (cp >> 12 ) | 0xe0)); writeOutput(static_cast(((cp >> 6) & 0x3f) | 0x80)); writeOutput(static_cast( (cp & 0x3f ) | 0x80)); } else if (cp <= CODE_POINT_MAX) { writeOutput(static_cast( (cp >> 18 ) | 0xf0)); writeOutput(static_cast(((cp >> 12) & 0x3f) | 0x80)); writeOutput(static_cast(((cp >> 6) & 0x3f) | 0x80)); writeOutput(static_cast( (cp & 0x3f ) | 0x80)); } else //invalid code point codePointToUtf8(REPLACEMENT_CHAR, writeOutput); //resolves to 3-byte utf8 } inline size_t getUtf8Len(unsigned char ch) //ch must be first code unit! returns 0 on error! { if (ch < 0x80) return 1; if (ch >> 5 == 0x6) return 2; if (ch >> 4 == 0xe) return 3; if (ch >> 3 == 0x1e) return 4; return 0; //innvalid begin of UTF8 encoding } template inline bool decodeTrail(CharIterator& first, CharIterator last, CodePoint& cp) //decode trailing surrogate byte { if (++first != last) //trail surrogate expected! { const Char8 ch = static_cast(*first); if (ch >> 6 == 0x2) //trail surrogate expected! { cp = (cp << 6) + (ch & 0x3f); return true; } } --first; cp = REPLACEMENT_CHAR; return false; } template inline void utf8ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint { assert_static(sizeof(typename std::iterator_traits::value_type) == 1); for ( ; first != last; ++first) { CodePoint cp = static_cast(*first); switch (getUtf8Len(static_cast(cp))) { case 0: //invalid utf8 character cp = REPLACEMENT_CHAR; break; case 1: break; case 2: cp &= 0x1f; decodeTrail(first, last, cp); break; case 3: cp &= 0xf; if (decodeTrail(first, last, cp)) decodeTrail(first, last, cp); break; case 4: cp &= 0x7; if (decodeTrail(first, last, cp)) if (decodeTrail(first, last, cp)) decodeTrail(first, last, cp); if (cp > CODE_POINT_MAX) cp = REPLACEMENT_CHAR; break; } writeOutput(cp); } } template inline size_t unicodeLength(const CharString& str, char) //utf8 { typedef typename GetCharType::Type CharType; const CharType* strFirst = strBegin(str); const CharType* const strLast = strFirst + strLength(str); size_t len = 0; while (strFirst < strLast) //[!] { ++len; size_t utf8len = getUtf8Len(*strFirst); if (utf8len == 0) ++utf8len; //invalid utf8 character strFirst += utf8len; } return len; } template inline size_t unicodeLengthWide(const WideString& str, Int2Type<2>) //windows: utf16-wchar_t { typedef typename GetCharType::Type CharType; const CharType* strFirst = strBegin(str); const CharType* const strLast = strFirst + strLength(str); size_t len = 0; while (strFirst < strLast) //[!] { ++len; size_t utf16len = getUtf16Len(*strFirst); if (utf16len == 0) ++utf16len; //invalid utf16 character strFirst += utf16len; } return len; } template inline size_t unicodeLengthWide(const WideString& str, Int2Type<4>) //other OS: utf32-wchar_t { return strLength(str); } template inline size_t unicodeLength(const WideString& str, wchar_t) { return unicodeLengthWide(str, Int2Type()); } } template inline size_t unicodeLength(const UtfString& str) //return number of code points { return implementation::unicodeLength(str, typename GetCharType::Type()); } namespace implementation { template inline size_t findUnicodePos(const CharString& str, size_t unicodePos, char) //utf8-char { typedef typename GetCharType::Type CharType; const CharType* strFirst = strBegin(str); const size_t strLen = strLength(str); size_t utfPos = 0; while (unicodePos-- > 0) { size_t utf8len = getUtf8Len(strFirst[utfPos]); if (utf8len == 0) ++utf8len; //invalid utf8 character utfPos += utf8len; if (utfPos >= strLen) return strLen; } return utfPos; } template inline size_t findUnicodePosWide(const WideString& str, size_t unicodePos, Int2Type<2>) //windows: utf16-wchar_t { typedef typename GetCharType::Type CharType; const CharType* strFirst = strBegin(str); const size_t strLen = strLength(str); size_t utfPos = 0; while (unicodePos-- > 0) { size_t utf16len = getUtf16Len(strFirst[utfPos]); if (utf16len == 0) ++utf16len; //invalid utf16 character utfPos += utf16len; if (utfPos >= strLen) return strLen; } return utfPos; } template inline size_t findUnicodePosWide(const WideString& str, size_t unicodePos, Int2Type<4>) //other OS: utf32-wchar_t { return std::min(strLength(str), unicodePos); } template inline size_t findUnicodePos(const UtfString& str, size_t unicodePos, wchar_t) { return findUnicodePosWide(str, unicodePos, Int2Type()); } } template inline size_t findUnicodePos(const UtfString& str, size_t unicodePos) //return position of unicode char in UTF-encoded string { return implementation::findUnicodePos(str, unicodePos, typename GetCharType::Type()); } //------------------------------------------------------------------------------------------- namespace implementation { template inline WideString utf8ToWide(const CharString& str, Int2Type<2>) //windows: convert utf8 to utf16-wchar_t { WideString output; utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str), [&](CodePoint cp) { codePointToUtf16(cp, [&](Char16 c) { output += static_cast(c); }); }); return output; } template inline WideString utf8ToWide(const CharString& str, Int2Type<4>) //other OS: convert utf8 to utf32-wchar_t { WideString output; utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str), [&](CodePoint cp) { output += static_cast(cp); }); return output; } template inline CharString wideToUtf8(const WideString& str, Int2Type<2>) //windows: convert utf16-wchar_t to utf8 { CharString output; utf16ToCodePoint(strBegin(str), strBegin(str) + strLength(str), [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); }); return output; } template inline CharString wideToUtf8(const WideString& str, Int2Type<4>) //other OS: convert utf32-wchar_t to utf8 { CharString output; std::for_each(strBegin(str), strBegin(str) + strLength(str), [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); }); return output; } } template inline WideString utf8ToWide(const CharString& str) { assert_static((IsSameType::Type, char >::value)); assert_static((IsSameType::Type, wchar_t>::value)); return implementation::utf8ToWide(str, Int2Type()); } template inline CharString wideToUtf8(const WideString& str) { assert_static((IsSameType::Type, char >::value)); assert_static((IsSameType::Type, wchar_t>::value)); return implementation::wideToUtf8(str, Int2Type()); } //------------------------------------------------------------------------------------------- template inline TargetString utfCvrtTo(const SourceString& str, char, wchar_t) { return utf8ToWide(str); } template inline TargetString utfCvrtTo(const SourceString& str, wchar_t, char) { return wideToUtf8(str); } template inline TargetString utfCvrtTo(const SourceString& str, char, char) { return copyStringTo(str); } template inline TargetString utfCvrtTo(const SourceString& str, wchar_t, wchar_t) { return copyStringTo(str); } template inline TargetString utfCvrtTo(const SourceString& str) { return utfCvrtTo(str, typename GetCharType::Type(), typename GetCharType::Type()); } } #endif //STRING_UTF8_HEADER_01832479146991573473545