diff options
Diffstat (limited to 'zen/utf.h')
-rw-r--r-- | zen/utf.h | 457 |
1 files changed, 457 insertions, 0 deletions
diff --git a/zen/utf.h b/zen/utf.h new file mode 100644 index 00000000..2ba6eaa3 --- /dev/null +++ b/zen/utf.h @@ -0,0 +1,457 @@ +// ************************************************************************** +// * This file is part of the zenXML project. It is distributed under the * +// * Boost Software License, Version 1.0. See accompanying file * +// * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt. * +// * Copyright (C) ZenJu (zhnmju123 AT gmx DOT de) - All Rights Reserved * +// ************************************************************************** + +#ifndef STRING_UTF8_HEADER_01832479146991573473545 +#define STRING_UTF8_HEADER_01832479146991573473545 + +#include <cstdint> +#include <iterator> +#include "string_tools.h" //copyStringTo + +namespace zen +{ +//convert all(!) char- and wchar_t-based "string-like" objects applying a UTF8 conversions (but only if necessary!) +template <class TargetString, class SourceString> +TargetString utfCvrtTo(const SourceString& str); + +const char BYTE_ORDER_MARK_UTF8[] = "\xEF\xBB\xBF"; + +//---- explicit conversion: wide <-> utf8 ---- +template <class CharString, class WideString> +CharString wideToUtf8(const WideString& str); //example: std::string tmp = wideToUtf8<std::string>(L"abc"); + +template <class WideString, class CharString> +WideString utf8ToWide(const CharString& str); //std::wstring tmp = utf8ToWide<std::wstring>("abc"); + +//access unicode characters in UTF-encoded string (char- or wchar_t-based) +template <class UtfString> +size_t unicodeLength(const UtfString& str); //return number of code points for UTF-encoded string + +template <class UtfString> +size_t findUnicodePos(const UtfString& str, size_t unicodePos); //return position of unicode char in UTF-encoded string + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +//----------------------- implementation ---------------------------------- +namespace implementation +{ +typedef std::uint_fast32_t CodePoint; //must be at least four bytes +typedef std::uint_fast16_t Char16; //we need an unsigned type +typedef unsigned char Char8; + +const CodePoint CODE_POINT_MAX = 0x10ffff; + +const CodePoint HIGH_SURROGATE = 0xd800; +const CodePoint HIGH_SURROGATE_MAX = 0xdbff; + +const CodePoint LOW_SURROGATE = 0xdc00; +const CodePoint LOW_SURROGATE_MAX = 0xdfff; + + +template <class Function> inline +void codePointToUtf16(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char16 +{ + //http://en.wikipedia.org/wiki/UTF-16 + assert(cp < HIGH_SURROGATE || LOW_SURROGATE_MAX < cp); //code points [0xd800, 0xdfff] are not allowed for UTF-16 + assert(cp <= CODE_POINT_MAX); + + if (cp < 0x10000) + writeOutput(static_cast<Char16>(cp)); + else + { + cp -= 0x10000; + writeOutput(static_cast<Char16>((cp >> 10) + HIGH_SURROGATE)); + writeOutput(static_cast<Char16>((cp & 0x3ff) + LOW_SURROGATE)); + } +} + + +inline +size_t getUtf16Len(Char16 ch) //ch must be first code unit! +{ + const CodePoint cp = ch; + + if (HIGH_SURROGATE <= cp && cp <= HIGH_SURROGATE_MAX) + return 2; + else + { + assert(cp < LOW_SURROGATE || LOW_SURROGATE_MAX < cp); //NO low surrogate expected + return 1; + } +} + + +template <class CharIterator, class Function> inline +void utf16ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint +{ + assert_static(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 2); + + for ( ; first != last; ++first) + { + CodePoint cp = static_cast<Char16>(*first); + if (HIGH_SURROGATE <= cp && cp <= HIGH_SURROGATE_MAX) + { + if (++first == last) + { + assert(false); //low surrogate expected + return; + } + assert(LOW_SURROGATE <= static_cast<Char16>(*first) && static_cast<Char16>(*first) <= LOW_SURROGATE_MAX); //low surrogate expected + cp = ((cp - HIGH_SURROGATE) << 10) + static_cast<Char16>(*first) - LOW_SURROGATE + 0x10000; + } + else + assert(cp < LOW_SURROGATE || LOW_SURROGATE_MAX < cp); //NO low surrogate expected + + writeOutput(cp); + } +} + + +template <class Function> inline +void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char8 +{ + //http://en.wikipedia.org/wiki/UTF-8 + + if (cp < 0x80) + writeOutput(static_cast<Char8>(cp)); + else if (cp < 0x800) + { + writeOutput(static_cast<Char8>((cp >> 6 ) | 0xc0)); + writeOutput(static_cast<Char8>((cp & 0x3f) | 0x80)); + } + else if (cp < 0x10000) + { + writeOutput(static_cast<Char8>((cp >> 12 ) | 0xe0)); + writeOutput(static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80)); + writeOutput(static_cast<Char8>((cp & 0x3f ) | 0x80)); + } + else + { + assert(cp <= CODE_POINT_MAX); + writeOutput(static_cast<Char8>((cp >> 18 ) | 0xf0)); + writeOutput(static_cast<Char8>(((cp >> 12) & 0x3f) | 0x80)); + writeOutput(static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80)); + writeOutput(static_cast<Char8>((cp & 0x3f ) | 0x80)); + } +} + + +inline +size_t getUtf8Len(unsigned char ch) //ch must be first code unit! +{ + if (ch < 0x80) + return 1; + if (ch >> 5 == 0x6) + return 2; + if (ch >> 4 == 0xe) + return 3; + if (ch >> 3 == 0x1e) + return 4; + + assert(false); //no valid begin of UTF8 encoding + return 1; +} + + +template <class CharIterator, class Function> inline +void utf8ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint +{ + assert_static(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 1); + + for ( ; first != last; ++first) + { + auto getChar = [&](Char8& ch) -> bool + { + if (++first == last) + { + assert(false); //low surrogate expected + return false; + } + ch = static_cast<Char8>(*first); + assert(ch >> 6 == 0x2); + return true; + }; + + Char8 ch = static_cast<Char8>(*first); + switch (getUtf8Len(ch)) + { + case 1: + writeOutput(ch); + break; + case 2: + { + CodePoint cp = (ch & 0x1f) << 6; + if (!getChar(ch)) return; + cp += ch & 0x3f; + writeOutput(cp); + } + break; + case 3: + { + CodePoint cp = (ch & 0xf) << 12; + if (!getChar(ch)) return; + cp += (ch & 0x3f) << 6; + if (!getChar(ch)) return; + cp += ch & 0x3f; + writeOutput(cp); + } + break; + case 4: + { + CodePoint cp = (ch & 0x7) << 18; + if (!getChar(ch)) return; + cp += (ch & 0x3f) << 12; + if (!getChar(ch)) return; + cp += (ch & 0x3f) << 6; + if (!getChar(ch)) return; + cp += ch & 0x3f; + writeOutput(cp); + } + break; + default: + assert(false); + } + } +} + + +template <class CharString> inline +size_t unicodeLength(const CharString& str, char) //utf8 +{ + typedef typename GetCharType<CharString>::Type CharType; + + const CharType* strFirst = strBegin(str); + const CharType* const strLast = strFirst + strLength(str); + + size_t len = 0; + while (strFirst < strLast) //[!] + { + ++len; + strFirst += getUtf8Len(*strFirst); //[!] + } + return len; +} + + +template <class WideString> inline +size_t unicodeLengthWide(const WideString& str, Int2Type<2>) //windows: utf16-wchar_t +{ + typedef typename GetCharType<WideString>::Type CharType; + + const CharType* strFirst = strBegin(str); + const CharType* const strLast = strFirst + strLength(str); + + size_t len = 0; + while (strFirst < strLast) //[!] + { + ++len; + strFirst += getUtf16Len(*strFirst); //[!] + } + return len; +} + + +template <class WideString> inline +size_t unicodeLengthWide(const WideString& str, Int2Type<4>) //other OS: utf32-wchar_t +{ + return strLength(str); +} + + +template <class WideString> inline +size_t unicodeLength(const WideString& str, wchar_t) +{ + return unicodeLengthWide(str, Int2Type<sizeof(wchar_t)>()); +} +} + + +template <class UtfString> inline +size_t unicodeLength(const UtfString& str) //return number of code points +{ + return implementation::unicodeLength(str, typename GetCharType<UtfString>::Type()); +} + + +namespace implementation +{ +template <class CharString> inline +size_t findUnicodePos(const CharString& str, size_t unicodePos, char) //utf8-char +{ + typedef typename GetCharType<CharString>::Type CharType; + + const CharType* strFirst = strBegin(str); + const size_t strLen = strLength(str); + + size_t utfPos = 0; + while (unicodePos-- > 0) + { + utfPos += getUtf8Len(strFirst[utfPos]); + + if (utfPos >= strLen) + return strLen; + } + return utfPos; +} + + +template <class WideString> inline +size_t findUnicodePosWide(const WideString& str, size_t unicodePos, Int2Type<2>) //windows: utf16-wchar_t +{ + typedef typename GetCharType<WideString>::Type CharType; + + const CharType* strFirst = strBegin(str); + const size_t strLen = strLength(str); + + size_t utfPos = 0; + while (unicodePos-- > 0) + { + utfPos += getUtf16Len(strFirst[utfPos]); + + if (utfPos >= strLen) + return strLen; + } + return utfPos; +} + + +template <class WideString> inline +size_t findUnicodePosWide(const WideString& str, size_t unicodePos, Int2Type<4>) //other OS: utf32-wchar_t +{ + return std::min(strLength(str), unicodePos); +} + + +template <class UtfString> inline +size_t findUnicodePos(const UtfString& str, size_t unicodePos, wchar_t) +{ + return findUnicodePosWide(str, unicodePos, Int2Type<sizeof(wchar_t)>()); +} +} + + +template <class UtfString> inline +size_t findUnicodePos(const UtfString& str, size_t unicodePos) //return position of unicode char in UTF-encoded string +{ + return implementation::findUnicodePos(str, unicodePos, typename GetCharType<UtfString>::Type()); +} + +//------------------------------------------------------------------------------------------- + +namespace implementation +{ +template <class WideString, class CharString> inline +WideString utf8ToWide(const CharString& str, Int2Type<2>) //windows: convert utf8 to utf16-wchar_t +{ + WideString output; + utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str), + [&](CodePoint cp) { codePointToUtf16(cp, [&](Char16 c) { output += static_cast<wchar_t>(c); }); }); + return output; +} + + +template <class WideString, class CharString> inline +WideString utf8ToWide(const CharString& str, Int2Type<4>) //other OS: convert utf8 to utf32-wchar_t +{ + WideString output; + utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str), + [&](CodePoint cp) { output += static_cast<wchar_t>(cp); }); + return output; +} + + +template <class CharString, class WideString> inline +CharString wideToUtf8(const WideString& str, Int2Type<2>) //windows: convert utf16-wchar_t to utf8 +{ + CharString output; + utf16ToCodePoint(strBegin(str), strBegin(str) + strLength(str), + [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); }); + return output; +} + + +template <class CharString, class WideString> inline +CharString wideToUtf8(const WideString& str, Int2Type<4>) //other OS: convert utf32-wchar_t to utf8 +{ + CharString output; + std::for_each(strBegin(str), strBegin(str) + strLength(str), + [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); }); + return output; +} +} + + +template <class WideString, class CharString> inline +WideString utf8ToWide(const CharString& str) +{ + assert_static((IsSameType<typename GetCharType<CharString>::Type, char >::value)); + assert_static((IsSameType<typename GetCharType<WideString>::Type, wchar_t>::value)); + + return implementation::utf8ToWide<WideString>(str, Int2Type<sizeof(wchar_t)>()); +} + + +template <class CharString, class WideString> inline +CharString wideToUtf8(const WideString& str) +{ + assert_static((IsSameType<typename GetCharType<CharString>::Type, char >::value)); + assert_static((IsSameType<typename GetCharType<WideString>::Type, wchar_t>::value)); + + return implementation::wideToUtf8<CharString>(str, Int2Type<sizeof(wchar_t)>()); +} + +//------------------------------------------------------------------------------------------- + +template <class TargetString, class SourceString> inline +TargetString utfCvrtTo(const SourceString& str, char, wchar_t) { return utf8ToWide<TargetString>(str); } + +template <class TargetString, class SourceString> inline +TargetString utfCvrtTo(const SourceString& str, wchar_t, char) { return wideToUtf8<TargetString>(str); } + +template <class TargetString, class SourceString> inline +TargetString utfCvrtTo(const SourceString& str, char, char) { return copyStringTo<TargetString>(str); } + +template <class TargetString, class SourceString> inline +TargetString utfCvrtTo(const SourceString& str, wchar_t, wchar_t) { return copyStringTo<TargetString>(str); } + +template <class TargetString, class SourceString> inline +TargetString utfCvrtTo(const SourceString& str) +{ + return utfCvrtTo<TargetString>(str, + typename GetCharType<SourceString>::Type(), + typename GetCharType<TargetString>::Type()); +} +} + +#endif //STRING_UTF8_HEADER_01832479146991573473545 |