diff options
Diffstat (limited to 'zen/utf8.h')
-rw-r--r-- | zen/utf8.h | 129 |
1 files changed, 55 insertions, 74 deletions
@@ -8,13 +8,15 @@ #ifndef STRING_UTF8_HEADER_01832479146991573473545 #define STRING_UTF8_HEADER_01832479146991573473545 +#include <cstdint> #include <iterator> #include "string_tools.h" //copyStringTo namespace zen { //convert any(!) "string-like" object into target string by applying a UTF8 conversion (but only if necessary!) -template <class TargetString, class SourceString> TargetString utf8CvrtTo(const SourceString& str); +template <class TargetString, class SourceString> +TargetString utf8CvrtTo(const SourceString& str); //convert wide to utf8 string; example: std::string tmp = toUtf8<std::string>(L"abc"); template <class CharString, class WideString> @@ -62,7 +64,9 @@ const char BYTE_ORDER_MARK_UTF8[] = "\xEF\xBB\xBF"; //----------------------- implementation ---------------------------------- namespace implementation { -typedef unsigned int CodePoint; //must be at least four bytes +typedef std::uint_fast32_t CodePoint; //must be at least four bytes +typedef std::uint_fast16_t Char16; //we need an unsigned type +typedef unsigned char Char8; const CodePoint CODE_POINT_MAX = 0x10ffff; @@ -73,31 +77,28 @@ const CodePoint LOW_SURROGATE = 0xdc00; const CodePoint LOW_SURROGATE_MAX = 0xdfff; -template <class OutputIterator> inline -OutputIterator codePointToUtf16(CodePoint cp, OutputIterator result) //http://en.wikipedia.org/wiki/UTF-16 +template <class Function> inline +void codePointToUtf16(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char16 { - typedef unsigned short Char16; //this isn't necessarily 16 bit, but all we need is an unsigned type - + //http://en.wikipedia.org/wiki/UTF-16 assert(cp < HIGH_SURROGATE || LOW_SURROGATE_MAX < cp); //code points [0xd800, 0xdfff] are not allowed for UTF-16 assert(cp <= CODE_POINT_MAX); if (cp < 0x10000) - *result++ = static_cast<Char16>(cp); + writeOutput(static_cast<Char16>(cp)); else { cp -= 0x10000; - *result++ = static_cast<Char16>((cp >> 10) + HIGH_SURROGATE); - *result++ = static_cast<Char16>((cp & 0x3ff) + LOW_SURROGATE); + writeOutput(static_cast<Char16>((cp >> 10) + HIGH_SURROGATE)); + writeOutput(static_cast<Char16>((cp & 0x3ff) + LOW_SURROGATE)); } - return result; } template <class CharIterator, class Function> inline -Function utf16ToCodePoint(CharIterator first, CharIterator last, Function f) //f is a unary function taking a CodePoint as single parameter +void utf16ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint { assert_static(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 2); - typedef unsigned short Char16; //this isn't necessarily 16 bit, but all we need is an unsigned type for ( ; first != last; ++first) { @@ -107,7 +108,7 @@ Function utf16ToCodePoint(CharIterator first, CharIterator last, Function f) //f if (++first == last) { assert(false); //low surrogate expected - break; + return; } assert(LOW_SURROGATE <= static_cast<Char16>(*first) && static_cast<Char16>(*first) <= LOW_SURROGATE_MAX); //low surrogate expected cp = ((cp - HIGH_SURROGATE) << 10) + static_cast<Char16>(*first) - LOW_SURROGATE + 0x10000; @@ -115,40 +116,37 @@ Function utf16ToCodePoint(CharIterator first, CharIterator last, Function f) //f else assert(cp < LOW_SURROGATE || LOW_SURROGATE_MAX < cp); //NO low surrogate expected - f(cp); + writeOutput(cp); } - return f; } -template <class OutputIterator> inline -OutputIterator codePointToUtf8(CodePoint cp, OutputIterator result) //http://en.wikipedia.org/wiki/UTF-8 +template <class Function> inline +void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char8 { - typedef unsigned char Char8; - - assert(cp <= CODE_POINT_MAX); + //http://en.wikipedia.org/wiki/UTF-8 if (cp < 0x80) - *result++ = static_cast<Char8>(cp); + writeOutput(static_cast<Char8>(cp)); else if (cp < 0x800) { - *result++ = static_cast<Char8>((cp >> 6 ) | 0xc0); - *result++ = static_cast<Char8>((cp & 0x3f) | 0x80); + writeOutput(static_cast<Char8>((cp >> 6 ) | 0xc0)); + writeOutput(static_cast<Char8>((cp & 0x3f) | 0x80)); } else if (cp < 0x10000) { - *result++ = static_cast<Char8>((cp >> 12 ) | 0xe0); - *result++ = static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80); - *result++ = static_cast<Char8>((cp & 0x3f ) | 0x80); + writeOutput(static_cast<Char8>((cp >> 12 ) | 0xe0)); + writeOutput(static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80)); + writeOutput(static_cast<Char8>((cp & 0x3f ) | 0x80)); } else { - *result++ = static_cast<Char8>((cp >> 18 ) | 0xf0); - *result++ = static_cast<Char8>(((cp >> 12) & 0x3f) | 0x80); - *result++ = static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80); - *result++ = static_cast<Char8>((cp & 0x3f ) | 0x80); + assert(cp <= CODE_POINT_MAX); + writeOutput(static_cast<Char8>((cp >> 18 ) | 0xf0)); + writeOutput(static_cast<Char8>(((cp >> 12) & 0x3f) | 0x80)); + writeOutput(static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80)); + writeOutput(static_cast<Char8>((cp & 0x3f ) | 0x80)); } - return result; } @@ -170,14 +168,13 @@ size_t getUtf8Len(unsigned char ch) template <class CharIterator, class Function> inline -Function utf8ToCodePoint(CharIterator first, CharIterator last, Function f) //f is a unary function taking a CodePoint as single parameter +void utf8ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint { assert_static(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 1); - typedef unsigned char Char8; for ( ; first != last; ++first) { - auto getChar = [&](Char8 & ch) -> bool + auto getChar = [&](Char8& ch) -> bool { if (++first == last) { @@ -189,71 +186,55 @@ Function utf8ToCodePoint(CharIterator first, CharIterator last, Function f) //f return true; }; - CodePoint cp = static_cast<Char8>(*first); - switch (getUtf8Len(static_cast<Char8>(cp))) + Char8 ch = static_cast<Char8>(*first); + switch (getUtf8Len(ch)) { case 1: + writeOutput(ch); break; case 2: { - cp = (cp & 0x1f) << 6; - Char8 ch; - if (!getChar(ch)) continue; + CodePoint cp = (ch & 0x1f) << 6; + if (!getChar(ch)) return; cp += ch & 0x3f; + writeOutput(cp); } break; case 3: { - cp = (cp & 0xf) << 12; - Char8 ch; - if (!getChar(ch)) continue; + CodePoint cp = (ch & 0xf) << 12; + if (!getChar(ch)) return; cp += (ch & 0x3f) << 6; - if (!getChar(ch)) continue; + if (!getChar(ch)) return; cp += ch & 0x3f; - + writeOutput(cp); } break; case 4: { - cp = (cp & 0x7) << 18; - Char8 ch; - if (!getChar(ch)) continue; + CodePoint cp = (ch & 0x7) << 18; + if (!getChar(ch)) return; cp += (ch & 0x3f) << 12; - if (!getChar(ch)) continue; + if (!getChar(ch)) return; cp += (ch & 0x3f) << 6; - if (!getChar(ch)) continue; + if (!getChar(ch)) return; cp += ch & 0x3f; + writeOutput(cp); } break; default: assert(false); } - f(cp); } - return f; } -template <class String> -class AppendStringIterator: public std::iterator<std::output_iterator_tag, void, void, void, void> -{ -public: - explicit AppendStringIterator (String& x) : str(&x) {} - AppendStringIterator& operator= (typename String::value_type value) { *str += value; return *this; } - AppendStringIterator& operator* () { return *this; } - AppendStringIterator& operator++ () { return *this; } - AppendStringIterator operator++ (int) { return *this; } -private: - String* str; -}; - - template <class WideString, class CharString> inline WideString utf8ToWide(const CharString& str, Int2Type<2>) //windows: convert utf8 to utf16 wchar_t { WideString output; utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str), - [&](CodePoint cp) { codePointToUtf16(cp, AppendStringIterator<WideString>(output)); }); + [&](CodePoint cp) { codePointToUtf16(cp, [&](Char16 c) { output += static_cast<wchar_t>(c); }); }); return output; } @@ -273,7 +254,7 @@ CharString wideToUtf8(const WideString& str, Int2Type<2>) //windows: convert utf { CharString output; utf16ToCodePoint(strBegin(str), strBegin(str) + strLength(str), - [&](CodePoint cp) { codePointToUtf8(cp, AppendStringIterator<CharString>(output)); }); + [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); }); return output; } @@ -283,7 +264,7 @@ CharString wideToUtf8(const WideString& str, Int2Type<4>) //other OS: convert ut { CharString output; std::for_each(strBegin(str), strBegin(str) + strLength(str), - [&](CodePoint cp) { codePointToUtf8(cp, AppendStringIterator<CharString>(output)); }); + [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); }); return output; } } @@ -292,8 +273,8 @@ CharString wideToUtf8(const WideString& str, Int2Type<4>) //other OS: convert ut template <class WideString, class CharString> inline WideString utf8ToWide(const CharString& str) { - assert_static((IsSameType<typename GetCharType<CharString>::Result, char >::result)); - assert_static((IsSameType<typename GetCharType<WideString>::Result, wchar_t>::result)); + assert_static((IsSameType<typename GetCharType<CharString>::Type, char >::value)); + assert_static((IsSameType<typename GetCharType<WideString>::Type, wchar_t>::value)); return implementation::utf8ToWide<WideString>(str, Int2Type<sizeof(wchar_t)>()); } @@ -302,8 +283,8 @@ WideString utf8ToWide(const CharString& str) template <class CharString, class WideString> inline CharString wideToUtf8(const WideString& str) { - assert_static((IsSameType<typename GetCharType<CharString>::Result, char >::result)); - assert_static((IsSameType<typename GetCharType<WideString>::Result, wchar_t>::result)); + assert_static((IsSameType<typename GetCharType<CharString>::Type, char >::value)); + assert_static((IsSameType<typename GetCharType<WideString>::Type, wchar_t>::value)); return implementation::wideToUtf8<CharString>(str, Int2Type<sizeof(wchar_t)>()); } @@ -326,8 +307,8 @@ template <class TargetString, class SourceString> inline TargetString utf8CvrtTo(const SourceString& str) { return utf8CvrtTo<TargetString>(str, - typename GetCharType<SourceString>::Result(), - typename GetCharType<TargetString>::Result()); + typename GetCharType<SourceString>::Type(), + typename GetCharType<TargetString>::Type()); } } |