summary | refs | log | tree | commit | diff
path: root/zen/utf8.h
diff options
context:
space:
mode:
Diffstat (limited to 'zen/utf8.h')
-rw-r--r-- zen/utf8.h 129
1 files changed, 55 insertions, 74 deletions
diff --git a/zen/utf8.h b/zen/utf8.h
index 3d97ca7a..242b729f 100644
--- a/zen/utf8.h
+++ b/zen/utf8.h
@@ -8,13 +8,15 @@
#ifndef STRING_UTF8_HEADER_01832479146991573473545
#define STRING_UTF8_HEADER_01832479146991573473545
+#include <cstdint>
#include <iterator>
#include "string_tools.h" //copyStringTo
namespace zen
{
//convert any(!) "string-like" object into target string by applying a UTF8 conversion (but only if necessary!)
-template <class TargetString, class SourceString> TargetString utf8CvrtTo(const SourceString& str);
+template <class TargetString, class SourceString>
+TargetString utf8CvrtTo(const SourceString& str);
//convert wide to utf8 string; example: std::string tmp = toUtf8<std::string>(L"abc");
template <class CharString, class WideString>
@@ -62,7 +64,9 @@ const char BYTE_ORDER_MARK_UTF8[] = "\xEF\xBB\xBF";
//----------------------- implementation ----------------------------------
namespace implementation
{
-typedef unsigned int CodePoint; //must be at least four bytes
+typedef std::uint_fast32_t CodePoint; //must be at least four bytes
+typedef std::uint_fast16_t Char16; //we need an unsigned type
+typedef unsigned char Char8;
const CodePoint CODE_POINT_MAX = 0x10ffff;
@@ -73,31 +77,28 @@ const CodePoint LOW_SURROGATE = 0xdc00;
const CodePoint LOW_SURROGATE_MAX = 0xdfff;
-template <class OutputIterator> inline
-OutputIterator codePointToUtf16(CodePoint cp, OutputIterator result) //http://en.wikipedia.org/wiki/UTF-16
+template <class Function> inline
+void codePointToUtf16(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char16
{
- typedef unsigned short Char16; //this isn't necessarily 16 bit, but all we need is an unsigned type
-
+ //http://en.wikipedia.org/wiki/UTF-16
assert(cp < HIGH_SURROGATE || LOW_SURROGATE_MAX < cp); //code points [0xd800, 0xdfff] are not allowed for UTF-16
assert(cp <= CODE_POINT_MAX);
if (cp < 0x10000)
- *result++ = static_cast<Char16>(cp);
+ writeOutput(static_cast<Char16>(cp));
else
{
cp -= 0x10000;
- *result++ = static_cast<Char16>((cp >> 10) + HIGH_SURROGATE);
- *result++ = static_cast<Char16>((cp & 0x3ff) + LOW_SURROGATE);
+ writeOutput(static_cast<Char16>((cp >> 10) + HIGH_SURROGATE));
+ writeOutput(static_cast<Char16>((cp & 0x3ff) + LOW_SURROGATE));
}
- return result;
}
template <class CharIterator, class Function> inline
-Function utf16ToCodePoint(CharIterator first, CharIterator last, Function f) //f is a unary function taking a CodePoint as single parameter
+void utf16ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint
{
assert_static(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 2);
- typedef unsigned short Char16; //this isn't necessarily 16 bit, but all we need is an unsigned type
for ( ; first != last; ++first)
{
@@ -107,7 +108,7 @@ Function utf16ToCodePoint(CharIterator first, CharIterator last, Function f) //f
if (++first == last)
{
assert(false); //low surrogate expected
- break;
+ return;
}
assert(LOW_SURROGATE <= static_cast<Char16>(*first) && static_cast<Char16>(*first) <= LOW_SURROGATE_MAX); //low surrogate expected
cp = ((cp - HIGH_SURROGATE) << 10) + static_cast<Char16>(*first) - LOW_SURROGATE + 0x10000;
@@ -115,40 +116,37 @@ Function utf16ToCodePoint(CharIterator first, CharIterator last, Function f) //f
else
assert(cp < LOW_SURROGATE || LOW_SURROGATE_MAX < cp); //NO low surrogate expected
- f(cp);
+ writeOutput(cp);
}
- return f;
}
-template <class OutputIterator> inline
-OutputIterator codePointToUtf8(CodePoint cp, OutputIterator result) //http://en.wikipedia.org/wiki/UTF-8
+template <class Function> inline
+void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char8
{
- typedef unsigned char Char8;
-
- assert(cp <= CODE_POINT_MAX);
+ //http://en.wikipedia.org/wiki/UTF-8
if (cp < 0x80)
- *result++ = static_cast<Char8>(cp);
+ writeOutput(static_cast<Char8>(cp));
else if (cp < 0x800)
{
- *result++ = static_cast<Char8>((cp >> 6 ) | 0xc0);
- *result++ = static_cast<Char8>((cp & 0x3f) | 0x80);
+ writeOutput(static_cast<Char8>((cp >> 6 ) | 0xc0));
+ writeOutput(static_cast<Char8>((cp & 0x3f) | 0x80));
}
else if (cp < 0x10000)
{
- *result++ = static_cast<Char8>((cp >> 12 ) | 0xe0);
- *result++ = static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80);
- *result++ = static_cast<Char8>((cp & 0x3f ) | 0x80);
+ writeOutput(static_cast<Char8>((cp >> 12 ) | 0xe0));
+ writeOutput(static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80));
+ writeOutput(static_cast<Char8>((cp & 0x3f ) | 0x80));
}
else
{
- *result++ = static_cast<Char8>((cp >> 18 ) | 0xf0);
- *result++ = static_cast<Char8>(((cp >> 12) & 0x3f) | 0x80);
- *result++ = static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80);
- *result++ = static_cast<Char8>((cp & 0x3f ) | 0x80);
+ assert(cp <= CODE_POINT_MAX);
+ writeOutput(static_cast<Char8>((cp >> 18 ) | 0xf0));
+ writeOutput(static_cast<Char8>(((cp >> 12) & 0x3f) | 0x80));
+ writeOutput(static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80));
+ writeOutput(static_cast<Char8>((cp & 0x3f ) | 0x80));
}
- return result;
}
@@ -170,14 +168,13 @@ size_t getUtf8Len(unsigned char ch)
template <class CharIterator, class Function> inline
-Function utf8ToCodePoint(CharIterator first, CharIterator last, Function f) //f is a unary function taking a CodePoint as single parameter
+void utf8ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint
{
assert_static(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 1);
- typedef unsigned char Char8;
for ( ; first != last; ++first)
{
- auto getChar = [&](Char8 & ch) -> bool
+ auto getChar = [&](Char8& ch) -> bool
{
if (++first == last)
{
@@ -189,71 +186,55 @@ Function utf8ToCodePoint(CharIterator first, CharIterator last, Function f) //f
return true;
};
- CodePoint cp = static_cast<Char8>(*first);
- switch (getUtf8Len(static_cast<Char8>(cp)))
+ Char8 ch = static_cast<Char8>(*first);
+ switch (getUtf8Len(ch))
{
case 1:
+ writeOutput(ch);
break;
case 2:
{
- cp = (cp & 0x1f) << 6;
- Char8 ch;
- if (!getChar(ch)) continue;
+ CodePoint cp = (ch & 0x1f) << 6;
+ if (!getChar(ch)) return;
cp += ch & 0x3f;
+ writeOutput(cp);
}
break;
case 3:
{
- cp = (cp & 0xf) << 12;
- Char8 ch;
- if (!getChar(ch)) continue;
+ CodePoint cp = (ch & 0xf) << 12;
+ if (!getChar(ch)) return;
cp += (ch & 0x3f) << 6;
- if (!getChar(ch)) continue;
+ if (!getChar(ch)) return;
cp += ch & 0x3f;
-
+ writeOutput(cp);
}
break;
case 4:
{
- cp = (cp & 0x7) << 18;
- Char8 ch;
- if (!getChar(ch)) continue;
+ CodePoint cp = (ch & 0x7) << 18;
+ if (!getChar(ch)) return;
cp += (ch & 0x3f) << 12;
- if (!getChar(ch)) continue;
+ if (!getChar(ch)) return;
cp += (ch & 0x3f) << 6;
- if (!getChar(ch)) continue;
+ if (!getChar(ch)) return;
cp += ch & 0x3f;
+ writeOutput(cp);
}
break;
default:
assert(false);
}
- f(cp);
}
- return f;
}
-template <class String>
-class AppendStringIterator: public std::iterator<std::output_iterator_tag, void, void, void, void>
-{
-public:
- explicit AppendStringIterator (String& x) : str(&x) {}
- AppendStringIterator& operator= (typename String::value_type value) { *str += value; return *this; }
- AppendStringIterator& operator* () { return *this; }
- AppendStringIterator& operator++ () { return *this; }
- AppendStringIterator operator++ (int) { return *this; }
-private:
- String* str;
-};
-
-
template <class WideString, class CharString> inline
WideString utf8ToWide(const CharString& str, Int2Type<2>) //windows: convert utf8 to utf16 wchar_t
{
WideString output;
utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
- [&](CodePoint cp) { codePointToUtf16(cp, AppendStringIterator<WideString>(output)); });
+ [&](CodePoint cp) { codePointToUtf16(cp, [&](Char16 c) { output += static_cast<wchar_t>(c); }); });
return output;
}
@@ -273,7 +254,7 @@ CharString wideToUtf8(const WideString& str, Int2Type<2>) //windows: convert utf
{
CharString output;
utf16ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
- [&](CodePoint cp) { codePointToUtf8(cp, AppendStringIterator<CharString>(output)); });
+ [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); });
return output;
}
@@ -283,7 +264,7 @@ CharString wideToUtf8(const WideString& str, Int2Type<4>) //other OS: convert ut
{
CharString output;
std::for_each(strBegin(str), strBegin(str) + strLength(str),
- [&](CodePoint cp) { codePointToUtf8(cp, AppendStringIterator<CharString>(output)); });
+ [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); });
return output;
}
}
@@ -292,8 +273,8 @@ CharString wideToUtf8(const WideString& str, Int2Type<4>) //other OS: convert ut
template <class WideString, class CharString> inline
WideString utf8ToWide(const CharString& str)
{
- assert_static((IsSameType<typename GetCharType<CharString>::Result, char >::result));
- assert_static((IsSameType<typename GetCharType<WideString>::Result, wchar_t>::result));
+ assert_static((IsSameType<typename GetCharType<CharString>::Type, char >::value));
+ assert_static((IsSameType<typename GetCharType<WideString>::Type, wchar_t>::value));
return implementation::utf8ToWide<WideString>(str, Int2Type<sizeof(wchar_t)>());
}
@@ -302,8 +283,8 @@ WideString utf8ToWide(const CharString& str)
template <class CharString, class WideString> inline
CharString wideToUtf8(const WideString& str)
{
- assert_static((IsSameType<typename GetCharType<CharString>::Result, char >::result));
- assert_static((IsSameType<typename GetCharType<WideString>::Result, wchar_t>::result));
+ assert_static((IsSameType<typename GetCharType<CharString>::Type, char >::value));
+ assert_static((IsSameType<typename GetCharType<WideString>::Type, wchar_t>::value));
return implementation::wideToUtf8<CharString>(str, Int2Type<sizeof(wchar_t)>());
}
@@ -326,8 +307,8 @@ template <class TargetString, class SourceString> inline
TargetString utf8CvrtTo(const SourceString& str)
{
return utf8CvrtTo<TargetString>(str,
- typename GetCharType<SourceString>::Result(),
- typename GetCharType<TargetString>::Result());
+ typename GetCharType<SourceString>::Type(),
+ typename GetCharType<TargetString>::Type());
}
}
bgstack15