summaryrefslogtreecommitdiff
path: root/shared/string_utf8.h
diff options
context:
space:
mode:
Diffstat (limited to 'shared/string_utf8.h')
-rw-r--r--shared/string_utf8.h335
1 files changed, 0 insertions, 335 deletions
diff --git a/shared/string_utf8.h b/shared/string_utf8.h
deleted file mode 100644
index 8c920752..00000000
--- a/shared/string_utf8.h
+++ /dev/null
@@ -1,335 +0,0 @@
-// **************************************************************************
-// * This file is part of the zenXML project. It is distributed under the *
-// * Boost Software License, Version 1.0. See accompanying file *
-// * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt. *
-// * Copyright (C) 2011 ZenJu (zhnmju123 AT gmx.de) *
-// **************************************************************************
-
-#ifndef STRING_UTF8_HEADER_01832479146991573473545
-#define STRING_UTF8_HEADER_01832479146991573473545
-
-#include <iterator>
-#include "loki/TypeManip.h"
-#include "string_tools.h"
-#include "assert_static.h"
-
-namespace zen
-{
-//convert any(!) "string-like" object into target string by applying a UTF8 conversion (only if necessary!); the conversion is chosen from the character types of source and target
-template <class TargetString, class SourceString> TargetString utf8CvrtTo(const SourceString& str);
-
-//convert wide to utf8 string; example: std::string tmp = wideToUtf8<std::string>(L"abc");
-template <class CharString, class WideString>
-CharString wideToUtf8(const WideString& str);
-
-//convert utf8 string to wide; example: std::wstring tmp = utf8ToWide<std::wstring>("abc");
-template <class WideString, class CharString>
-WideString utf8ToWide(const CharString& str);
-
-const char BYTE_ORDER_MARK_UTF8[] = "\xEF\xBB\xBF"; //bytes EF BB BF, i.e. code point 0xfeff in UTF-8 encoding
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-//----------------------- implementation ----------------------------------
-namespace implementation
-{
-typedef unsigned int CodePoint; //value type for a single decoded code point
-
-const CodePoint CODE_POINT_MAX = 0x10ffff; //upper bound asserted by the encoders below
-
-const CodePoint HIGH_SURROGATE = 0xd800; //first value of the high (leading) surrogate range
-const CodePoint HIGH_SURROGATE_MAX = 0xdbff; //last value of the high surrogate range
-
-const CodePoint LOW_SURROGATE = 0xdc00; //first value of the low (trailing) surrogate range
-const CodePoint LOW_SURROGATE_MAX = 0xdfff; //last value of the low surrogate range
-
-
-template <class OutputIterator> inline //writes cp to result as one UTF-16 code unit, or as a surrogate pair if cp >= 0x10000; returns the advanced iterator
-OutputIterator codePointToUtf16(CodePoint cp, OutputIterator result) //http://en.wikipedia.org/wiki/UTF-16
-{
- typedef unsigned short Char16; //this isn't necessarily 16 bit, but all we need is an unsigned type
-
- assert(cp < HIGH_SURROGATE || LOW_SURROGATE_MAX < cp); //code points [0xd800, 0xdfff] are not allowed for UTF-16
- assert(cp <= CODE_POINT_MAX); //debug-only check; out-of-range values are still encoded when assert is compiled out
-
- if (cp < 0x10000) //fits into a single code unit
- *result++ = static_cast<Char16>(cp);
- else
- {
- cp -= 0x10000; //remaining value is split into two 10-bit halves
- *result++ = static_cast<Char16>((cp >> 10) + HIGH_SURROGATE); //upper 10 bits -> high surrogate
- *result++ = static_cast<Char16>((cp & 0x3ff) + LOW_SURROGATE); //lower 10 bits -> low surrogate
- }
- return result;
-}
-
-
-template <class CharIterator, class Function> inline //decodes the UTF-16 range [first, last) and calls f once per decoded code point; returns f
-Function utf16ToCodePoint(CharIterator first, CharIterator last, Function f) //f is a unary function taking a CodePoint as single parameter
-{
- assert_static(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 2); //compile-time check: iterator must yield 2-byte units
- typedef unsigned short Char16; //this isn't necessarily 16 bit, but all we need is an unsigned type
-
- for ( ; first != last; ++first)
- {
- CodePoint cp = static_cast<Char16>(*first); //cast to unsigned first so a signed unit is not sign-extended
- if (HIGH_SURROGATE <= cp && cp <= HIGH_SURROGATE_MAX)
- {
- if (++first == last) //input ends after a high surrogate
- {
- assert(false); //low surrogate expected
- break; //the unpaired high surrogate is dropped: f is not called for it
- }
- assert(LOW_SURROGATE <= static_cast<Char16>(*first) && static_cast<Char16>(*first) <= LOW_SURROGATE_MAX); //low surrogate expected
- cp = ((cp - HIGH_SURROGATE) << 10) + static_cast<Char16>(*first) - LOW_SURROGATE + 0x10000; //recombine both 10-bit halves and re-add the 0x10000 offset
- }
- else
- assert(cp < LOW_SURROGATE || LOW_SURROGATE_MAX < cp); //NO low surrogate expected
-
- f(cp);
- }
- return f;
-}
-
-
-template <class OutputIterator> inline //writes cp to result as a UTF-8 sequence of 1 to 4 bytes; returns the advanced iterator
-OutputIterator codePointToUtf8(CodePoint cp, OutputIterator result) //http://en.wikipedia.org/wiki/UTF-8
-{
- typedef unsigned char Char8;
-
- assert(cp <= CODE_POINT_MAX); //note: unlike codePointToUtf16 there is no check against the surrogate range here
-
- if (cp < 0x80) //1 byte: 0xxxxxxx
- *result++ = static_cast<Char8>(cp);
- else if (cp < 0x800) //2 bytes: 110xxxxx 10xxxxxx
- {
- *result++ = static_cast<Char8>((cp >> 6 ) | 0xc0);
- *result++ = static_cast<Char8>((cp & 0x3f ) | 0x80);
- }
- else if (cp < 0x10000) //3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
- {
- *result++ = static_cast<Char8>((cp >> 12 ) | 0xe0);
- *result++ = static_cast<Char8>(((cp >> 6) & 0x3f ) | 0x80);
- *result++ = static_cast<Char8>((cp & 0x3f ) | 0x80);
- }
- else //4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- {
- *result++ = static_cast<Char8>((cp >> 18 ) | 0xf0);
- *result++ = static_cast<Char8>(((cp >> 12) & 0x3f ) | 0x80);
- *result++ = static_cast<Char8>(((cp >> 6) & 0x3f ) | 0x80);
- *result++ = static_cast<Char8>((cp & 0x3f ) | 0x80);
- }
- return result;
-}
-
-
-inline //maps the first byte of a UTF-8 sequence to the total number of bytes in that sequence
-size_t getUtf8Len(unsigned char ch)
-{
- if (ch < 0x80) //0xxxxxxx
- return 1;
- if (ch >> 5 == 0x6) //110xxxxx
- return 2;
- if (ch >> 4 == 0xe) //1110xxxx
- return 3;
- if (ch >> 3 == 0x1e) //11110xxx
- return 4;
-
- assert(false); //no valid begin of UTF8 encoding
- return 1; //reached only when assert is compiled out: utf8ToCodePoint then passes the byte through as a one-byte character
-}
-
-
-template <class CharIterator, class Function> inline //decodes the UTF-8 range [first, last) and calls f once per decoded code point; returns f
-Function utf8ToCodePoint(CharIterator first, CharIterator last, Function f) //f is a unary function taking a CodePoint as single parameter
-{
- assert_static(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 1); //compile-time check: iterator must yield 1-byte units
- typedef unsigned char Char8;
-
- for ( ; first != last; ++first)
- {
- auto getChar = [&](Char8 & ch) -> bool //advances first by one and stores that byte in ch; false if the input ends inside a sequence
- {
- if (++first == last)
- {
- assert(false); //continuation byte expected
- return false; //first == last now: the caller must leave the loop without another ++first
- }
- ch = static_cast<Char8>(*first);
- assert(ch >> 6 == 0x2); //continuation bytes have the form 10xxxxxx
- return true;
- };
-
- CodePoint cp = static_cast<Char8>(*first); //lead byte; cast to unsigned first so a signed char is not sign-extended
- switch (getUtf8Len(static_cast<Char8>(cp)))
- {
- case 1:
- break;
- case 2:
- {
- cp = (cp & 0x1f) << 6; //payload of 110xxxxx
- Char8 ch;
- if (!getChar(ch)) return f; //truncated input: "continue" would run ++first and step past last
- cp += ch & 0x3f;
- }
- break;
- case 3:
- {
- cp = (cp & 0xf) << 12; //payload of 1110xxxx
- Char8 ch;
- if (!getChar(ch)) return f; //truncated input: drop the incomplete sequence and stop
- cp += (ch & 0x3f) << 6;
- if (!getChar(ch)) return f;
- cp += ch & 0x3f;
-
- }
- break;
- case 4:
- {
- cp = (cp & 0x7) << 18; //payload of 11110xxx
- Char8 ch;
- if (!getChar(ch)) return f; //truncated input: drop the incomplete sequence and stop
- cp += (ch & 0x3f) << 12;
- if (!getChar(ch)) return f;
- cp += (ch & 0x3f) << 6;
- if (!getChar(ch)) return f;
- cp += ch & 0x3f;
- }
- break;
- default:
- assert(false); //getUtf8Len only returns 1 to 4
- }
- f(cp);
- }
- return f;
-}
-
-
-template <class String> //output iterator that appends every assigned character to a string
-class AppendStringIterator: public std::iterator<std::output_iterator_tag, void, void, void, void> //NOTE(review): std::iterator is deprecated since C++17
-{
-public:
- explicit AppendStringIterator (String& x) : str(&x) {} //stores a pointer to x; x is not copied
- AppendStringIterator& operator= (typename String::value_type value) { *str += value; return *this; } //the actual append
- AppendStringIterator& operator* () { return *this; } //no-op: "*it = c" ends up in operator= above
- AppendStringIterator& operator++ () { return *this; } //no-op: there is no position to advance
- AppendStringIterator operator++ (int) { return *this; } //no-op, returns a copy pointing to the same string
-private:
- String* str; //target string, set in the constructor
-};
-
-
-template <class WideString, class CharString> inline //overload chosen by the public utf8ToWide() when sizeof(wchar_t) == 2
-WideString utf8ToWide(const CharString& str, Loki::Int2Type<2>) //windows: convert utf8 to utf16 wchar_t
-{
- WideString output;
- utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str), //strBegin/strLength are not defined in this file; presumably from string_tools.h
- [&](CodePoint cp) { codePointToUtf16(cp, AppendStringIterator<WideString>(output)); }); //each decoded code point is appended as one or two UTF-16 units
- return output;
-}
-
-
-template <class WideString, class CharString> inline //overload chosen by the public utf8ToWide() when sizeof(wchar_t) == 4
-WideString utf8ToWide(const CharString& str, Loki::Int2Type<4>) //other OS: convert utf8 to utf32 wchar_t
-{
- WideString output;
- utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str), //strBegin/strLength are not defined in this file; presumably from string_tools.h
- [&](CodePoint cp) { output += cp; }); //one wide character per code point, no further encoding step
- return output;
-}
-
-
-template <class CharString, class WideString> inline //overload chosen by the public wideToUtf8() when sizeof(wchar_t) == 2
-CharString wideToUtf8(const WideString& str, Loki::Int2Type<2>) //windows: convert utf16-wchar_t to utf8
-{
- CharString output;
- utf16ToCodePoint(strBegin(str), strBegin(str) + strLength(str), //strBegin/strLength are not defined in this file; presumably from string_tools.h
- [&](CodePoint cp) { codePointToUtf8(cp, AppendStringIterator<CharString>(output)); }); //surrogate pairs are combined first, then each code point is appended as UTF-8
- return output;
-}
-
-
-template <class CharString, class WideString> inline //overload chosen by the public wideToUtf8() when sizeof(wchar_t) == 4
-CharString wideToUtf8(const WideString& str, Loki::Int2Type<4>) //other OS: convert utf32-wchar_t to utf8
-{
- CharString output;
- std::for_each(strBegin(str), strBegin(str) + strLength(str), //NOTE(review): <algorithm> is not included by this header; presumably pulled in via string_tools.h - confirm
- [&](CodePoint cp) { codePointToUtf8(cp, AppendStringIterator<CharString>(output)); }); //each wide character is taken as a code point and appended as UTF-8
- return output;
-}
-}
-
-
-template <class WideString, class CharString> inline //public entry point: forwards to the implementation overload matching sizeof(wchar_t)
-WideString utf8ToWide(const CharString& str)
-{
- assert_static((Loki::IsSameType<typename StringTraits<CharString>::CharType, char >::value)); //source must be a char-based string
- assert_static((Loki::IsSameType<typename StringTraits<WideString>::CharType, wchar_t>::value)); //target must be a wchar_t-based string
-
- return implementation::utf8ToWide<WideString>(str, Loki::Int2Type<sizeof(wchar_t)>()); //tag dispatch: only Int2Type<2> and Int2Type<4> overloads exist
-}
-
-
-template <class CharString, class WideString> inline //public entry point: forwards to the implementation overload matching sizeof(wchar_t)
-CharString wideToUtf8(const WideString& str)
-{
- assert_static((Loki::IsSameType<typename StringTraits<CharString>::CharType, char >::value)); //target must be a char-based string
- assert_static((Loki::IsSameType<typename StringTraits<WideString>::CharType, wchar_t>::value)); //source must be a wchar_t-based string
-
- return implementation::wideToUtf8<CharString>(str, Loki::Int2Type<sizeof(wchar_t)>()); //tag dispatch: only Int2Type<2> and Int2Type<4> overloads exist
-}
-
-
-//-------------------------------------------------------------------------------------------
-template <class TargetString, class SourceString> inline //char source, wchar_t target: decode UTF-8
-TargetString utf8CvrtTo(const SourceString& str, char, wchar_t) { return utf8ToWide<TargetString>(str); }
-
-template <class TargetString, class SourceString> inline //wchar_t source, char target: encode as UTF-8
-TargetString utf8CvrtTo(const SourceString& str, wchar_t, char) { return wideToUtf8<TargetString>(str); }
-
-template <class TargetString, class SourceString> inline //same character type on both sides: no UTF conversion, only cvrtString (not defined in this file)
-TargetString utf8CvrtTo(const SourceString& str, char, char) { return cvrtString<TargetString>(str); }
-
-template <class TargetString, class SourceString> inline //same character type on both sides: no UTF conversion, only cvrtString (not defined in this file)
-TargetString utf8CvrtTo(const SourceString& str, wchar_t, wchar_t) { return cvrtString<TargetString>(str); }
-
-template <class TargetString, class SourceString> inline //public entry point: picks one of the four overloads above
-TargetString utf8CvrtTo(const SourceString& str)
-{
- return utf8CvrtTo<TargetString>(str, typename StringTraits<SourceString>::CharType(), //default-constructed character values serve only as overload tags
- typename StringTraits<TargetString>::CharType());
-}
-}
-
-#endif //STRING_UTF8_HEADER_01832479146991573473545
bgstack15