// **************************************************************************
// * This file is part of the zenXML project. It is distributed under the   *
// * Boost Software License, Version 1.0. See accompanying file             *
// * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt.       *
// * Copyright (C) 2011 ZenJu (zhnmju123 AT gmx.de)                         *
// **************************************************************************

#ifndef STRING_UTF8_HEADER_01832479146991573473545
#define STRING_UTF8_HEADER_01832479146991573473545

#include <iterator>
#include "string_tools.h" //copyStringTo

namespace zen
{
//convert any(!) "string-like" object into target string by applying a UTF8 conversion (but only if necessary!)
//the conversion direction is selected from the character types of SourceString and TargetString;
//if both use the same character type the string is merely copied
template <class TargetString, class SourceString> TargetString utf8CvrtTo(const SourceString& str);

//convert wide to utf8 string; example: std::string tmp = wideToUtf8<std::string>(L"abc");
template <class CharString, class WideString>
CharString wideToUtf8(const WideString& str);

//convert utf8 string to wide; example: std::wstring tmp = utf8ToWide<std::wstring>("abc");
template <class WideString, class CharString>
WideString utf8ToWide(const CharString& str);

//the three bytes EF BB BF: UTF-8 encoding of the byte order mark (U+FEFF)
const char BYTE_ORDER_MARK_UTF8[] = "\xEF\xBB\xBF";


































//----------------------- implementation ----------------------------------
namespace implementation
{
//integer type holding a single Unicode code point
typedef unsigned int CodePoint; //must be at least four bytes

//largest value accepted by the encoders below (checked via assert)
const CodePoint CODE_POINT_MAX     = 0x10ffff;

//inclusive range of UTF-16 code units treated as the first half of a surrogate pair
const CodePoint HIGH_SURROGATE     = 0xd800;
const CodePoint HIGH_SURROGATE_MAX = 0xdbff;

//inclusive range of UTF-16 code units treated as the second half of a surrogate pair
const CodePoint LOW_SURROGATE      = 0xdc00;
const CodePoint LOW_SURROGATE_MAX  = 0xdfff;


//encode one code point as UTF-16 and write the resulting code unit(s) through "result"; see http://en.wikipedia.org/wiki/UTF-16
//- code points below 0x10000 are written as a single unit
//- all others are written as a high/low surrogate pair
//returns the output iterator advanced past the written unit(s)
template <class OutputIterator> inline
OutputIterator codePointToUtf16(CodePoint cp, OutputIterator result)
{
    typedef unsigned short Char16; //this isn't necessarily 16 bit, but all we need is an unsigned type

    assert(cp < HIGH_SURROGATE || LOW_SURROGATE_MAX < cp); //code points [0xd800, 0xdfff] are not allowed for UTF-16
    assert(cp <= CODE_POINT_MAX);

    if (cp >= 0x10000)
    {
        //split the 20 bit offset into two halves of 10 bit each
        const CodePoint offset = cp - 0x10000;
        *result++ = static_cast<Char16>(HIGH_SURROGATE + (offset >> 10));
        *result++ = static_cast<Char16>(LOW_SURROGATE  + (offset & 0x3ff));
    }
    else
        *result++ = static_cast<Char16>(cp);

    return result;
}


template <class CharIterator, class Function> inline
Function utf16ToCodePoint(CharIterator first, CharIterator last, Function f) //f is a unary function taking a CodePoint as single parameter
{
    assert_static(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 2);
    typedef unsigned short Char16; //this isn't necessarily 16 bit, but all we need is an unsigned type

    for ( ; first != last; ++first)
    {
        CodePoint cp = static_cast<Char16>(*first);
        if (HIGH_SURROGATE <= cp && cp <= HIGH_SURROGATE_MAX)
        {
            if (++first == last)
            {
                assert(false); //low surrogate expected
                break;
            }
            assert(LOW_SURROGATE <= static_cast<Char16>(*first) && static_cast<Char16>(*first) <= LOW_SURROGATE_MAX); //low surrogate expected
            cp = ((cp - HIGH_SURROGATE) << 10) + static_cast<Char16>(*first) - LOW_SURROGATE + 0x10000;
        }
        else
            assert(cp < LOW_SURROGATE || LOW_SURROGATE_MAX < cp); //NO low surrogate expected

        f(cp);
    }
    return f;
}


//encode one code point as UTF-8 and write the resulting 1 - 4 bytes through "result"; see http://en.wikipedia.org/wiki/UTF-8
//returns the output iterator advanced past the written bytes
template <class OutputIterator> inline
OutputIterator codePointToUtf8(CodePoint cp, OutputIterator result)
{
    typedef unsigned char Char8;

    assert(cp <= CODE_POINT_MAX);

    //all bytes following the lead byte have the form 10xxxxxx, i.e. carry the lowest six bits of "bits"
    auto trailByte = [](CodePoint bits) { return static_cast<Char8>((bits & 0x3f) | 0x80); };

    if (cp >= 0x10000) //four bytes: 11110xxx + 3 trail bytes
    {
        *result++ = static_cast<Char8>((cp >> 18) | 0xf0);
        *result++ = trailByte(cp >> 12);
        *result++ = trailByte(cp >> 6);
        *result++ = trailByte(cp);
    }
    else if (cp >= 0x800) //three bytes: 1110xxxx + 2 trail bytes
    {
        *result++ = static_cast<Char8>((cp >> 12) | 0xe0);
        *result++ = trailByte(cp >> 6);
        *result++ = trailByte(cp);
    }
    else if (cp >= 0x80) //two bytes: 110xxxxx + 1 trail byte
    {
        *result++ = static_cast<Char8>((cp >> 6) | 0xc0);
        *result++ = trailByte(cp);
    }
    else //plain ASCII
        *result++ = static_cast<Char8>(cp);

    return result;
}


//determine the total length in bytes (1 - 4) of a UTF-8 sequence from its first byte
//a byte that cannot start a sequence triggers an assert and is reported as length 1
inline
size_t getUtf8Len(unsigned char ch)
{
    if ((ch & 0x80) == 0x00) //0xxxxxxx
        return 1;
    if ((ch & 0xe0) == 0xc0) //110xxxxx
        return 2;
    if ((ch & 0xf0) == 0xe0) //1110xxxx
        return 3;
    if ((ch & 0xf8) == 0xf0) //11110xxx
        return 4;

    assert(false); //no valid begin of UTF8 encoding
    return 1;
}


//decode the UTF-8 byte sequence [first, last) and call f(cp) for each decoded code point, in order
//f is a unary function taking a CodePoint as single parameter; the (copied) functor is returned
//malformed input is only reported via assert
//if the input ends in the middle of a multi-byte sequence, decoding stops and the incomplete sequence is dropped
template <class CharIterator, class Function> inline
Function utf8ToCodePoint(CharIterator first, CharIterator last, Function f)
{
    assert_static(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 1);
    typedef unsigned char Char8;

    //advance to the next byte of the current sequence and store it in "ch"
    //returns false if the end of input has been reached; in this case first == last holds afterwards
    auto getChar = [&](Char8& ch) -> bool
    {
        if (++first == last)
        {
            assert(false); //continuation byte expected
            return false;
        }
        ch = static_cast<Char8>(*first);
        assert(ch >> 6 == 0x2); //continuation bytes have the form 10xxxxxx
        return true;
    };

    for ( ; first != last; ++first)
    {
        CodePoint cp = static_cast<Char8>(*first);
        switch (getUtf8Len(static_cast<Char8>(cp)))
        {
            case 1:
                break;
            case 2:
            {
                cp = (cp & 0x1f) << 6;
                Char8 ch;
                //truncated input: must NOT "continue" here, the loop's ++first would step beyond "last"
                if (!getChar(ch)) return f;
                cp += ch & 0x3f;
            }
            break;
            case 3:
            {
                cp = (cp & 0xf) << 12;
                Char8 ch;
                if (!getChar(ch)) return f;
                cp += (ch & 0x3f) << 6;
                if (!getChar(ch)) return f;
                cp += ch & 0x3f;
            }
            break;
            case 4:
            {
                cp = (cp & 0x7) << 18;
                Char8 ch;
                if (!getChar(ch)) return f;
                cp += (ch & 0x3f) << 12;
                if (!getChar(ch)) return f;
                cp += (ch & 0x3f) << 6;
                if (!getChar(ch)) return f;
                cp += ch & 0x3f;
            }
            break;
            default:
                assert(false);
        }
        f(cp);
    }
    return f;
}


//output iterator appending every assigned character to a string via operator+=
//the referenced string must outlive the iterator; copies of the iterator append to the same string
template <class String>
class AppendStringIterator
{
public:
    //iterator traits are spelled out as member types: the std::iterator base class is deprecated since C++17
    typedef std::output_iterator_tag iterator_category;
    typedef void value_type;
    typedef void difference_type;
    typedef void pointer;
    typedef void reference;

    explicit AppendStringIterator (String& x) : str(&x) {}

    //"*it = value" appends value to the target string
    AppendStringIterator& operator= (typename String::value_type value) { *str += value; return *this; }

    //dereferencing and incrementing are no-ops: there is no position to track
    AppendStringIterator& operator*  ()    { return *this; }
    AppendStringIterator& operator++ ()    { return *this; }
    AppendStringIterator  operator++ (int) { return *this; }
private:
    String* str; //pointer rather than reference, so the iterator stays copy-assignable
};


template <class WideString, class CharString> inline
WideString utf8ToWide(const CharString& str, Int2Type<2>) //windows: convert utf8 to utf16 wchar_t
{
    WideString output;
    utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
    [&](CodePoint cp) { codePointToUtf16(cp, AppendStringIterator<WideString>(output)); });
    return output;
}


//other OS: convert utf8 to utf32 wchar_t
//each decoded code point is appended to the result as a single wchar_t
template <class WideString, class CharString> inline
WideString utf8ToWide(const CharString& str, Int2Type<4>)
{
    WideString wideOut;
    auto appendCodePoint = [&wideOut](CodePoint cp) { wideOut += static_cast<wchar_t>(cp); };

    const auto itBegin = strBegin(str);
    utf8ToCodePoint(itBegin, itBegin + strLength(str), appendCodePoint);
    return wideOut;
}


template <class CharString, class WideString> inline
CharString wideToUtf8(const WideString& str, Int2Type<2>) //windows: convert utf16-wchar_t to utf8
{
    CharString output;
    utf16ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
    [&](CodePoint cp) { codePointToUtf8(cp, AppendStringIterator<CharString>(output)); });
    return output;
}


template <class CharString, class WideString> inline
CharString wideToUtf8(const WideString& str, Int2Type<4>) //other OS: convert utf32-wchar_t to utf8
{
    CharString output;
    std::for_each(strBegin(str), strBegin(str) + strLength(str),
    [&](CodePoint cp) { codePointToUtf8(cp, AppendStringIterator<CharString>(output)); });
    return output;
}
}


template <class WideString, class CharString> inline
WideString utf8ToWide(const CharString& str)
{
    assert_static((IsSameType<typename GetCharType<CharString>::Result, char   >::result));
    assert_static((IsSameType<typename GetCharType<WideString>::Result, wchar_t>::result));

    return implementation::utf8ToWide<WideString>(str, Int2Type<sizeof(wchar_t)>());
}


template <class CharString, class WideString> inline
CharString wideToUtf8(const WideString& str)
{
    assert_static((IsSameType<typename GetCharType<CharString>::Result, char   >::result));
    assert_static((IsSameType<typename GetCharType<WideString>::Result, wchar_t>::result));

    return implementation::wideToUtf8<CharString>(str, Int2Type<sizeof(wchar_t)>());
}


//-------------------------------------------------------------------------------------------
//tag dispatch helpers: the two unnamed parameters carry the (source, target) character types

//char -> wchar_t: decode utf8
template <class TargetString, class SourceString> inline
TargetString utf8CvrtTo(const SourceString& str, char, wchar_t)
{
    return utf8ToWide<TargetString>(str);
}

//wchar_t -> char: encode as utf8
template <class TargetString, class SourceString> inline
TargetString utf8CvrtTo(const SourceString& str, wchar_t, char)
{
    return wideToUtf8<TargetString>(str);
}

//char -> char: no conversion, copy only
template <class TargetString, class SourceString> inline
TargetString utf8CvrtTo(const SourceString& str, char, char)
{
    return copyStringTo<TargetString>(str);
}

//wchar_t -> wchar_t: no conversion, copy only
template <class TargetString, class SourceString> inline
TargetString utf8CvrtTo(const SourceString& str, wchar_t, wchar_t)
{
    return copyStringTo<TargetString>(str);
}

template <class TargetString, class SourceString> inline
TargetString utf8CvrtTo(const SourceString& str)
{
    return utf8CvrtTo<TargetString>(str,
                                    typename GetCharType<SourceString>::Result(),
                                    typename GetCharType<TargetString>::Result());
}
}

#endif //STRING_UTF8_HEADER_01832479146991573473545